From 47b7aee14fd7e453370a5d15dfb11c958ca360f2 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 26 Sep 2018 16:12:06 +0100 Subject: sched/fair: Clean up load_balance() condition The alignment of the condition is off, clean that up. Also, logical operators have lower precedence than bitwise/relational operators, so remove one layer of parentheses to make the condition a bit simpler to follow. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: patrick.bellasi@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1537974727-30788-1-git-send-email-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ee271bb661cc..4e298931a715 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8877,9 +8877,9 @@ out_all_pinned: out_one_pinned: /* tune up the balancing interval */ - if (((env.flags & LBF_ALL_PINNED) && - sd->balance_interval < MAX_PINNED_INTERVAL) || - (sd->balance_interval < sd->max_interval)) + if ((env.flags & LBF_ALL_PINNED && + sd->balance_interval < MAX_PINNED_INTERVAL) || + sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; ld_moved = 0; -- cgit From 3f130a37c442d5c4d66531b240ebe9abfef426b5 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 26 Sep 2018 16:12:07 +0100 Subject: sched/fair: Don't increase sd->balance_interval on newidle balance When load_balance() fails to move some load because of task affinity, we end up increasing sd->balance_interval to delay the next periodic balance in the hopes that next time we look, that annoying pinned task(s) will be gone. However, idle_balance() pays no attention to sd->balance_interval, yet it will still lead to an increase in balance_interval in case of pinned tasks. If we're going through several newidle balances (e.g. we have a periodic task), this can lead to a huge increase of the balance_interval in a very small amount of time. To prevent that, don't increase the balance interval when going through a newidle balance. This is a similar approach to what is done in commit 58b26c4c0257 ("sched: Increment cache_nice_tries only on periodic lb"), where we disregard newidle balance and rely on periodic balance for more stable results. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: patrick.bellasi@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1537974727-30788-2-git-send-email-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e298931a715..a17ca4254427 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8876,13 +8876,22 @@ out_all_pinned: sd->nr_balance_failed = 0; out_one_pinned: + ld_moved = 0; + + /* + * idle_balance() disregards balance intervals, so we could repeatedly + * reach this code, which would lead to balance_interval skyrocketting + * in a short amount of time. Skip the balance_interval increase logic + * to avoid that. + */ + if (env.idle == CPU_NEWLY_IDLE) + goto out; + /* tune up the balancing interval */ if ((env.flags & LBF_ALL_PINNED && sd->balance_interval < MAX_PINNED_INTERVAL) || sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; - - ld_moved = 0; out: return ld_moved; } -- cgit From ff1cdc94de4d336be45336d70709dfcf3d682514 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 26 Oct 2018 21:17:43 +0800 Subject: sched/core: Introduce set_next_task() helper for better code readability When we pick the next task, we will do the following for the task: 1) p->se.exec_start = rq_clock_task(rq); 2) dequeue_pushable(_dl)_task(rq, p); When we call set_curr_task(), we also need to do the same thing above. In rt.c, the code at 1) is in the _pick_next_task_rt() and the code at 2) is in the pick_next_task_rt(). If we put two operations in one function, maybe better. So, we introduce a new function set_next_task(), which is responsible for doing the above. By introducing the function we can get rid of calling the dequeue_pushable(_dl)_task() directly(We can call set_next_task()) in pick_next_task() and have better code readability and reuse. In set_curr_task_rt(), we also can call set_next_task(). Do this things such that we end up with: static struct task_struct *pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { /* do something else ... */ put_prev_task(rq, prev); /* pick next task p */ set_next_task(rq, p); /* do something else ... */ } put_prev_task() can match set_next_task(), which can make the code more readable. Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181026131743.21786-1-smuchun@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 19 ++++++++++--------- kernel/sched/rt.c | 24 +++++++++++------------- 2 files changed, 21 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 91e4202b0634..470ba6b464fe 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1695,6 +1695,14 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) } #endif +static inline void set_next_task(struct rq *rq, struct task_struct *p) +{ + p->se.exec_start = rq_clock_task(rq); + + /* You can't push away the running task */ + dequeue_pushable_dl_task(rq, p); +} + static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, struct dl_rq *dl_rq) { @@ -1750,10 +1758,8 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) BUG_ON(!dl_se); p = dl_task_of(dl_se); - p->se.exec_start = rq_clock_task(rq); - /* Running task will never be pushed. */ - dequeue_pushable_dl_task(rq, p); + set_next_task(rq, p); if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); @@ -1808,12 +1814,7 @@ static void task_fork_dl(struct task_struct *p) static void set_curr_task_dl(struct rq *rq) { - struct task_struct *p = rq->curr; - - p->se.exec_start = rq_clock_task(rq); - - /* You can't push away the running task */ - dequeue_pushable_dl_task(rq, p); + set_next_task(rq, rq->curr); } #ifdef CONFIG_SMP diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a21ea6021929..9aa3287ce301 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1498,6 +1498,14 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } +static inline void set_next_task(struct rq *rq, struct task_struct *p) +{ + p->se.exec_start = rq_clock_task(rq); + + /* The running task is never eligible for pushing */ + dequeue_pushable_task(rq, p); +} + static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq) { @@ -1518,7 +1526,6 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; - struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; do { @@ -1527,10 +1534,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) rt_rq = group_rt_rq(rt_se); } while (rt_rq); - p = rt_task_of(rt_se); - p->se.exec_start = rq_clock_task(rq); - - return p; + return rt_task_of(rt_se); } static struct task_struct * @@ -1573,8 +1577,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = _pick_next_task_rt(rq); - /* The running task is never eligible for pushing */ - dequeue_pushable_task(rq, p); + set_next_task(rq, p); rt_queue_push_tasks(rq); @@ -2355,12 +2358,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) static void set_curr_task_rt(struct rq *rq) { - struct task_struct *p = rq->curr; - - p->se.exec_start = rq_clock_task(rq); - - /* The running task is never eligible for pushing */ - dequeue_pushable_task(rq, p); + set_next_task(rq, rq->curr); } static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) -- cgit From b82592199032bf7c778f861b936287e37ebc9f62 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 2 Nov 2018 18:02:48 +0000 Subject: genirq/affinity: Spread IRQs to all available NUMA nodes If the number of NUMA nodes exceeds the number of MSI/MSI-X interrupts which are allocated for a device, the interrupt affinity spreading code fails to spread them across all nodes. The reason is, that the spreading code starts from node 0 and continues up to the number of interrupts requested for allocation. This leaves the nodes past the last interrupt unused. This results in interrupt concentration on the first nodes which violates the assumption of the block layer that all nodes are covered evenly. As a consequence the NUMA nodes above the number of interrupts are all assigned to hardware queue 0 and therefore NUMA node 0, which results in bad performance and has CPU hotplug implications, because queue 0 gets shut down when the last CPU of node 0 is offlined. Go over all NUMA nodes and assign them round-robin to all requested interrupts to solve this. [ tglx: Massaged changelog ] Signed-off-by: Long Li Signed-off-by: Thomas Gleixner Reviewed-by: Ming Lei Cc: Michael Kelley Link: https://lkml.kernel.org/r/20181102180248.13583-1-longli@linuxonhyperv.com --- kernel/irq/affinity.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f4f29b9d90ee..e12cdf637c71 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -117,12 +117,11 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, */ if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_copy(masks + curvec, node_to_cpumask[n]); - if (++done == numvecs) - break; + cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]); if (++curvec == last_affv) curvec = affd->pre_vectors; } + done = numvecs; goto out; } -- cgit From 5c903e108d0b005cf59904ca3520934fca4b9439 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 2 Nov 2018 22:59:49 +0800 Subject: genirq/affinity: Move two stage affinity spreading into a helper function No functional change. Prepares for supporting allocating and affinitizing interrupt sets. [ tglx: Minor changelog tweaks ] Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Hannes Reinecke Cc: Keith Busch Cc: Sagi Grimberg Link: https://lkml.kernel.org/r/20181102145951.31979-3-ming.lei@redhat.com --- kernel/irq/affinity.c | 92 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e12cdf637c71..2f9812b6035e 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,7 +94,7 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -static int irq_build_affinity_masks(const struct irq_affinity *affd, +static int __irq_build_affinity_masks(const struct irq_affinity *affd, int startvec, int numvecs, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, @@ -165,6 +165,58 @@ out: return done; } +/* + * build affinity in two stages: + * 1) spread present CPU on these vectors + * 2) spread other possible CPUs on these vectors + */ +static int irq_build_affinity_masks(const struct irq_affinity *affd, + int startvec, int numvecs, + cpumask_var_t *node_to_cpumask, + struct cpumask *masks) +{ + int curvec = startvec, usedvecs = -1; + cpumask_var_t nmsk, npresmsk; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return usedvecs; + + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto fail; + + /* Stabilize the cpumasks */ + get_online_cpus(); + build_node_to_cpumask(node_to_cpumask); + + /* Spread on present CPUs starting from affd->pre_vectors */ + usedvecs = __irq_build_affinity_masks(affd, curvec, numvecs, + node_to_cpumask, cpu_present_mask, + nmsk, masks); + + /* + * Spread on non present CPUs starting from the next vector to be + * handled. If the spreading of present CPUs already exhausted the + * vector space, assign the non present CPUs to the already spread + * out vectors. + */ + if (usedvecs >= numvecs) + curvec = affd->pre_vectors; + else + curvec = affd->pre_vectors + usedvecs; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); + usedvecs += __irq_build_affinity_masks(affd, curvec, numvecs, + node_to_cpumask, npresmsk, + nmsk, masks); + put_online_cpus(); + + free_cpumask_var(npresmsk); + + fail: + free_cpumask_var(nmsk); + + return usedvecs; +} + /** * irq_create_affinity_masks - Create affinity masks for multiqueue spreading * @nvecs: The total number of vectors @@ -177,7 +229,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; int curvec, usedvecs; - cpumask_var_t nmsk, npresmsk, *node_to_cpumask; + cpumask_var_t *node_to_cpumask; struct cpumask *masks = NULL; /* @@ -187,15 +239,9 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (nvecs == affd->pre_vectors + affd->post_vectors) return NULL; - if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return NULL; - - if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) - goto outcpumsk; - node_to_cpumask = alloc_node_to_cpumask(); if (!node_to_cpumask) - goto outnpresmsk; + return NULL; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); if (!masks) @@ -205,30 +251,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) for (curvec = 0; curvec < affd->pre_vectors; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); - /* Stabilize the cpumasks */ - get_online_cpus(); - build_node_to_cpumask(node_to_cpumask); - - /* Spread on present CPUs starting from affd->pre_vectors */ usedvecs = irq_build_affinity_masks(affd, curvec, affvecs, - node_to_cpumask, cpu_present_mask, - nmsk, masks); - - /* - * Spread on non present CPUs starting from the next vector to be - * handled. If the spreading of present CPUs already exhausted the - * vector space, assign the non present CPUs to the already spread - * out vectors. - */ - if (usedvecs >= affvecs) - curvec = affd->pre_vectors; - else - curvec = affd->pre_vectors + usedvecs; - cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - usedvecs += irq_build_affinity_masks(affd, curvec, affvecs, - node_to_cpumask, npresmsk, - nmsk, masks); - put_online_cpus(); + node_to_cpumask, masks); /* Fill out vectors at the end that don't need affinity */ if (usedvecs >= affvecs) @@ -240,10 +264,6 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) outnodemsk: free_node_to_cpumask(node_to_cpumask); -outnpresmsk: - free_cpumask_var(npresmsk); -outcpumsk: - free_cpumask_var(nmsk); return masks; } -- cgit From 060746d9e394084b7401e7532f2de528ecbfb521 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 2 Nov 2018 22:59:50 +0800 Subject: genirq/affinity: Pass first vector to __irq_build_affinity_masks() No functional change. Prepares for support of allocating and affinitizing sets of interrupts, in which each set of interrupts needs a full two stage spreading. The first vector argument is necessary for this so the affinitizing starts from the first vector of each set. [ tglx: Minor changelog tweaks ] Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Hannes Reinecke Cc: Keith Busch Cc: Sagi Grimberg Link: https://lkml.kernel.org/r/20181102145951.31979-4-ming.lei@redhat.com --- kernel/irq/affinity.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 2f9812b6035e..e028b773e38a 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -95,14 +95,14 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, } static int __irq_build_affinity_masks(const struct irq_affinity *affd, - int startvec, int numvecs, + int startvec, int numvecs, int firstvec, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, struct cpumask *masks) { int n, nodes, cpus_per_vec, extra_vecs, done = 0; - int last_affv = affd->pre_vectors + numvecs; + int last_affv = firstvec + numvecs; int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; @@ -119,7 +119,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, for_each_node_mask(n, nodemsk) { cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]); if (++curvec == last_affv) - curvec = affd->pre_vectors; + curvec = firstvec; } done = numvecs; goto out; @@ -129,7 +129,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, int ncpus, v, vecs_to_assign, vecs_per_node; /* Spread the vectors per node */ - vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes; + vecs_per_node = (numvecs - (curvec - firstvec)) / nodes; /* Get the cpus on this node which are in the mask */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); @@ -157,7 +157,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, if (done >= numvecs) break; if (curvec >= last_affv) - curvec = affd->pre_vectors; + curvec = firstvec; --nodes; } @@ -190,8 +190,9 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, /* Spread on present CPUs starting from affd->pre_vectors */ usedvecs = __irq_build_affinity_masks(affd, curvec, numvecs, - node_to_cpumask, cpu_present_mask, - nmsk, masks); + affd->pre_vectors, + node_to_cpumask, + cpu_present_mask, nmsk, masks); /* * Spread on non present CPUs starting from the next vector to be @@ -205,8 +206,9 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, curvec = affd->pre_vectors + usedvecs; cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); usedvecs += __irq_build_affinity_masks(affd, curvec, numvecs, - node_to_cpumask, npresmsk, - nmsk, masks); + affd->pre_vectors, + node_to_cpumask, npresmsk, + nmsk, masks); put_online_cpus(); free_cpumask_var(npresmsk); -- cgit From 6da4b3ab9a6e9b1b5f90322ab3fa3a7dd18edb19 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 Nov 2018 22:59:51 +0800 Subject: genirq/affinity: Add support for allocating interrupt sets A driver may have a need to allocate multiple sets of MSI/MSI-X interrupts, and have them appropriately affinitized. Add support for defining a number of sets in the irq_affinity structure, of varying sizes, and get each set affinitized correctly across the machine. [ tglx: Minor changelog tweaks ] Signed-off-by: Jens Axboe Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Cc: linux-block@vger.kernel.org Link: https://lkml.kernel.org/r/20181102145951.31979-5-ming.lei@redhat.com --- drivers/pci/msi.c | 14 +++++++++ include/linux/interrupt.h | 4 +++ kernel/irq/affinity.c | 77 +++++++++++++++++++++++++++++++++-------------- 3 files changed, 72 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index af24ed50a245..265ed3e4c920 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1036,6 +1036,13 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, if (maxvec < minvec) return -ERANGE; + /* + * If the caller is passing in sets, we can't support a range of + * vectors. The caller needs to handle that. + */ + if (affd && affd->nr_sets && minvec != maxvec) + return -EINVAL; + if (WARN_ON_ONCE(dev->msi_enabled)) return -EINVAL; @@ -1087,6 +1094,13 @@ static int __pci_enable_msix_range(struct pci_dev *dev, if (maxvec < minvec) return -ERANGE; + /* + * If the caller is passing in sets, we can't support a range of + * supported vectors. The caller needs to handle that. + */ + if (affd && affd->nr_sets && minvec != maxvec) + return -EINVAL; + if (WARN_ON_ONCE(dev->msix_enabled)) return -EINVAL; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 1d6711c28271..ca397ff40836 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -247,10 +247,14 @@ struct irq_affinity_notify { * the MSI(-X) vector space * @post_vectors: Don't apply affinity to @post_vectors at end of * the MSI(-X) vector space + * @nr_sets: Length of passed in *sets array + * @sets: Number of affinitized sets */ struct irq_affinity { int pre_vectors; int post_vectors; + int nr_sets; + int *sets; }; #if defined(CONFIG_SMP) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e028b773e38a..08c904eb7279 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -171,28 +171,29 @@ out: * 2) spread other possible CPUs on these vectors */ static int irq_build_affinity_masks(const struct irq_affinity *affd, - int startvec, int numvecs, + int startvec, int numvecs, int firstvec, cpumask_var_t *node_to_cpumask, struct cpumask *masks) { - int curvec = startvec, usedvecs = -1; + int curvec = startvec, nr_present, nr_others; + int ret = -ENOMEM; cpumask_var_t nmsk, npresmsk; if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return usedvecs; + return ret; if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) goto fail; + ret = 0; /* Stabilize the cpumasks */ get_online_cpus(); build_node_to_cpumask(node_to_cpumask); /* Spread on present CPUs starting from affd->pre_vectors */ - usedvecs = __irq_build_affinity_masks(affd, curvec, numvecs, - affd->pre_vectors, - node_to_cpumask, - cpu_present_mask, nmsk, masks); + nr_present = __irq_build_affinity_masks(affd, curvec, numvecs, + firstvec, node_to_cpumask, + cpu_present_mask, nmsk, masks); /* * Spread on non present CPUs starting from the next vector to be @@ -200,23 +201,24 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, * vector space, assign the non present CPUs to the already spread * out vectors. */ - if (usedvecs >= numvecs) - curvec = affd->pre_vectors; + if (nr_present >= numvecs) + curvec = firstvec; else - curvec = affd->pre_vectors + usedvecs; + curvec = firstvec + nr_present; cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - usedvecs += __irq_build_affinity_masks(affd, curvec, numvecs, - affd->pre_vectors, - node_to_cpumask, npresmsk, - nmsk, masks); + nr_others = __irq_build_affinity_masks(affd, curvec, numvecs, + firstvec, node_to_cpumask, + npresmsk, nmsk, masks); put_online_cpus(); + if (nr_present < numvecs) + WARN_ON(nr_present + nr_others < numvecs); + free_cpumask_var(npresmsk); fail: free_cpumask_var(nmsk); - - return usedvecs; + return ret; } /** @@ -233,6 +235,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int curvec, usedvecs; cpumask_var_t *node_to_cpumask; struct cpumask *masks = NULL; + int i, nr_sets; /* * If there aren't any vectors left after applying the pre/post @@ -253,8 +256,28 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) for (curvec = 0; curvec < affd->pre_vectors; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); - usedvecs = irq_build_affinity_masks(affd, curvec, affvecs, - node_to_cpumask, masks); + /* + * Spread on present CPUs starting from affd->pre_vectors. If we + * have multiple sets, build each sets affinity mask separately. + */ + nr_sets = affd->nr_sets; + if (!nr_sets) + nr_sets = 1; + + for (i = 0, usedvecs = 0; i < nr_sets; i++) { + int this_vecs = affd->sets ? affd->sets[i] : affvecs; + int ret; + + ret = irq_build_affinity_masks(affd, curvec, this_vecs, + curvec, node_to_cpumask, masks); + if (ret) { + kfree(masks); + masks = NULL; + goto outnodemsk; + } + curvec += this_vecs; + usedvecs += this_vecs; + } /* Fill out vectors at the end that don't need affinity */ if (usedvecs >= affvecs) @@ -279,13 +302,21 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity { int resv = affd->pre_vectors + affd->post_vectors; int vecs = maxvec - resv; - int ret; + int set_vecs; if (resv > minvec) return 0; - get_online_cpus(); - ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv; - put_online_cpus(); - return ret; + if (affd->nr_sets) { + int i; + + for (i = 0, set_vecs = 0; i < affd->nr_sets; i++) + set_vecs += affd->sets[i]; + } else { + get_online_cpus(); + set_vecs = cpumask_weight(cpu_possible_mask); + put_online_cpus(); + } + + return resv + min(set_vecs, vecs); } -- cgit From 7d9df98be66fec64349f9f1c9d3e896293fe7b45 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 3 Nov 2018 22:31:04 -0400 Subject: clockevents: Remove unnecessary unlikely() WARN_ON() and WARN_ON_ONCE() already contains an unlikely(), so it's not necessary to use unlikely. Signed-off-by: Yangtao Li Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20181104023104.2572-1-tiny.windzz@gmail.com --- kernel/time/clockevents.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 8c0e4092f661..af58898d9ebf 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -39,10 +39,8 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, u64 clc = (u64) latch << evt->shift; u64 rnd; - if (unlikely(!evt->mult)) { + if (WARN_ON(!evt->mult)) evt->mult = 1; - WARN_ON(1); - } rnd = (u64) evt->mult - 1; /* @@ -164,10 +162,8 @@ void clockevents_switch_state(struct clock_event_device *dev, * on it, so fix it up and emit a warning: */ if (clockevent_state_oneshot(dev)) { - if (unlikely(!dev->mult)) { + if (WARN_ON(!dev->mult)) dev->mult = 1; - WARN_ON(1); - } } } } @@ -315,10 +311,8 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, int64_t delta; int rc; - if (unlikely(expires < 0)) { - WARN_ON_ONCE(1); + if (WARN_ON_ONCE(expires < 0)) return -ETIME; - } dev->next_event = expires; -- cgit From 4d9ebbe2b061a9c25e12ba8539ba172533132eb6 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 3 Nov 2018 22:27:41 -0400 Subject: cgroup: remove unnecessary unlikely() WARN_ON() already contains an unlikely(), so it's not necessary to use unlikely. Signed-off-by: Yangtao Li Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6aaf5dd5383b..2e5d90dfcb49 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5978,10 +5978,8 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf, ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); - if (unlikely(ret >= size)) { - WARN_ON(1); + if (WARN_ON(ret >= size)) break; - } } return ret; -- cgit From ea956d8be91edc702a98b7fe1f9463e7ca8c42ab Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 10 Oct 2018 16:22:57 -0400 Subject: audit: print empty EXECVE args Empty executable arguments were being skipped when printing out the list of arguments in an EXECVE record, making it appear they were somehow lost. Include empty arguments as an itemized empty string. Reproducer: autrace /bin/ls "" "/etc" ausearch --start recent -m execve -i | grep EXECVE type=EXECVE msg=audit(10/03/2018 13:04:03.208:1391) : argc=3 a0=/bin/ls a2=/etc With fix: type=EXECVE msg=audit(10/03/2018 21:51:38.290:194) : argc=3 a0=/bin/ls a1= a2=/etc type=EXECVE msg=audit(1538617898.290:194): argc=3 a0="/bin/ls" a1="" a2="/etc" Passes audit-testsuite. GH issue tracker at https://github.com/linux-audit/audit-kernel/issues/99 Signed-off-by: Richard Guy Briggs [PM: cleaned up the commit metadata] Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b2d1f043f17f..1513873e23bd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1107,7 +1107,7 @@ static void audit_log_execve_info(struct audit_context *context, } /* write as much as we can to the audit log */ - if (len_buf > 0) { + if (len_buf >= 0) { /* NOTE: some magic numbers here - basically if we * can't fit a reasonable amount of data into the * existing audit buffer, flush it and start with -- cgit From e8da8794a7fd9eef1ec9a07f0d4897c68581c72b Mon Sep 17 00:00:00 2001 From: Long Li Date: Tue, 6 Nov 2018 04:00:00 +0000 Subject: genirq/matrix: Improve target CPU selection for managed interrupts. On large systems with multiple devices of the same class (e.g. NVMe disks, using managed interrupts), the kernel can affinitize these interrupts to a small subset of CPUs instead of spreading them out evenly. irq_matrix_alloc_managed() tries to select the CPU in the supplied cpumask of possible target CPUs which has the lowest number of interrupt vectors allocated. This is done by searching the CPU with the highest number of available vectors. While this is correct for non-managed CPUs it can select the wrong CPU for managed interrupts. Under certain constellations this results in affinitizing the managed interrupts of several devices to a single CPU in a set. The book keeping of available vectors works the following way: 1) Non-managed interrupts: available is decremented when the interrupt is actually requested by the device driver and a vector is assigned. It's incremented when the interrupt and the vector are freed. 2) Managed interrupts: Managed interrupts guarantee vector reservation when the MSI/MSI-X functionality of a device is enabled, which is achieved by reserving vectors in the bitmaps of the possible target CPUs. This reservation decrements the available count on each possible target CPU. When the interrupt is requested by the device driver then a vector is allocated from the reserved region. The operation is reversed when the interrupt is freed by the device driver. Neither of these operations affect the available count. The reservation persist up to the point where the MSI/MSI-X functionality is disabled and only this operation increments the available count again. For non-managed interrupts the available count is the correct selection criterion because the guaranteed reservations need to be taken into account. Using the allocated counter could lead to a failing allocation in the following situation (total vector space of 10 assumed): CPU0 CPU1 available: 2 0 allocated: 5 3 <--- CPU1 is selected, but available space = 0 managed reserved: 3 7 while available yields the correct result. For managed interrupts the available count is not the appropriate selection criterion because as explained above the available count is not affected by the actual vector allocation. The following example illustrates that. Total vector space of 10 assumed. The starting point is: CPU0 CPU1 available: 5 4 allocated: 2 3 managed reserved: 3 3 Allocating vectors for three non-managed interrupts will result in affinitizing the first two to CPU0 and the third one to CPU1 because the available count is adjusted with each allocation: CPU0 CPU1 available: 5 4 <- Select CPU0 for 1st allocation --> allocated: 3 3 available: 4 4 <- Select CPU0 for 2nd allocation --> allocated: 4 3 available: 3 4 <- Select CPU1 for 3rd allocation --> allocated: 4 4 But the allocation of three managed interrupts starting from the same point will affinitize all of them to CPU0 because the available count is not affected by the allocation (see above). So the end result is: CPU0 CPU1 available: 5 4 allocated: 5 3 Introduce a "managed_allocated" field in struct cpumap to track the vector allocation for managed interrupts separately. Use this information to select the target CPU when a vector is allocated for a managed interrupt, which results in more evenly distributed vector assignments. The above example results in the following allocations: CPU0 CPU1 managed_allocated: 0 0 <- Select CPU0 for 1st allocation --> allocated: 3 3 managed_allocated: 1 0 <- Select CPU1 for 2nd allocation --> allocated: 3 4 managed_allocated: 1 1 <- Select CPU0 for 3rd allocation --> allocated: 4 4 The allocation of non-managed interrupts is not affected by this change and is still evaluating the available count. The overall distribution of interrupt vectors for both types of interrupts might still not be perfectly even depending on the number of non-managed and managed interrupts in a system, but due to the reservation guarantee for managed interrupts this cannot be avoided. Expose the new field in debugfs as well. [ tglx: Clarified the background of the problem in the changelog and described it independent of NVME ] Signed-off-by: Long Li Signed-off-by: Thomas Gleixner Cc: Michael Kelley Link: https://lkml.kernel.org/r/20181106040000.27316-1-longli@linuxonhyperv.com --- kernel/irq/matrix.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 1f0985adf193..30cc217b8631 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -14,6 +14,7 @@ struct cpumap { unsigned int available; unsigned int allocated; unsigned int managed; + unsigned int managed_allocated; bool initialized; bool online; unsigned long alloc_map[IRQ_MATRIX_SIZE]; @@ -145,6 +146,27 @@ static unsigned int matrix_find_best_cpu(struct irq_matrix *m, return best_cpu; } +/* Find the best CPU which has the lowest number of managed IRQs allocated */ +static unsigned int matrix_find_best_cpu_managed(struct irq_matrix *m, + const struct cpumask *msk) +{ + unsigned int cpu, best_cpu, allocated = UINT_MAX; + struct cpumap *cm; + + best_cpu = UINT_MAX; + + for_each_cpu(cpu, msk) { + cm = per_cpu_ptr(m->maps, cpu); + + if (!cm->online || cm->managed_allocated > allocated) + continue; + + best_cpu = cpu; + allocated = cm->managed_allocated; + } + return best_cpu; +} + /** * irq_matrix_assign_system - Assign system wide entry in the matrix * @m: Matrix pointer @@ -269,7 +291,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, if (cpumask_empty(msk)) return -EINVAL; - cpu = matrix_find_best_cpu(m, msk); + cpu = matrix_find_best_cpu_managed(m, msk); if (cpu == UINT_MAX) return -ENOSPC; @@ -282,6 +304,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, return -ENOSPC; set_bit(bit, cm->alloc_map); cm->allocated++; + cm->managed_allocated++; m->total_allocated++; *mapped_cpu = cpu; trace_irq_matrix_alloc_managed(bit, cpu, m, cm); @@ -395,6 +418,8 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, clear_bit(bit, cm->alloc_map); cm->allocated--; + if(managed) + cm->managed_allocated--; if (cm->online) m->total_allocated--; @@ -464,13 +489,14 @@ void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind) seq_printf(sf, "Total allocated: %6u\n", m->total_allocated); seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits, m->system_map); - seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " "); + seq_printf(sf, "%*s| CPU | avl | man | mac | act | vectors\n", ind, " "); cpus_read_lock(); for_each_online_cpu(cpu) { struct cpumap *cm = per_cpu_ptr(m->maps, cpu); - seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ", - cpu, cm->available, cm->managed, cm->allocated, + seq_printf(sf, "%*s %4d %4u %4u %4u %4u %*pbl\n", ind, " ", + cpu, cm->available, cm->managed, + cm->managed_allocated, cm->allocated, m->matrix_bits, cm->alloc_map); } cpus_read_unlock(); -- cgit From e84cd7ee630e44a2cc8ae49e85920a271b214cb3 Mon Sep 17 00:00:00 2001 From: Ke Wu Date: Tue, 6 Nov 2018 15:21:30 -0800 Subject: modsign: use all trusted keys to verify module signature Make mod_verify_sig to use all trusted keys. This allows keys in secondary_trusted_keys to be used to verify PKCS#7 signature on a kernel module. Signed-off-by: Ke Wu Signed-off-by: Jessica Yu --- kernel/module_signing.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module_signing.c b/kernel/module_signing.c index f2075ce8e4b3..6b9a926fd86b 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -83,6 +83,7 @@ int mod_verify_sig(const void *mod, struct load_info *info) } return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, - NULL, VERIFYING_MODULE_SIGNATURE, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_MODULE_SIGNATURE, NULL, NULL); } -- cgit From 4ec22e9c5a90e3809dd52014d5d239af8831a520 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 Nov 2018 10:08:35 -0500 Subject: cpuset: Enable cpuset controller in default hierarchy Given the fact that thread mode had been merged into 4.14, it is now time to enable cpuset to be used in the default hierarchy (cgroup v2) as it is clearly threaded. The cpuset controller had experienced feature creep since its introduction more than a decade ago. Besides the core cpus and mems control files to limit cpus and memory nodes, there are a bunch of additional features that can be controlled from the userspace. Some of the features are of doubtful usefulness and may not be actively used. This patch enables cpuset controller in the default hierarchy with a minimal set of features, namely just the cpus and mems and their effective_* counterparts. We can certainly add more features to the default hierarchy in the future if there is a real user need for them later on. Alternatively, with the unified hiearachy, it may make more sense to move some of those additional cpuset features, if desired, to memory controller or may be to the cpu controller instead of staying with cpuset. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 109 ++++++++++++++++++++++++++++++-- kernel/cgroup/cpuset.c | 48 +++++++++++++- 2 files changed, 149 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 476722b7b636..01b70f69304e 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -56,11 +56,13 @@ v1 is available under Documentation/cgroup-v1/. 5-3-3-2. IO Latency Interface Files 5-4. PID 5-4-1. PID Interface Files - 5-5. Device - 5-6. RDMA - 5-6-1. RDMA Interface Files - 5-7. Misc - 5-7-1. perf_event + 5-5. Cpuset + 5.5-1. Cpuset Interface Files + 5-6. Device + 5-7. RDMA + 5-7-1. RDMA Interface Files + 5-8. Misc + 5-8-1. perf_event 5-N. Non-normative information 5-N-1. CPU controller root cgroup process behaviour 5-N-2. IO controller root cgroup process behaviour @@ -1610,6 +1612,103 @@ through fork() or clone(). These will return -EAGAIN if the creation of a new process would cause a cgroup policy to be violated. +Cpuset +------ + +The "cpuset" controller provides a mechanism for constraining +the CPU and memory node placement of tasks to only the resources +specified in the cpuset interface files in a task's current cgroup. +This is especially valuable on large NUMA systems where placing jobs +on properly sized subsets of the systems with careful processor and +memory placement to reduce cross-node memory access and contention +can improve overall system performance. + +The "cpuset" controller is hierarchical. That means the controller +cannot use CPUs or memory nodes not allowed in its parent. + + +Cpuset Interface Files +~~~~~~~~~~~~~~~~~~~~~~ + + cpuset.cpus + A read-write multiple values file which exists on non-root + cpuset-enabled cgroups. + + It lists the requested CPUs to be used by tasks within this + cgroup. The actual list of CPUs to be granted, however, is + subjected to constraints imposed by its parent and can differ + from the requested CPUs. + + The CPU numbers are comma-separated numbers or ranges. + For example: + + # cat cpuset.cpus + 0-4,6,8-10 + + An empty value indicates that the cgroup is using the same + setting as the nearest cgroup ancestor with a non-empty + "cpuset.cpus" or all the available CPUs if none is found. + + The value of "cpuset.cpus" stays constant until the next update + and won't be affected by any CPU hotplug events. + + cpuset.cpus.effective + A read-only multiple values file which exists on non-root + cpuset-enabled cgroups. + + It lists the onlined CPUs that are actually granted to this + cgroup by its parent. These CPUs are allowed to be used by + tasks within the current cgroup. + + If "cpuset.cpus" is empty, the "cpuset.cpus.effective" file shows + all the CPUs from the parent cgroup that can be available to + be used by this cgroup. Otherwise, it should be a subset of + "cpuset.cpus" unless none of the CPUs listed in "cpuset.cpus" + can be granted. In this case, it will be treated just like an + empty "cpuset.cpus". + + Its value will be affected by CPU hotplug events. + + cpuset.mems + A read-write multiple values file which exists on non-root + cpuset-enabled cgroups. + + It lists the requested memory nodes to be used by tasks within + this cgroup. The actual list of memory nodes granted, however, + is subjected to constraints imposed by its parent and can differ + from the requested memory nodes. + + The memory node numbers are comma-separated numbers or ranges. + For example: + + # cat cpuset.mems + 0-1,3 + + An empty value indicates that the cgroup is using the same + setting as the nearest cgroup ancestor with a non-empty + "cpuset.mems" or all the available memory nodes if none + is found. + + The value of "cpuset.mems" stays constant until the next update + and won't be affected by any memory nodes hotplug events. + + cpuset.mems.effective + A read-only multiple values file which exists on non-root + cpuset-enabled cgroups. + + It lists the onlined memory nodes that are actually granted to + this cgroup by its parent. These memory nodes are allowed to + be used by tasks within the current cgroup. + + If "cpuset.mems" is empty, it shows all the memory nodes from the + parent cgroup that will be available to be used by this cgroup. + Otherwise, it should be a subset of "cpuset.mems" unless none of + the memory nodes listed in "cpuset.mems" can be granted. In this + case, it will be treated just like an empty "cpuset.mems". + + Its value will be affected by memory nodes hotplug events. + + Device controller ----------------- diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 266f10cb7222..2b5c4477d969 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1824,12 +1824,11 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) return 0; } - /* * for the common functions, 'private' gives the type of file */ -static struct cftype files[] = { +static struct cftype legacy_files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, @@ -1931,6 +1930,47 @@ static struct cftype files[] = { { } /* terminate */ }; +/* + * This is currently a minimal set for the default hierarchy. It can be + * expanded later on by migrating more features and control files from v1. + */ +static struct cftype dfl_files[] = { + { + .name = "cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_CPULIST, + .flags = CFTYPE_NOT_ON_ROOT, + }, + + { + .name = "mems", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * MAX_NUMNODES), + .private = FILE_MEMLIST, + .flags = CFTYPE_NOT_ON_ROOT, + }, + + { + .name = "cpus.effective", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_CPULIST, + .flags = CFTYPE_NOT_ON_ROOT, + }, + + { + .name = "mems.effective", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_MEMLIST, + .flags = CFTYPE_NOT_ON_ROOT, + }, + + { } /* terminate */ +}; + + /* * cpuset_css_alloc - allocate a cpuset css * cgrp: control group that the new cpuset will be part of @@ -2105,8 +2145,10 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .post_attach = cpuset_post_attach, .bind = cpuset_bind, .fork = cpuset_fork, - .legacy_cftypes = files, + .legacy_cftypes = legacy_files, + .dfl_cftypes = dfl_files, .early_init = true, + .threaded = true, }; /** -- cgit From 58b7484250db8da0d49bdd289c877ccd3319575c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 Nov 2018 10:08:36 -0500 Subject: cpuset: Define data structures to support scheduling partition >From a cpuset point of view, a scheduling partition is a group of cpusets with their own set of exclusive CPUs that are not shared by other tasks outside the scheduling partition. In the legacy hierarchy, scheduling partitions are supported indirectly via the right use of the load balancing and the exclusive CPUs flag which is not intuitive and can be hard to use. To fully support the concept of scheduling partitions in the default hierarchy, we need to add some new field into the cpuset structure as well as a new tmpmasks structure that is used to pre-allocate cpumasks at the top level cpuset functions to avoid memory allocation in inner functions as memory allocation failure in those inner functions may cause a cpuset to have inconsistent states. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2b5c4477d969..29a2bdc671fd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -109,6 +109,13 @@ struct cpuset { cpumask_var_t effective_cpus; nodemask_t effective_mems; + /* + * CPUs allocated to child sub-partitions (default hierarchy only) + * - CPUs granted by the parent = effective_cpus U subparts_cpus + * - effective_cpus and subparts_cpus are mutually exclusive. + */ + cpumask_var_t subparts_cpus; + /* * This is old Memory Nodes tasks took on. * @@ -134,6 +141,30 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; + + /* number of CPUs in subparts_cpus */ + int nr_subparts_cpus; + + /* partition root state */ + int partition_root_state; +}; + +/* + * Partition root states: + * + * 0 - not a partition root + * 1 - partition root + */ +#define PRS_DISABLED 0 +#define PRS_ENABLED 1 + +/* + * Temporary cpumasks for working with partitions that are passed among + * functions to avoid memory allocation in inner functions. + */ +struct tmpmasks { + cpumask_var_t addmask, delmask; /* For partition root */ + cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ }; static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) @@ -218,9 +249,15 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } +static inline int is_partition_root(const struct cpuset *cs) +{ + return cs->partition_root_state; +} + static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), + .partition_root_state = PRS_ENABLED, }; /** -- cgit From bf92370c035d5ed73a90927450c20a07adf08cfd Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 Nov 2018 10:08:37 -0500 Subject: cpuset: Simply allocation and freeing of cpumasks The previous commit introduces a new subparts_cpus mask into the cpuset data structure and a new tmpmasks structure. Managing the allocation and freeing of those cpumasks is becoming more complex. So a number of helper functions are added to simplify and streamline the management of those cpumasks. To make it simple, all the cpumasks are now pre-cleared on allocation. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 110 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 77 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 29a2bdc671fd..2ac9437ce8f1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -455,6 +455,65 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) is_mem_exclusive(p) <= is_mem_exclusive(q); } +/** + * alloc_cpumasks - allocate three cpumasks for cpuset + * @cs: the cpuset that have cpumasks to be allocated. + * @tmp: the tmpmasks structure pointer + * Return: 0 if successful, -ENOMEM otherwise. + * + * Only one of the two input arguments should be non-NULL. + */ +static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +{ + cpumask_var_t *pmask1, *pmask2, *pmask3; + + if (cs) { + pmask1 = &cs->cpus_allowed; + pmask2 = &cs->effective_cpus; + pmask3 = &cs->subparts_cpus; + } else { + pmask1 = &tmp->new_cpus; + pmask2 = &tmp->addmask; + pmask3 = &tmp->delmask; + } + + if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) + return -ENOMEM; + + if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) + goto free_one; + + if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) + goto free_two; + + return 0; + +free_two: + free_cpumask_var(*pmask2); +free_one: + free_cpumask_var(*pmask1); + return -ENOMEM; +} + +/** + * free_cpumasks - free cpumasks in a tmpmasks structure + * @cs: the cpuset that have cpumasks to be free. + * @tmp: the tmpmasks structure pointer + */ +static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +{ + if (cs) { + free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->effective_cpus); + free_cpumask_var(cs->subparts_cpus); + } + if (tmp) { + free_cpumask_var(tmp->new_cpus); + free_cpumask_var(tmp->addmask); + free_cpumask_var(tmp->delmask); + } +} + /** * alloc_trial_cpuset - allocate a trial cpuset * @cs: the cpuset that the trial cpuset duplicates @@ -467,31 +526,24 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) if (!trial) return NULL; - if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) - goto free_cs; - if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) - goto free_cpus; + if (alloc_cpumasks(trial, NULL)) { + kfree(trial); + return NULL; + } cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; - -free_cpus: - free_cpumask_var(trial->cpus_allowed); -free_cs: - kfree(trial); - return NULL; } /** - * free_trial_cpuset - free the trial cpuset - * @trial: the trial cpuset to be freed + * free_cpuset - free the cpuset + * @cs: the cpuset to be freed */ -static void free_trial_cpuset(struct cpuset *trial) +static inline void free_cpuset(struct cpuset *cs) { - free_cpumask_var(trial->effective_cpus); - free_cpumask_var(trial->cpus_allowed); - kfree(trial); + free_cpumasks(cs, NULL); + kfree(cs); } /* @@ -1385,7 +1437,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, if (spread_flag_changed) update_tasks_flags(cs); out: - free_trial_cpuset(trialcs); + free_cpuset(trialcs); return err; } @@ -1769,7 +1821,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, break; } - free_trial_cpuset(trialcs); + free_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); @@ -2024,26 +2076,19 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); - if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) - goto free_cs; - if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) - goto free_cpus; + + if (alloc_cpumasks(cs, NULL)) { + kfree(cs); + return ERR_PTR(-ENOMEM); + } set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); - cpumask_clear(cs->effective_cpus); nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; return &cs->css; - -free_cpus: - free_cpumask_var(cs->cpus_allowed); -free_cs: - kfree(cs); - return ERR_PTR(-ENOMEM); } static int cpuset_css_online(struct cgroup_subsys_state *css) @@ -2134,9 +2179,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->cpus_allowed); - kfree(cs); + free_cpuset(cs); } static void cpuset_bind(struct cgroup_subsys_state *root_css) @@ -2200,6 +2243,7 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); -- cgit From ee8dde0cd2ce78b62d16aec1c29960b64380e634 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 Nov 2018 10:08:38 -0500 Subject: cpuset: Add new v2 cpuset.sched.partition flag A new cpuset.sched.partition boolean flag is added to cpuset v2. This new flag, if set, indicates that the cgroup is the root of a new scheduling domain or partition that includes itself and all its descendants except those that are scheduling domain roots themselves and their descendants. With this new flag, one can directly create as many partitions as necessary without ever using the v1 trick of turning off load balancing in specific cpusets to create partitions as a side effect. This new flag is owned by the parent and will cause the CPUs in the cpuset to be removed from the effective CPUs of its parent. This is implemented internally by adding a new subparts_cpus mask that holds the CPUs belonging to child partitions so that: subparts_cpus | effective_cpus = cpus_allowed subparts_cpus & effective_cpus = 0 This new flag can only be turned on in a cpuset if its parent is a partition root itself. The state of this flag cannot be changed if the cpuset has children. Once turned on, further changes to "cpuset.cpus" is allowed as long as there is at least one CPU left that can be granted from the parent and a child partition root cannot use up all the CPUs in the parent's effective_cpus. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 365 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 352 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2ac9437ce8f1..85e04162adcb 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -970,10 +970,191 @@ static void update_tasks_cpumask(struct cpuset *cs) css_task_iter_end(&it); } +/** + * compute_effective_cpumask - Compute the effective cpumask of the cpuset + * @new_cpus: the temp variable for the new effective_cpus mask + * @cs: the cpuset the need to recompute the new effective_cpus mask + * @parent: the parent cpuset + * + * If the parent has subpartition CPUs, include them in the list of + * allowable CPUs in computing the new effective_cpus mask. + */ +static void compute_effective_cpumask(struct cpumask *new_cpus, + struct cpuset *cs, struct cpuset *parent) +{ + if (parent->nr_subparts_cpus) { + cpumask_or(new_cpus, parent->effective_cpus, + parent->subparts_cpus); + cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); + } else { + cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); + } +} + +/* + * Commands for update_parent_subparts_cpumask + */ +enum subparts_cmd { + partcmd_enable, /* Enable partition root */ + partcmd_disable, /* Disable partition root */ + partcmd_update, /* Update parent's subparts_cpus */ +}; + +/** + * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset + * @cpuset: The cpuset that requests change in partition root state + * @cmd: Partition root state change command + * @newmask: Optional new cpumask for partcmd_update + * @tmp: Temporary addmask and delmask + * Return: 0, 1 or an error code + * + * For partcmd_enable, the cpuset is being transformed from a non-partition + * root to a partition root. The cpus_allowed mask of the given cpuset will + * be put into parent's subparts_cpus and taken away from parent's + * effective_cpus. The function will return 0 if all the CPUs listed in + * cpus_allowed can be granted or an error code will be returned. + * + * For partcmd_disable, the cpuset is being transofrmed from a partition + * root back to a non-partition root. any CPUs in cpus_allowed that are in + * parent's subparts_cpus will be taken away from that cpumask and put back + * into parent's effective_cpus. 0 should always be returned. + * + * For partcmd_update, if the optional newmask is specified, the cpu + * list is to be changed from cpus_allowed to newmask. Otherwise, + * cpus_allowed is assumed to remain the same. The function will return + * 1 if changes to parent's subparts_cpus and effective_cpus happen or 0 + * otherwise. In case of error, an error code will be returned. + * + * The partcmd_enable and partcmd_disable commands are used by + * update_prstate(). The partcmd_update command is used by + * update_cpumasks_hier() with newmask NULL and update_cpumask() with + * newmask set. + * + * The checking is more strict when enabling partition root than the + * other two commands. + * + * Because of the implicit cpu exclusive nature of a partition root, + * cpumask changes that violates the cpu exclusivity rule will not be + * permitted when checked by validate_change(). The validate_change() + * function will also prevent any changes to the cpu list if it is not + * a superset of children's cpu lists. + */ +static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, + struct cpumask *newmask, + struct tmpmasks *tmp) +{ + struct cpuset *parent = parent_cs(cpuset); + int adding; /* Moving cpus from effective_cpus to subparts_cpus */ + int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ + + lockdep_assert_held(&cpuset_mutex); + + /* + * The parent must be a partition root. + * The new cpumask, if present, or the current cpus_allowed must + * not be empty. + */ + if (!is_partition_root(parent) || + (newmask && cpumask_empty(newmask)) || + (!newmask && cpumask_empty(cpuset->cpus_allowed))) + return -EINVAL; + + /* + * Enabling/disabling partition root is not allowed if there are + * online children. + */ + if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css)) + return -EBUSY; + + /* + * Enabling partition root is not allowed if not all the CPUs + * can be granted from parent's effective_cpus or at least one + * CPU will be left after that. + */ + if ((cmd == partcmd_enable) && + (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) || + cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus))) + return -EINVAL; + + /* + * A cpumask update cannot make parent's effective_cpus become empty. + */ + adding = deleting = false; + if (cmd == partcmd_enable) { + cpumask_copy(tmp->addmask, cpuset->cpus_allowed); + adding = true; + } else if (cmd == partcmd_disable) { + deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, + parent->subparts_cpus); + } else if (newmask) { + /* + * partcmd_update with newmask: + * + * delmask = cpus_allowed & ~newmask & parent->subparts_cpus + * addmask = newmask & parent->effective_cpus + * & ~parent->subparts_cpus + */ + cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask); + deleting = cpumask_and(tmp->delmask, tmp->delmask, + parent->subparts_cpus); + + cpumask_and(tmp->addmask, newmask, parent->effective_cpus); + adding = cpumask_andnot(tmp->addmask, tmp->addmask, + parent->subparts_cpus); + /* + * Return error if the new effective_cpus could become empty. + */ + if (adding && !deleting && + cpumask_equal(parent->effective_cpus, tmp->addmask)) + return -EINVAL; + } else { + /* + * partcmd_update w/o newmask: + * + * addmask = cpus_allowed & parent->effectiveb_cpus + * + * Note that parent's subparts_cpus may have been + * pre-shrunk in case the CPUs granted to the parent + * by the grandparent changes. So no deletion is needed. + */ + adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed, + parent->effective_cpus); + if (cpumask_equal(tmp->addmask, parent->effective_cpus)) + return -EINVAL; + } + + if (!adding && !deleting) + return 0; + + /* + * Change the parent's subparts_cpus. + * Newly added CPUs will be removed from effective_cpus and + * newly deleted ones will be added back to effective_cpus. + */ + spin_lock_irq(&callback_lock); + if (adding) { + cpumask_or(parent->subparts_cpus, + parent->subparts_cpus, tmp->addmask); + cpumask_andnot(parent->effective_cpus, + parent->effective_cpus, tmp->addmask); + } + if (deleting) { + cpumask_andnot(parent->subparts_cpus, + parent->subparts_cpus, tmp->delmask); + cpumask_or(parent->effective_cpus, + parent->effective_cpus, tmp->delmask); + } + + parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); + spin_unlock_irq(&callback_lock); + + return cmd == partcmd_update; +} + /* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree - * @cs: the cpuset to consider - * @new_cpus: temp variable for calculating new effective_cpus + * @cs: the cpuset to consider + * @tmp: temp variables for calculating effective_cpus & partition setup * * When congifured cpumask is changed, the effective cpumasks of this cpuset * and all its descendants need to be updated. @@ -982,7 +1163,7 @@ static void update_tasks_cpumask(struct cpuset *cs) * * Called with cpuset_mutex held */ -static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) +static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -991,28 +1172,61 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); + bool cs_empty; + + compute_effective_cpumask(tmp->new_cpus, cp, parent); + cs_empty = cpumask_empty(tmp->new_cpus); - cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus); + /* + * A partition root cannot have empty effective_cpus + */ + WARN_ON_ONCE(cs_empty && is_partition_root(cp)); /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (is_in_v2_mode() && cpumask_empty(new_cpus)) - cpumask_copy(new_cpus, parent->effective_cpus); + if (is_in_v2_mode() && cs_empty) + cpumask_copy(tmp->new_cpus, parent->effective_cpus); - /* Skip the whole subtree if the cpumask remains the same. */ - if (cpumask_equal(new_cpus, cp->effective_cpus)) { + /* + * Skip the whole subtree if the cpumask remains the same + * and has no partition root state. + */ + if (!is_partition_root(cp) && + cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; } + /* + * update_parent_subparts_cpumask() should have been called + * for cs already in update_cpumask(). We should also call + * update_tasks_cpumask() again for tasks in the parent + * cpuset if the parent's subparts_cpus changes. + */ + if ((cp != cs) && cp->partition_root_state && + update_parent_subparts_cpumask(cp, partcmd_update, + NULL, tmp)) { + if (parent != &top_cpuset) + update_tasks_cpumask(parent); + } + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); spin_lock_irq(&callback_lock); - cpumask_copy(cp->effective_cpus, new_cpus); + + cpumask_copy(cp->effective_cpus, tmp->new_cpus); + if (cp->nr_subparts_cpus) { + /* + * Make sure that effective_cpus & subparts_cpus + * are mutually exclusive. + */ + cpumask_andnot(cp->effective_cpus, cp->effective_cpus, + cp->subparts_cpus); + } spin_unlock_irq(&callback_lock); WARN_ON(!is_in_v2_mode() && @@ -1047,6 +1261,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; + struct tmpmasks tmp; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) @@ -1078,12 +1293,39 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; +#ifdef CONFIG_CPUMASK_OFFSTACK + /* + * Use the cpumasks in trialcs for tmpmasks when they are pointers + * to allocated cpumasks. + */ + tmp.addmask = trialcs->subparts_cpus; + tmp.delmask = trialcs->effective_cpus; + tmp.new_cpus = trialcs->cpus_allowed; +#endif + + if (cs->partition_root_state) { + /* Cpumask of a partition root cannot be empty */ + if (cpumask_empty(trialcs->cpus_allowed)) + return -EINVAL; + if (update_parent_subparts_cpumask(cs, partcmd_update, + trialcs->cpus_allowed, &tmp) < 0) + return -EINVAL; + } + spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + + /* + * Make sure that subparts_cpus is a subset of cpus_allowed. + */ + if (cs->nr_subparts_cpus) { + cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus, + cs->cpus_allowed); + cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); + } spin_unlock_irq(&callback_lock); - /* use trialcs->cpus_allowed as a temp variable */ - update_cpumasks_hier(cs, trialcs->cpus_allowed); + update_cpumasks_hier(cs, &tmp); return 0; } @@ -1441,6 +1683,80 @@ out: return err; } +/* + * update_prstate - update partititon_root_state + * cs: the cpuset to update + * val: 0 - disabled, 1 - enabled + * + * Call with cpuset_mutex held. + */ +static int update_prstate(struct cpuset *cs, int val) +{ + int err; + struct cpuset *parent = parent_cs(cs); + struct tmpmasks tmp; + + if ((val != 0) && (val != 1)) + return -EINVAL; + if (val == cs->partition_root_state) + return 0; + + /* + * Cannot force a partial or erroneous partition root to a full + * partition root. + */ + if (val && cs->partition_root_state) + return -EINVAL; + + if (alloc_cpumasks(NULL, &tmp)) + return -ENOMEM; + + err = -EINVAL; + if (!cs->partition_root_state) { + /* + * Turning on partition root requires setting the + * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed + * cannot be NULL. + */ + if (cpumask_empty(cs->cpus_allowed)) + goto out; + + err = update_flag(CS_CPU_EXCLUSIVE, cs, 1); + if (err) + goto out; + + err = update_parent_subparts_cpumask(cs, partcmd_enable, + NULL, &tmp); + if (err) { + update_flag(CS_CPU_EXCLUSIVE, cs, 0); + goto out; + } + cs->partition_root_state = PRS_ENABLED; + } else { + err = update_parent_subparts_cpumask(cs, partcmd_disable, + NULL, &tmp); + if (err) + goto out; + + cs->partition_root_state = 0; + + /* Turning off CS_CPU_EXCLUSIVE will not return error */ + update_flag(CS_CPU_EXCLUSIVE, cs, 0); + } + + /* + * Update cpumask of parent's tasks except when it is the top + * cpuset as some system daemons cannot be mapped to other CPUs. + */ + if (parent != &top_cpuset) + update_tasks_cpumask(parent); + + rebuild_sched_domains_locked(); +out: + free_cpumasks(NULL, &tmp); + return err; +} + /* * Frequency meter - How fast is some event occurring? * @@ -1686,6 +2002,7 @@ typedef enum { FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, FILE_SCHED_LOAD_BALANCE, + FILE_PARTITION_ROOT, FILE_SCHED_RELAX_DOMAIN_LEVEL, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, @@ -1755,6 +2072,9 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, case FILE_SCHED_RELAX_DOMAIN_LEVEL: retval = update_relax_domain_level(cs, val); break; + case FILE_PARTITION_ROOT: + retval = update_prstate(cs, val); + break; default: retval = -EINVAL; break; @@ -1905,6 +2225,8 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: return cs->relax_domain_level; + case FILE_PARTITION_ROOT: + return cs->partition_root_state; default: BUG(); } @@ -2056,6 +2378,14 @@ static struct cftype dfl_files[] = { .flags = CFTYPE_NOT_ON_ROOT, }, + { + .name = "sched.partition", + .read_s64 = cpuset_read_s64, + .write_s64 = cpuset_write_s64, + .private = FILE_PARTITION_ROOT, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ }; @@ -2157,7 +2487,12 @@ out_unlock: /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). + * will call rebuild_sched_domains_locked(). That is not needed + * in the default hierarchy where only changes in partition + * will cause repartitioning. + * + * If the cpuset has the 'sched.partition' flag enabled, simulate + * turning 'sched.partition" off. */ static void cpuset_css_offline(struct cgroup_subsys_state *css) @@ -2166,7 +2501,11 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) mutex_lock(&cpuset_mutex); - if (is_sched_load_balance(cs)) + if (is_partition_root(cs)) + update_prstate(cs, 0); + + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); -- cgit From 3881b86128d0be22e8947ac1fca0429c74bf055e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 Nov 2018 10:08:39 -0500 Subject: cpuset: Add an error state to cpuset.sched.partition When external events like CPU offlining or user events like changing the cpu list of an ancestor cpuset happen, update_cpumasks_hier() will be called to update the effective cpus of each of the affected cpusets. That will then call update_parent_subparts_cpumask() if partitions are impacted. Currently, these events may cause update_parent_subparts_cpumask() to return error if none of the requested cpus are available or it will consume all the cpus in the parent partition root. Handling these errors is problematic as the states may become inconsistent. Instead of letting update_parent_subparts_cpumask() return error, a new error state (-1) is added to the partition_root_state flag to designate the fact that the partition is no longer valid. IOW, it is no longer a real partition root, but the CS_CPU_EXCLUSIVE flag will still be set as it can be changed back to a real one if favorable change happens later on. This new error state is set internally and user cannot write this new value to "cpuset.sched.partition". Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 153 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 129 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 85e04162adcb..ef41f58d7cdf 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -153,10 +153,19 @@ struct cpuset { * Partition root states: * * 0 - not a partition root + * * 1 - partition root + * + * -1 - invalid partition root + * None of the cpus in cpus_allowed can be put into the parent's + * subparts_cpus. In this case, the cpuset is not a real partition + * root anymore. However, the CPU_EXCLUSIVE bit will still be set + * and the cpuset can be restored back to a partition root if the + * parent cpuset can give more CPUs back to this child cpuset. */ #define PRS_DISABLED 0 #define PRS_ENABLED 1 +#define PRS_ERROR -1 /* * Temporary cpumasks for working with partitions that are passed among @@ -251,7 +260,7 @@ static inline int is_spread_slab(const struct cpuset *cs) static inline int is_partition_root(const struct cpuset *cs) { - return cs->partition_root_state; + return cs->partition_root_state > 0; } static struct cpuset top_cpuset = { @@ -1021,9 +1030,12 @@ enum subparts_cmd { * * For partcmd_update, if the optional newmask is specified, the cpu * list is to be changed from cpus_allowed to newmask. Otherwise, - * cpus_allowed is assumed to remain the same. The function will return - * 1 if changes to parent's subparts_cpus and effective_cpus happen or 0 - * otherwise. In case of error, an error code will be returned. + * cpus_allowed is assumed to remain the same. The cpuset should either + * be a partition root or an invalid partition root. The partition root + * state may change if newmask is NULL and none of the requested CPUs can + * be granted by the parent. The function will return 1 if changes to + * parent's subparts_cpus and effective_cpus happen or 0 otherwise. + * Error code should only be returned when newmask is non-NULL. * * The partcmd_enable and partcmd_disable commands are used by * update_prstate(). The partcmd_update command is used by @@ -1046,6 +1058,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpuset *parent = parent_cs(cpuset); int adding; /* Moving cpus from effective_cpus to subparts_cpus */ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ + bool part_error = false; /* Partition error? */ lockdep_assert_held(&cpuset_mutex); @@ -1114,13 +1127,48 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * addmask = cpus_allowed & parent->effectiveb_cpus * * Note that parent's subparts_cpus may have been - * pre-shrunk in case the CPUs granted to the parent - * by the grandparent changes. So no deletion is needed. + * pre-shrunk in case there is a change in the cpu list. + * So no deletion is needed. */ adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed, parent->effective_cpus); - if (cpumask_equal(tmp->addmask, parent->effective_cpus)) - return -EINVAL; + part_error = cpumask_equal(tmp->addmask, + parent->effective_cpus); + } + + if (cmd == partcmd_update) { + int prev_prs = cpuset->partition_root_state; + + /* + * Check for possible transition between PRS_ENABLED + * and PRS_ERROR. + */ + switch (cpuset->partition_root_state) { + case PRS_ENABLED: + if (part_error) + cpuset->partition_root_state = PRS_ERROR; + break; + case PRS_ERROR: + if (!part_error) + cpuset->partition_root_state = PRS_ENABLED; + break; + } + /* + * Set part_error if previously in invalid state. + */ + part_error = (prev_prs == PRS_ERROR); + } + + if (!part_error && (cpuset->partition_root_state == PRS_ERROR)) + return 0; /* Nothing need to be done */ + + if (cpuset->partition_root_state == PRS_ERROR) { + /* + * Remove all its cpus from parent's subparts_cpus. + */ + adding = false; + deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, + parent->subparts_cpus); } if (!adding && !deleting) @@ -1172,28 +1220,21 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); - bool cs_empty; compute_effective_cpumask(tmp->new_cpus, cp, parent); - cs_empty = cpumask_empty(tmp->new_cpus); - - /* - * A partition root cannot have empty effective_cpus - */ - WARN_ON_ONCE(cs_empty && is_partition_root(cp)); /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (is_in_v2_mode() && cs_empty) + if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) cpumask_copy(tmp->new_cpus, parent->effective_cpus); /* * Skip the whole subtree if the cpumask remains the same * and has no partition root state. */ - if (!is_partition_root(cp) && + if (!cp->partition_root_state && cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; @@ -1205,11 +1246,44 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) * update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's subparts_cpus changes. */ - if ((cp != cs) && cp->partition_root_state && - update_parent_subparts_cpumask(cp, partcmd_update, - NULL, tmp)) { - if (parent != &top_cpuset) - update_tasks_cpumask(parent); + if ((cp != cs) && cp->partition_root_state) { + switch (parent->partition_root_state) { + case PRS_DISABLED: + /* + * If parent is not a partition root or an + * invalid partition root, clear the state + * state and the CS_CPU_EXCLUSIVE flag. + */ + WARN_ON_ONCE(cp->partition_root_state + != PRS_ERROR); + cp->partition_root_state = 0; + + /* + * clear_bit() is an atomic operation and + * readers aren't interested in the state + * of CS_CPU_EXCLUSIVE anyway. So we can + * just update the flag without holding + * the callback_lock. + */ + clear_bit(CS_CPU_EXCLUSIVE, &cp->flags); + break; + + case PRS_ENABLED: + if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp)) + update_tasks_cpumask(parent); + break; + + case PRS_ERROR: + /* + * When parent is invalid, it has to be too. + */ + cp->partition_root_state = PRS_ERROR; + if (cp->nr_subparts_cpus) { + cp->nr_subparts_cpus = 0; + cpumask_clear(cp->subparts_cpus); + } + break; + } } if (!css_tryget_online(&cp->css)) @@ -1219,13 +1293,33 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); - if (cp->nr_subparts_cpus) { + if (cp->nr_subparts_cpus && + (cp->partition_root_state != PRS_ENABLED)) { + cp->nr_subparts_cpus = 0; + cpumask_clear(cp->subparts_cpus); + } else if (cp->nr_subparts_cpus) { /* * Make sure that effective_cpus & subparts_cpus * are mutually exclusive. + * + * In the unlikely event that effective_cpus + * becomes empty. we clear cp->nr_subparts_cpus and + * let its child partition roots to compete for + * CPUs again. */ cpumask_andnot(cp->effective_cpus, cp->effective_cpus, cp->subparts_cpus); + if (cpumask_empty(cp->effective_cpus)) { + cpumask_copy(cp->effective_cpus, tmp->new_cpus); + cpumask_clear(cp->subparts_cpus); + cp->nr_subparts_cpus = 0; + } else if (!cpumask_subset(cp->subparts_cpus, + tmp->new_cpus)) { + cpumask_andnot(cp->subparts_cpus, + cp->subparts_cpus, tmp->new_cpus); + cp->nr_subparts_cpus + = cpumask_weight(cp->subparts_cpus); + } } spin_unlock_irq(&callback_lock); @@ -1702,7 +1796,7 @@ static int update_prstate(struct cpuset *cs, int val) return 0; /* - * Cannot force a partial or erroneous partition root to a full + * Cannot force a partial or invalid partition root to a full * partition root. */ if (val && cs->partition_root_state) @@ -1733,6 +1827,17 @@ static int update_prstate(struct cpuset *cs, int val) } cs->partition_root_state = PRS_ENABLED; } else { + /* + * Turning off partition root will clear the + * CS_CPU_EXCLUSIVE bit. + */ + if (cs->partition_root_state == PRS_ERROR) { + cs->partition_root_state = 0; + update_flag(CS_CPU_EXCLUSIVE, cs, 0); + err = 0; + goto out; + } + err = update_parent_subparts_cpumask(cs, partcmd_disable, NULL, &tmp); if (err) -- cgit From 4716909cc5c566e946a3acc884bf5dc469812007 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 Nov 2018 10:08:40 -0500 Subject: cpuset: Track cpusets that use parent's effective_cpus In the default hierarchy, a cpuset will use the parent's effective_cpus if none of the requested CPUs can be granted from the parent. That can be a problem if a parent is a partition root with children partition roots. Changes to a parent's effective_cpus list due to changes in a child partition root may not be properly reflected in a child cpuset that use parent's effective_cpus because the cpu_exclusive rule of a partition root will not guard against that. In order to avoid the mismatch, two new tracking variables are added to the cpuset structure to track if a cpuset uses parent's effective_cpus and the number of children cpusets that use its effective_cpus. So whenever cpumask changes are made to a parent, it will also check to see if it has other children cpusets that use its effective_cpus and call update_cpumasks_hier() if that is the case. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ef41f58d7cdf..21eaa896c1a9 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -147,6 +147,14 @@ struct cpuset { /* partition root state */ int partition_root_state; + + /* + * Default hierarchy only: + * use_parent_ecpus - set if using parent's effective_cpus + * child_ecpus_count - # of children with use_parent_ecpus set + */ + int use_parent_ecpus; + int child_ecpus_count; }; /* @@ -1227,8 +1235,17 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) + if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) { cpumask_copy(tmp->new_cpus, parent->effective_cpus); + if (!cp->use_parent_ecpus) { + cp->use_parent_ecpus = true; + parent->child_ecpus_count++; + } + } else if (cp->use_parent_ecpus) { + cp->use_parent_ecpus = false; + WARN_ON_ONCE(!parent->child_ecpus_count); + parent->child_ecpus_count--; + } /* * Skip the whole subtree if the cpumask remains the same @@ -1345,6 +1362,35 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) rebuild_sched_domains_locked(); } +/** + * update_sibling_cpumasks - Update siblings cpumasks + * @parent: Parent cpuset + * @cs: Current cpuset + * @tmp: Temp variables + */ +static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, + struct tmpmasks *tmp) +{ + struct cpuset *sibling; + struct cgroup_subsys_state *pos_css; + + /* + * Check all its siblings and call update_cpumasks_hier() + * if their use_parent_ecpus flag is set in order for them + * to use the right effective_cpus value. + */ + rcu_read_lock(); + cpuset_for_each_child(sibling, pos_css, parent) { + if (sibling == cs) + continue; + if (!sibling->use_parent_ecpus) + continue; + + update_cpumasks_hier(sibling, tmp); + } + rcu_read_unlock(); +} + /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider @@ -1420,6 +1466,17 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_unlock_irq(&callback_lock); update_cpumasks_hier(cs, &tmp); + + if (cs->partition_root_state) { + struct cpuset *parent = parent_cs(cs); + + /* + * For partition root, update the cpumasks of sibling + * cpusets if they use parent's effective_cpus. + */ + if (parent->child_ecpus_count) + update_sibling_cpumasks(parent, cs, &tmp); + } return 0; } @@ -1856,6 +1913,9 @@ static int update_prstate(struct cpuset *cs, int val) if (parent != &top_cpuset) update_tasks_cpumask(parent); + if (parent->child_ecpus_count) + update_sibling_cpumasks(parent, cs, &tmp); + rebuild_sched_domains_locked(); out: free_cpumasks(NULL, &tmp); @@ -2550,6 +2610,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; + cs->use_parent_ecpus = true; + parent->child_ecpus_count++; } spin_unlock_irq(&callback_lock); @@ -2613,6 +2675,13 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + if (cs->use_parent_ecpus) { + struct cpuset *parent = parent_cs(cs); + + cs->use_parent_ecpus = false; + parent->child_ecpus_count--; + } + cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); -- cgit From 4b842da276a8a1057aed7af6b2a5da471f840dd0 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 Nov 2018 10:08:41 -0500 Subject: cpuset: Make CPU hotplug work with partition When there is a cpu hotplug event (CPU online or offline), the partitions may need to be reconfigured and regenerated. So code is added to the hotplug functions to make them work with new subparts_cpus mask to compute the right effective_cpus for each of the affected cpusets. It may also change the state of a partition root from real one to an erroneous one or vice versa. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 131 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 116 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 21eaa896c1a9..82c88c60c83d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -113,6 +113,9 @@ struct cpuset { * CPUs allocated to child sub-partitions (default hierarchy only) * - CPUs granted by the parent = effective_cpus U subparts_cpus * - effective_cpus and subparts_cpus are mutually exclusive. + * + * effective_cpus contains only onlined CPUs, but subparts_cpus + * may have offlined ones. */ cpumask_var_t subparts_cpus; @@ -994,7 +997,9 @@ static void update_tasks_cpumask(struct cpuset *cs) * @parent: the parent cpuset * * If the parent has subpartition CPUs, include them in the list of - * allowable CPUs in computing the new effective_cpus mask. + * allowable CPUs in computing the new effective_cpus mask. Since offlined + * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask + * to mask those out. */ static void compute_effective_cpumask(struct cpumask *new_cpus, struct cpuset *cs, struct cpuset *parent) @@ -1003,6 +1008,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus, cpumask_or(new_cpus, parent->effective_cpus, parent->subparts_cpus); cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); + cpumask_and(new_cpus, new_cpus, cpu_active_mask); } else { cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); } @@ -1125,9 +1131,20 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, /* * Return error if the new effective_cpus could become empty. */ - if (adding && !deleting && - cpumask_equal(parent->effective_cpus, tmp->addmask)) - return -EINVAL; + if (adding && + cpumask_equal(parent->effective_cpus, tmp->addmask)) { + if (!deleting) + return -EINVAL; + /* + * As some of the CPUs in subparts_cpus might have + * been offlined, we need to compute the real delmask + * to confirm that. + */ + if (!cpumask_and(tmp->addmask, tmp->delmask, + cpu_active_mask)) + return -EINVAL; + cpumask_copy(tmp->addmask, parent->effective_cpus); + } } else { /* * partcmd_update w/o newmask: @@ -1197,6 +1214,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, if (deleting) { cpumask_andnot(parent->subparts_cpus, parent->subparts_cpus, tmp->delmask); + /* + * Some of the CPUs in subparts_cpus might have been offlined. + */ + cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, tmp->delmask); } @@ -2863,20 +2884,29 @@ hotplug_update_tasks(struct cpuset *cs, update_tasks_nodemask(cs); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest + * @tmp: the tmpmasks structure pointer * * Compare @cs's cpu and mem masks against top_cpuset and if some have gone * offline, update @cs accordingly. If @cs ends up with no CPU or memory, * all its tasks are moved to the nearest ancestor with both resources. */ -static void cpuset_hotplug_update_tasks(struct cpuset *cs) +static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) { static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated; bool mems_updated; + struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -2891,9 +2921,60 @@ retry: goto retry; } - cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); - nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); + parent = parent_cs(cs); + compute_effective_cpumask(&new_cpus, cs, parent); + nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); + + if (cs->nr_subparts_cpus) + /* + * Make sure that CPUs allocated to child partitions + * do not show up in effective_cpus. + */ + cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus); + + if (!tmp || !cs->partition_root_state) + goto update_tasks; + + /* + * In the unlikely event that a partition root has empty + * effective_cpus or its parent becomes erroneous, we have to + * transition it to the erroneous state. + */ + if (is_partition_root(cs) && (cpumask_empty(&new_cpus) || + (parent->partition_root_state == PRS_ERROR))) { + if (cs->nr_subparts_cpus) { + cs->nr_subparts_cpus = 0; + cpumask_clear(cs->subparts_cpus); + compute_effective_cpumask(&new_cpus, cs, parent); + } + /* + * If the effective_cpus is empty because the child + * partitions take away all the CPUs, we can keep + * the current partition and let the child partitions + * fight for available CPUs. + */ + if ((parent->partition_root_state == PRS_ERROR) || + cpumask_empty(&new_cpus)) { + update_parent_subparts_cpumask(cs, partcmd_disable, + NULL, tmp); + cs->partition_root_state = PRS_ERROR; + } + cpuset_force_rebuild(); + } + + /* + * On the other hand, an erroneous partition root may be transitioned + * back to a regular one or a partition root with no CPU allocated + * from the parent may change to erroneous. + */ + if (is_partition_root(parent) && + ((cs->partition_root_state == PRS_ERROR) || + !cpumask_intersects(&new_cpus, parent->subparts_cpus)) && + update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp)) + cpuset_force_rebuild(); + +update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); @@ -2907,13 +2988,6 @@ retry: mutex_unlock(&cpuset_mutex); } -static bool force_rebuild; - -void cpuset_force_rebuild(void) -{ - force_rebuild = true; -} - /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * @@ -2936,6 +3010,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static nodemask_t new_mems; bool cpus_updated, mems_updated; bool on_dfl = is_in_v2_mode(); + struct tmpmasks tmp, *ptmp = NULL; + + if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + ptmp = &tmp; mutex_lock(&cpuset_mutex); @@ -2943,6 +3021,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work) cpumask_copy(&new_cpus, cpu_active_mask); new_mems = node_states[N_MEMORY]; + /* + * If subparts_cpus is populated, it is likely that the check below + * will produce a false positive on cpus_updated when the cpu list + * isn't changed. It is extra work, but it is better to be safe. + */ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); @@ -2951,6 +3034,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work) spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + /* + * Make sure that CPUs allocated to child partitions + * do not show up in effective_cpus. If no CPU is left, + * we clear the subparts_cpus & let the child partitions + * fight for the CPUs again. + */ + if (top_cpuset.nr_subparts_cpus) { + if (cpumask_subset(&new_cpus, + top_cpuset.subparts_cpus)) { + top_cpuset.nr_subparts_cpus = 0; + cpumask_clear(top_cpuset.subparts_cpus); + } else { + cpumask_andnot(&new_cpus, &new_cpus, + top_cpuset.subparts_cpus); + } + } cpumask_copy(top_cpuset.effective_cpus, &new_cpus); spin_unlock_irq(&callback_lock); /* we don't mess with cpumasks of tasks in top_cpuset */ @@ -2979,7 +3078,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) continue; rcu_read_unlock(); - cpuset_hotplug_update_tasks(cs); + cpuset_hotplug_update_tasks(cs, ptmp); rcu_read_lock(); css_put(&cs->css); @@ -2992,6 +3091,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work) force_rebuild = false; rebuild_sched_domains(); } + + free_cpumasks(NULL, ptmp); } void cpuset_update_active_cpus(void) -- cgit From 0ccea8feb9807ba87b0405a826f6830a386706f5 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 Nov 2018 10:08:42 -0500 Subject: cpuset: Make generate_sched_domains() work with partition The generate_sched_domains() function is modified to make it work correctly with the newly introduced subparts_cpus mask for scheduling domains generation. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 82c88c60c83d..3960de7a75cc 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -769,13 +769,14 @@ static int generate_sched_domains(cpumask_var_t **domains, int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; + bool root_load_balance = is_sched_load_balance(&top_cpuset); doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (is_sched_load_balance(&top_cpuset)) { + if (root_load_balance && !top_cpuset.nr_subparts_cpus) { ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) @@ -798,6 +799,8 @@ static int generate_sched_domains(cpumask_var_t **domains, csn = 0; rcu_read_lock(); + if (root_load_balance) + csa[csn++] = &top_cpuset; cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { if (cp == &top_cpuset) continue; @@ -808,6 +811,9 @@ static int generate_sched_domains(cpumask_var_t **domains, * parent's cpus, so just skip them, and then we call * update_domain_attr_tree() to calc relax_domain_level of * the corresponding sched domain. + * + * If root is load-balancing, we can skip @cp if it + * is a subset of the root's effective_cpus. */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && @@ -815,11 +821,16 @@ static int generate_sched_domains(cpumask_var_t **domains, housekeeping_cpumask(HK_FLAG_DOMAIN)))) continue; + if (root_load_balance && + cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) + continue; + if (is_sched_load_balance(cp)) csa[csn++] = cp; - /* skip @cp's subtree */ - pos_css = css_rightmost_descendant(pos_css); + /* skip @cp's subtree if not a partition root */ + if (!is_partition_root(cp)) + pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); @@ -947,7 +958,12 @@ static void rebuild_sched_domains_locked(void) * passing doms with offlined cpu to partition_sched_domains(). * Anyways, hotplug work item will rebuild sched domains. */ - if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) + if (!top_cpuset.nr_subparts_cpus && + !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) + goto out; + + if (top_cpuset.nr_subparts_cpus && + !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) goto out; /* Generate domain masks and attrs */ @@ -1367,11 +1383,15 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) update_tasks_cpumask(cp); /* - * If the effective cpumask of any non-empty cpuset is changed, - * we need to rebuild sched domains. + * On legacy hierarchy, if the effective cpumask of any non- + * empty cpuset is changed, we need to rebuild sched domains. + * On default hierarchy, the cpuset needs to be a partition + * root as well. */ if (!cpumask_empty(cp->cpus_allowed) && - is_sched_load_balance(cp)) + is_sched_load_balance(cp) && + (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + is_partition_root(cp))) need_rebuild_sched_domains = true; rcu_read_lock(); -- cgit From 5776ceccd4de2a53dec740422a409e9e588c5a70 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 Nov 2018 10:08:43 -0500 Subject: cpuset: Expose cpus.effective and mems.effective on cgroup v2 root Because of the fact that setting the "cpuset.sched.partition" in a direct child of root can remove CPUs from the root's effective CPU list, it makes sense to know what CPUs are left in the root cgroup for scheduling purpose. So the "cpuset.cpus.effective" control file is now exposed in the v2 cgroup root. For consistency, the "cpuset.mems.effective" control file is exposed as well. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 4 ++-- kernel/cgroup/cpuset.c | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 01b70f69304e..595b0757ad2b 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1653,7 +1653,7 @@ Cpuset Interface Files and won't be affected by any CPU hotplug events. cpuset.cpus.effective - A read-only multiple values file which exists on non-root + A read-only multiple values file which exists on all cpuset-enabled cgroups. It lists the onlined CPUs that are actually granted to this @@ -1693,7 +1693,7 @@ Cpuset Interface Files and won't be affected by any memory nodes hotplug events. cpuset.mems.effective - A read-only multiple values file which exists on non-root + A read-only multiple values file which exists on all cpuset-enabled cgroups. It lists the onlined memory nodes that are actually granted to diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 3960de7a75cc..fc1a809cd5bb 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2574,14 +2574,12 @@ static struct cftype dfl_files[] = { .name = "cpus.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_CPULIST, - .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "mems.effective", .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_MEMLIST, - .flags = CFTYPE_NOT_ON_ROOT, }, { -- cgit From bb5b553c33cb3393f604483b103158bf7d01ca1c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 Nov 2018 10:08:44 -0500 Subject: cpuset: Use descriptive text when reading/writing cpuset.sched.partition Currently, cpuset.sched.partition returns the values, 0, 1 or -1 on read. A person who is not familiar with the partition code may not understand what they mean. In order to make cpuset.sched.partition more user-friendly, it will now display the following descriptive text on read: "root" - A partition root (top cpuset of a partition) "member" - A non-root member of a partition "root invalid" - An invalid partition root Note that there is at least one partition in the whole cgroup hierarchy. The top cpuset is the root of that partition. The rests are either a root if it starts a new partition or a member of a partition. The cpuset.sched.partition file will now also accept "root" and "member" besides 1 and 0 as valid input values. The "root invalid" value is internal only and cannot be written to the file. Suggested-by: Tejun Heo Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 58 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index fc1a809cd5bb..c739fda805e0 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2278,9 +2278,6 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, case FILE_SCHED_RELAX_DOMAIN_LEVEL: retval = update_relax_domain_level(cs, val); break; - case FILE_PARTITION_ROOT: - retval = update_prstate(cs, val); - break; default: retval = -EINVAL; break; @@ -2431,8 +2428,6 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: return cs->relax_domain_level; - case FILE_PARTITION_ROOT: - return cs->partition_root_state; default: BUG(); } @@ -2441,6 +2436,55 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) return 0; } +static int sched_partition_show(struct seq_file *seq, void *v) +{ + struct cpuset *cs = css_cs(seq_css(seq)); + + switch (cs->partition_root_state) { + case PRS_ENABLED: + seq_puts(seq, "root\n"); + break; + case PRS_DISABLED: + seq_puts(seq, "member\n"); + break; + case PRS_ERROR: + seq_puts(seq, "root invalid\n"); + break; + } + return 0; +} + +static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cpuset *cs = css_cs(of_css(of)); + int val; + int retval = -ENODEV; + + buf = strstrip(buf); + + /* + * Convert "root"/"1" to 1, and convert "member"/"0" to 0. + */ + if (!strcmp(buf, "root") || !strcmp(buf, "1")) + val = PRS_ENABLED; + else if (!strcmp(buf, "member") || !strcmp(buf, "0")) + val = PRS_DISABLED; + else + return -EINVAL; + + css_get(&cs->css); + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; + + retval = update_prstate(cs, val); +out_unlock: + mutex_unlock(&cpuset_mutex); + css_put(&cs->css); + return retval ?: nbytes; +} + /* * for the common functions, 'private' gives the type of file */ @@ -2584,8 +2628,8 @@ static struct cftype dfl_files[] = { { .name = "sched.partition", - .read_s64 = cpuset_read_s64, - .write_s64 = cpuset_write_s64, + .seq_show = sched_partition_show, + .write = sched_partition_write, .private = FILE_PARTITION_ROOT, .flags = CFTYPE_NOT_ON_ROOT, }, -- cgit From 5cf8114d6e90b3822be5eb6a2faedf99d1c08f77 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 Nov 2018 10:08:46 -0500 Subject: cpuset: Expose cpuset.cpus.subpartitions with cgroup_debug For debugging purpose, it will be useful to expose the content of the subparts_cpus as a read-only file to see if the code work correctly. However, subparts_cpus will not be used at all in most use cases. So adding a new cpuset file that clutters the cgroup directory may not be desirable. This is now being done by using the hidden "cgroup_debug" kernel command line option to expose a new "cpuset.cpus.subpartitions" file. That option was originally used by the debug controller to expose itself when configured into the kernel. This is now extended to set an internal flag used by cgroup_addrm_files(). A new CFTYPE_DEBUG flag can now be used to specify that a cgroup file should only be created when the "cgroup_debug" option is specified. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + kernel/cgroup/cgroup-internal.h | 2 ++ kernel/cgroup/cgroup.c | 14 +++++++++++++- kernel/cgroup/cpuset.c | 11 +++++++++++ kernel/cgroup/debug.c | 4 +--- 5 files changed, 28 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5e1694fe035b..8fcbae1b8db0 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -92,6 +92,7 @@ enum { CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ + CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 75568fcf2180..c950864016e2 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -11,6 +11,8 @@ #define TRACE_CGROUP_PATH_LEN 1024 extern spinlock_t trace_cgroup_path_lock; extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; +extern bool cgroup_debug; +extern void __init enable_debug_cgroup(void); /* * cgroup_path() takes a spin lock. It is good practice not to take diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 2e5d90dfcb49..ed7f0bfe6429 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -86,6 +86,7 @@ EXPORT_SYMBOL_GPL(css_set_lock); DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; +bool cgroup_debug __read_mostly; /* * Protects cgroup_idr and css_idr so that IDs can be released without @@ -3639,7 +3640,8 @@ restart: continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; - + if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) + continue; if (is_add) { ret = cgroup_add_file(css, cgrp, cft); if (ret) { @@ -5743,6 +5745,16 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +void __init __weak enable_debug_cgroup(void) { } + +static int __init enable_cgroup_debug(char *str) +{ + cgroup_debug = true; + enable_debug_cgroup(); + return 1; +} +__setup("cgroup_debug", enable_cgroup_debug); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c739fda805e0..b897314bab53 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2204,6 +2204,7 @@ typedef enum { FILE_MEMLIST, FILE_EFFECTIVE_CPULIST, FILE_EFFECTIVE_MEMLIST, + FILE_SUBPARTS_CPULIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, @@ -2382,6 +2383,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_EFFECTIVE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; + case FILE_SUBPARTS_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); + break; default: ret = -EINVAL; } @@ -2634,6 +2638,13 @@ static struct cftype dfl_files[] = { .flags = CFTYPE_NOT_ON_ROOT, }, + { + .name = "cpus.subpartitions", + .seq_show = cpuset_common_seq_show, + .private = FILE_SUBPARTS_CPULIST, + .flags = CFTYPE_DEBUG, + }, + { } /* terminate */ }; diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 9caeda610249..5f1b87330bee 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -373,11 +373,9 @@ struct cgroup_subsys debug_cgrp_subsys = { * On v2, debug is an implicit controller enabled by "cgroup_debug" boot * parameter. */ -static int __init enable_cgroup_debug(char *str) +void __init enable_debug_cgroup(void) { debug_cgrp_subsys.dfl_cftypes = debug_files; debug_cgrp_subsys.implicit_on_dfl = true; debug_cgrp_subsys.threaded = true; - return 1; } -__setup("cgroup_debug", enable_cgroup_debug); -- cgit From 042d4c70a203998697b34eaad1a99f6f09d09e4d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Oct 2018 07:43:22 -0700 Subject: rcu: Eliminate BUG_ON() for sync.c The sync.c file has a number of calls to BUG_ON(), which panics the kernel, which is not a good strategy for devices (like embedded) that don't have a way to capture console output. This commit therefore changes these BUG_ON() calls to WARN_ON_ONCE(), but does so quite naively. Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney Acked-by: Oleg Nesterov Cc: Peter Zijlstra --- kernel/rcu/sync.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 3f943efcf61c..a6ba446a9693 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -125,8 +125,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) rsp->gp_state = GP_PENDING; spin_unlock_irq(&rsp->rss_lock); - BUG_ON(need_wait && need_sync); - + WARN_ON_ONCE(need_wait && need_sync); if (need_sync) { gp_ops[rsp->gp_type].sync(); rsp->gp_state = GP_PASSED; @@ -139,7 +138,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) * Nobody has yet been allowed the 'fast' path and thus we can * avoid doing any sync(). The callback will get 'dropped'. */ - BUG_ON(rsp->gp_state != GP_PASSED); + WARN_ON_ONCE(rsp->gp_state != GP_PASSED); } } @@ -166,8 +165,8 @@ static void rcu_sync_func(struct rcu_head *rhp) struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); unsigned long flags; - BUG_ON(rsp->gp_state != GP_PASSED); - BUG_ON(rsp->cb_state == CB_IDLE); + WARN_ON_ONCE(rsp->gp_state != GP_PASSED); + WARN_ON_ONCE(rsp->cb_state == CB_IDLE); spin_lock_irqsave(&rsp->rss_lock, flags); if (rsp->gp_count) { @@ -225,7 +224,7 @@ void rcu_sync_dtor(struct rcu_sync *rsp) { int cb_state; - BUG_ON(rsp->gp_count); + WARN_ON_ONCE(rsp->gp_count); spin_lock_irq(&rsp->rss_lock); if (rsp->cb_state == CB_REPLAY) @@ -235,6 +234,6 @@ void rcu_sync_dtor(struct rcu_sync *rsp) if (cb_state != CB_IDLE) { gp_ops[rsp->gp_type].wait(); - BUG_ON(rsp->cb_state != CB_IDLE); + WARN_ON_ONCE(rsp->cb_state != CB_IDLE); } } -- cgit From 08543bda42ef06c5ee4cd74501c894aa7cc13ea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Oct 2018 08:04:03 -0700 Subject: rcu: Eliminate BUG_ON() for kernel/rcu/tree.c The tree.c file has a number of calls to BUG_ON(), which panics the kernel, which is not a good strategy for devices (like embedded) that don't have a way to capture console output. This commit therefore converts these BUG_ON() calls to WARN_ON_ONCE() and WARN_ONCE(). Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 121f833acd04..bdb6659a8dbc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2826,7 +2826,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * Very early boot, before rcu_init(). Initialize if needed * and then drop through to queue the callback. */ - BUG_ON(cpu != -1); + WARN_ON_ONCE(cpu != -1); WARN_ON_ONCE(!rcu_is_watching()); if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); @@ -3485,7 +3485,8 @@ static int __init rcu_spawn_gp_kthread(void) rcu_scheduler_fully_active = 1; t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); - BUG_ON(IS_ERR(t)); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__)) + return 0; rnp = rcu_get_root(); raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.gp_kthread = t; -- cgit From 75a8f72245223cce1571b0405b0881ca3c046df7 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 22 Sep 2018 19:41:25 -0400 Subject: rcu: Remove unused rcu_state externs The rcu_bh_state and rcu_sched_state variables were removed during the RCU flavor consolidations, but external declarations remain in tree.h. This commit therefore removes these obsolete declarations. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 703e19ff532d..57a937ac51c2 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -398,17 +398,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; #define RCU_NAME rcu_name #endif /* #else #ifdef CONFIG_TRACING */ -/* - * RCU implementation internal declarations: - */ -extern struct rcu_state rcu_sched_state; - -extern struct rcu_state rcu_bh_state; - -#ifdef CONFIG_PREEMPT_RCU -extern struct rcu_state rcu_preempt_state; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - int rcu_dynticks_snap(struct rcu_data *rdp); #ifdef CONFIG_RCU_BOOST -- cgit From adbccddb4a16b1dbf047d330ae1e78fd1ec80352 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 22 Sep 2018 19:41:26 -0400 Subject: rcu: Fix rcu_{node,data} comments about gp_seq_needed Recent changes have removed the old ->gp_seq_needed field from the rcu_state structure, which in turn obsoleted a couple of comments in the rcu_node and rcu_data structures. This commit therefore updates these comments accordingly. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 57a937ac51c2..c3e2807a834a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -57,7 +57,7 @@ struct rcu_node { /* some rcu_state fields as well as */ /* following. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ - unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed. */ + unsigned long gp_seq_needed; /* Track furthest future GP request. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ @@ -163,7 +163,7 @@ union rcu_noqs { struct rcu_data { /* 1) quiescent-state and grace-period handling : */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ - unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */ + unsigned long gp_seq_needed; /* Track furthest future GP request. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ -- cgit From 309ba859b95085f61f4f2a154df6be9cb9713a12 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jul 2018 14:36:49 -0700 Subject: rcu: Eliminate synchronize_rcu_mult() Now that synchronize_rcu() waits for both RCU read-side critical sections and preempt-disabled regions of code, the sole caller of synchronize_rcu_mult() can be replaced by synchronize_rcu(). This patch makes this change and removes synchronize_rcu_mult(). Note that _wait_rcu_gp() still supports synchronize_rcu_mult(), and thus might be simplified in the future to take only take a single call_rcu() function rather than the current list of them. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_wait.h | 17 ----------------- kernel/rcu/update.c | 6 ++---- kernel/sched/core.c | 2 +- 3 files changed, 3 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index 8a16c3eb3dd0..c0578ba23c1a 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -31,21 +31,4 @@ do { \ #define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__) -/** - * synchronize_rcu_mult - Wait concurrently for multiple grace periods - * @...: List of call_rcu() functions for different grace periods to wait on - * - * This macro waits concurrently for multiple types of RCU grace periods. - * For example, synchronize_rcu_mult(call_rcu, call_rcu_tasks) would wait - * on concurrent RCU and RCU-tasks grace periods. Waiting on a give SRCU - * domain requires you to write a wrapper function for that SRCU domain's - * call_srcu() function, supplying the corresponding srcu_struct. - * - * If Tiny RCU, tell _wait_rcu_gp() does not bother waiting for RCU, - * given that anywhere synchronize_rcu_mult() can be called is automatically - * a grace period. - */ -#define synchronize_rcu_mult(...) \ - _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) - #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f203b94f6b5b..c729ca5e6ee2 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -335,8 +335,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, /* Initialize and register callbacks for each crcu_array element. */ for (i = 0; i < n; i++) { if (checktiny && - (crcu_array[i] == call_rcu || - crcu_array[i] == call_rcu_bh)) { + (crcu_array[i] == call_rcu)) { might_sleep(); continue; } @@ -352,8 +351,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, /* Wait for all callbacks to be invoked. */ for (i = 0; i < n; i++) { if (checktiny && - (crcu_array[i] == call_rcu || - crcu_array[i] == call_rcu_bh)) + (crcu_array[i] == call_rcu)) continue; for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f12225f26b70..ea12ebc57840 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5788,7 +5788,7 @@ int sched_cpu_deactivate(unsigned int cpu) * * Do sync before park smpboot threads to take care the rcu boost case. */ - synchronize_rcu_mult(call_rcu, call_rcu_sched); + synchronize_rcu(); if (!sched_smp_initialized) return 0; -- cgit From d3ff3891b2edba63a7dee9023306bb66878fc3d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jul 2018 14:42:53 -0700 Subject: rcu: Consolidate the RCU update functions invoked by sync.c This commit retains all the various gp_ops[] entries, but makes their update functions all be synchronize_rcu(), call_rcu() and rcu_barrier(). The read-side checks remain consistent with the various RCU flavors, which still exist on the read side. Signed-off-by: Paul E. McKenney Cc: Oleg Nesterov Cc: Peter Zijlstra --- kernel/rcu/sync.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 3f943efcf61c..9d570b1892b0 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -44,15 +44,15 @@ static const struct { __INIT_HELD(rcu_read_lock_held) }, [RCU_SCHED_SYNC] = { - .sync = synchronize_sched, - .call = call_rcu_sched, - .wait = rcu_barrier_sched, + .sync = synchronize_rcu, + .call = call_rcu, + .wait = rcu_barrier, __INIT_HELD(rcu_read_lock_sched_held) }, [RCU_BH_SYNC] = { - .sync = synchronize_rcu_bh, - .call = call_rcu_bh, - .wait = rcu_barrier_bh, + .sync = synchronize_rcu, + .call = call_rcu, + .wait = rcu_barrier, __INIT_HELD(rcu_read_lock_bh_held) }, }; -- cgit From 78d125d33858d00756baf0f40e75e77dcbfbea55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jul 2018 15:36:43 -0700 Subject: sched/membarrier: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, the synchronize_sched() in sys_membarrier() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers --- kernel/sched/membarrier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 76e0eaf4654e..388a7a6c1aa2 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -298,7 +298,7 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) if (tick_nohz_full_enabled()) return -EINVAL; if (num_online_cpus() > 1) - synchronize_sched(); + synchronize_rcu(); return 0; case MEMBARRIER_CMD_GLOBAL_EXPEDITED: return membarrier_global_expedited(); -- cgit From 0607ba8403c4cdb253f8c5200ecf654dfb7790cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Apr 2018 13:01:25 -0700 Subject: srcu: Prevent __call_srcu() counter wrap with read-side critical section Ever since cdf7abc4610a ("srcu: Allow use of Tiny/Tree SRCU from both process and interrupt context"), it has been permissible to use SRCU read-side critical sections in interrupt context. This allows __call_srcu() to use SRCU read-side critical sections to prevent a new SRCU grace period from ending before the call to either srcu_funnel_gp_start() or srcu_funnel_exp_start completes, thus preventing SRCU grace-period counter overflow during that time. Note that this does not permit removal of the counter-wrap checks in srcu_gp_end(). These check are necessary to handle the case where a given CPU does not interact at all with SRCU for an extended time period. This commit therefore adds an SRCU read-side critical section to __call_srcu() in order to prevent grace period counter wrap during the funnel-locking process. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index a8846ed7f352..60f3236beaf7 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -858,6 +858,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rcu_callback_t func, bool do_norm) { unsigned long flags; + int idx; bool needexp = false; bool needgp = false; unsigned long s; @@ -871,6 +872,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, return; } rhp->func = func; + idx = srcu_read_lock(sp); local_irq_save(flags); sdp = this_cpu_ptr(sp->sda); spin_lock_rcu_node(sdp); @@ -892,6 +894,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, srcu_funnel_gp_start(sp, sdp, s, do_norm); else if (needexp) srcu_funnel_exp_start(sp, sdp->mynode, s); + srcu_read_unlock(sp, idx); } /** -- cgit From e647815a4d3b3be9d85b5750ed0f2947fd78fac7 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 8 Nov 2018 04:08:42 -0500 Subject: bpf: let verifier to calculate and record max_pkt_offset In check_packet_access, update max_pkt_offset after the offset has passed __check_packet_access. It should be safe to use u32 for max_pkt_offset as explained in code comment. Also, when there is tail call, the max_pkt_offset of the called program is unknown, so conservatively set max_pkt_offset to MAX_PACKET_OFF for such case. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 33014ae73103..b6a296e01f6a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -293,6 +293,7 @@ struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; + u32 max_pkt_offset; u32 stack_depth; u32 id; u32 func_cnt; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1971ca325fb4..75dab40b19a3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1455,6 +1455,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, verbose(env, "R%d offset is outside of the packet\n", regno); return err; } + + /* __check_packet_access has made sure "off + size - 1" is within u16. + * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, + * otherwise find_good_pkt_pointers would have refused to set range info + * that __check_packet_access would have rejected this pkt access. + * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. + */ + env->prog->aux->max_pkt_offset = + max_t(u32, env->prog->aux->max_pkt_offset, + off + reg->umax_value + size - 1); + return err; } @@ -6138,6 +6149,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ prog->cb_access = 1; env->prog->aux->stack_depth = MAX_BPF_STACK; + env->prog->aux->max_pkt_offset = MAX_PACKET_OFF; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpeter for every normal -- cgit From 1385d755cfb42f596ef1cf9f5c761010ff3b34e7 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:25 +0000 Subject: bpf: pass a struct with offload callbacks to bpf_offload_dev_create() For passing device functions for offloaded eBPF programs, there used to be no place where to store the pointer without making the non-offloaded programs pay a memory price. As a consequence, three functions were called with ndo_bpf() through specific commands. Now that we have struct bpf_offload_dev, and since none of those operations rely on RTNL, we can turn these three commands into hooks inside the struct bpf_prog_offload_ops, and pass them as part of bpf_offload_dev_create(). This commit effectively passes a pointer to the struct to bpf_offload_dev_create(). We temporarily have two struct bpf_prog_offload_ops instances, one under offdev->ops and one under offload->dev_ops. The next patches will make the transition towards the former, so that offload->dev_ops can be removed, and callbacks relying on ndo_bpf() added to offdev->ops as well. While at it, rename "nfp_bpf_analyzer_ops" as "nfp_bpf_dev_ops" (and similarly for netdevsim). Suggested-by: Jakub Kicinski Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 2 +- drivers/net/ethernet/netronome/nfp/bpf/main.h | 2 +- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 4 ++-- drivers/net/netdevsim/bpf.c | 6 +++--- include/linux/bpf.h | 3 ++- kernel/bpf/offload.c | 5 ++++- 6 files changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 6243af0ab025..dccae0319204 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -465,7 +465,7 @@ static int nfp_bpf_init(struct nfp_app *app) app->ctrl_mtu = nfp_bpf_ctrl_cmsg_mtu(bpf); } - bpf->bpf_dev = bpf_offload_dev_create(); + bpf->bpf_dev = bpf_offload_dev_create(&nfp_bpf_dev_ops); err = PTR_ERR_OR_ZERO(bpf->bpf_dev); if (err) goto err_free_neutral_maps; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index abdd93d14439..941277936475 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -513,7 +513,7 @@ int nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int nfp_bpf_finalize(struct bpf_verifier_env *env); -extern const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops; +extern const struct bpf_prog_offload_ops nfp_bpf_dev_ops; struct netdev_bpf; struct nfp_app; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index dc548bb4089e..2fca996a7e77 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -209,7 +209,7 @@ nfp_bpf_verifier_prep(struct nfp_app *app, struct nfp_net *nn, goto err_free; nfp_prog->verifier_meta = nfp_prog_first_meta(nfp_prog); - bpf->verifier.ops = &nfp_bpf_analyzer_ops; + bpf->verifier.ops = &nfp_bpf_dev_ops; return 0; @@ -602,7 +602,7 @@ int nfp_net_bpf_offload(struct nfp_net *nn, struct bpf_prog *prog, return 0; } -const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops = { +const struct bpf_prog_offload_ops nfp_bpf_dev_ops = { .insn_hook = nfp_verify_insn, .finalize = nfp_bpf_finalize, }; diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index cb3518474f0e..135aee864162 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -91,7 +91,7 @@ static int nsim_bpf_finalize(struct bpf_verifier_env *env) return 0; } -static const struct bpf_prog_offload_ops nsim_bpf_analyzer_ops = { +static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = { .insn_hook = nsim_bpf_verify_insn, .finalize = nsim_bpf_finalize, }; @@ -547,7 +547,7 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) if (err) return err; - bpf->verifier.ops = &nsim_bpf_analyzer_ops; + bpf->verifier.ops = &nsim_bpf_dev_ops; return 0; case BPF_OFFLOAD_TRANSLATE: state = bpf->offload.prog->aux->offload->dev_priv; @@ -599,7 +599,7 @@ int nsim_bpf_init(struct netdevsim *ns) if (IS_ERR_OR_NULL(ns->sdev->ddir_bpf_bound_progs)) return -ENOMEM; - ns->sdev->bpf_dev = bpf_offload_dev_create(); + ns->sdev->bpf_dev = bpf_offload_dev_create(&nsim_bpf_dev_ops); err = PTR_ERR_OR_ZERO(ns->sdev->bpf_dev); if (err) return err; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b6a296e01f6a..c0197c37b2b2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -692,7 +692,8 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); -struct bpf_offload_dev *bpf_offload_dev_create(void); +struct bpf_offload_dev * +bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops); void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev); int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 8e93c47f0779..d513fbf9ca53 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -33,6 +33,7 @@ static DECLARE_RWSEM(bpf_devs_lock); struct bpf_offload_dev { + const struct bpf_prog_offload_ops *ops; struct list_head netdevs; }; @@ -655,7 +656,8 @@ unlock: } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); -struct bpf_offload_dev *bpf_offload_dev_create(void) +struct bpf_offload_dev * +bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops) { struct bpf_offload_dev *offdev; int err; @@ -673,6 +675,7 @@ struct bpf_offload_dev *bpf_offload_dev_create(void) if (!offdev) return ERR_PTR(-ENOMEM); + offdev->ops = ops; INIT_LIST_HEAD(&offdev->netdevs); return offdev; -- cgit From 341b3e7b7b89315c43d262da3199098bcf9bbe57 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:26 +0000 Subject: bpf: call verify_insn from its callback in struct bpf_offload_dev We intend to remove the dev_ops in struct bpf_prog_offload, and to only keep the ops in struct bpf_offload_dev instead, which is accessible from more locations for passing function pointers. But dev_ops is used for calling the verify_insn hook. Switch to the newly added ops in struct bpf_prog_offload instead. To avoid table lookups for each eBPF instruction to verify, we remember the offdev attached to a netdev and modify bpf_offload_find_netdev() to avoid performing more than once a lookup for a given offload object. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/offload.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c0197c37b2b2..672714cd904f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -273,6 +273,7 @@ struct bpf_prog_offload_ops { struct bpf_prog_offload { struct bpf_prog *prog; struct net_device *netdev; + struct bpf_offload_dev *offdev; void *dev_priv; struct list_head offloads; bool dev_state; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index d513fbf9ca53..2cd3c0d0417b 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -107,6 +107,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) err = -EINVAL; goto err_unlock; } + offload->offdev = ondev->offdev; prog->aux->offload = offload; list_add_tail(&offload->offloads, &ondev->progs); dev_put(offload->netdev); @@ -167,7 +168,8 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) - ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); + ret = offload->offdev->ops->insn_hook(env, insn_idx, + prev_insn_idx); up_read(&bpf_devs_lock); return ret; -- cgit From 6dc18fa6f4cad69c892d6fb9499f7e41c6a88a8e Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:27 +0000 Subject: bpf: call finalize() from its callback in struct bpf_offload_dev In a way similar to the change previously brought to the verify_insn hook, switch to the newly added ops in struct bpf_prog_offload for calling the functions used to perform final verification steps for offloaded programs. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- kernel/bpf/offload.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2cd3c0d0417b..2c88cb4ddfd8 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -183,8 +183,8 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env) down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) { - if (offload->dev_ops->finalize) - ret = offload->dev_ops->finalize(env); + if (offload->offdev->ops->finalize) + ret = offload->offdev->ops->finalize(env); else ret = 0; } -- cgit From 00db12c3d141356a4d1e6b6f688e0d5ed3b1f757 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:28 +0000 Subject: bpf: call verifier_prep from its callback in struct bpf_offload_dev In a way similar to the change previously brought to the verify_insn hook and to the finalize callback, switch to the newly added ops in struct bpf_prog_offload for calling the functions used to prepare driver verifiers. Since the dev_ops pointer in struct bpf_prog_offload is no longer used by any callback, we can now remove it from struct bpf_prog_offload. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 11 ++++---- drivers/net/netdevsim/bpf.c | 32 +++++++++++++----------- include/linux/bpf.h | 2 +- include/linux/netdevice.h | 6 ----- kernel/bpf/offload.c | 22 +++++++--------- 5 files changed, 32 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 2fca996a7e77..16a3a9c55852 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -188,10 +188,11 @@ static void nfp_prog_free(struct nfp_prog *nfp_prog) } static int -nfp_bpf_verifier_prep(struct nfp_app *app, struct nfp_net *nn, - struct netdev_bpf *bpf) +nfp_bpf_verifier_prep(struct net_device *netdev, struct bpf_verifier_env *env) { - struct bpf_prog *prog = bpf->verifier.prog; + struct nfp_net *nn = netdev_priv(netdev); + struct bpf_prog *prog = env->prog; + struct nfp_app *app = nn->app; struct nfp_prog *nfp_prog; int ret; @@ -209,7 +210,6 @@ nfp_bpf_verifier_prep(struct nfp_app *app, struct nfp_net *nn, goto err_free; nfp_prog->verifier_meta = nfp_prog_first_meta(nfp_prog); - bpf->verifier.ops = &nfp_bpf_dev_ops; return 0; @@ -422,8 +422,6 @@ nfp_bpf_map_free(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap) int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf) { switch (bpf->command) { - case BPF_OFFLOAD_VERIFIER_PREP: - return nfp_bpf_verifier_prep(app, nn, bpf); case BPF_OFFLOAD_TRANSLATE: return nfp_bpf_translate(nn, bpf->offload.prog); case BPF_OFFLOAD_DESTROY: @@ -605,4 +603,5 @@ int nfp_net_bpf_offload(struct nfp_net *nn, struct bpf_prog *prog, const struct bpf_prog_offload_ops nfp_bpf_dev_ops = { .insn_hook = nfp_verify_insn, .finalize = nfp_bpf_finalize, + .prepare = nfp_bpf_verifier_prep, }; diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 135aee864162..d045b7d666d9 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -91,11 +91,6 @@ static int nsim_bpf_finalize(struct bpf_verifier_env *env) return 0; } -static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = { - .insn_hook = nsim_bpf_verify_insn, - .finalize = nsim_bpf_finalize, -}; - static bool nsim_xdp_offload_active(struct netdevsim *ns) { return ns->xdp_hw.prog; @@ -263,6 +258,17 @@ static int nsim_bpf_create_prog(struct netdevsim *ns, struct bpf_prog *prog) return 0; } +static int +nsim_bpf_verifier_prep(struct net_device *dev, struct bpf_verifier_env *env) +{ + struct netdevsim *ns = netdev_priv(dev); + + if (!ns->bpf_bind_accept) + return -EOPNOTSUPP; + + return nsim_bpf_create_prog(ns, env->prog); +} + static void nsim_bpf_destroy_prog(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state; @@ -275,6 +281,12 @@ static void nsim_bpf_destroy_prog(struct bpf_prog *prog) kfree(state); } +static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = { + .insn_hook = nsim_bpf_verify_insn, + .finalize = nsim_bpf_finalize, + .prepare = nsim_bpf_verifier_prep, +}; + static int nsim_setup_prog_checks(struct netdevsim *ns, struct netdev_bpf *bpf) { if (bpf->prog && bpf->prog->aux->offload) { @@ -539,16 +551,6 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) ASSERT_RTNL(); switch (bpf->command) { - case BPF_OFFLOAD_VERIFIER_PREP: - if (!ns->bpf_bind_accept) - return -EOPNOTSUPP; - - err = nsim_bpf_create_prog(ns, bpf->verifier.prog); - if (err) - return err; - - bpf->verifier.ops = &nsim_bpf_dev_ops; - return 0; case BPF_OFFLOAD_TRANSLATE: state = bpf->offload.prog->aux->offload->dev_priv; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 672714cd904f..f250494a4f56 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -268,6 +268,7 @@ struct bpf_prog_offload_ops { int (*insn_hook)(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); + int (*prepare)(struct net_device *netdev, struct bpf_verifier_env *env); }; struct bpf_prog_offload { @@ -277,7 +278,6 @@ struct bpf_prog_offload { void *dev_priv; struct list_head offloads; bool dev_state; - const struct bpf_prog_offload_ops *dev_ops; void *jited_image; u32 jited_len; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 857f8abf7b91..0fa2c2744928 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -863,7 +863,6 @@ enum bpf_netdev_command { XDP_QUERY_PROG, XDP_QUERY_PROG_HW, /* BPF program for offload callbacks, invoked at program load time. */ - BPF_OFFLOAD_VERIFIER_PREP, BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY, BPF_OFFLOAD_MAP_ALLOC, @@ -891,11 +890,6 @@ struct netdev_bpf { /* flags with which program was installed */ u32 prog_flags; }; - /* BPF_OFFLOAD_VERIFIER_PREP */ - struct { - struct bpf_prog *prog; - const struct bpf_prog_offload_ops *ops; /* callee set */ - } verifier; /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */ struct { struct bpf_prog *prog; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2c88cb4ddfd8..1f7ac00a494d 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -142,21 +142,17 @@ static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) { - struct netdev_bpf data = {}; - int err; - - data.verifier.prog = env->prog; + struct bpf_prog_offload *offload; + int ret = -ENODEV; - rtnl_lock(); - err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data); - if (err) - goto exit_unlock; + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) + ret = offload->offdev->ops->prepare(offload->netdev, env); + offload->dev_state = !ret; + up_read(&bpf_devs_lock); - env->prog->aux->offload->dev_ops = data.verifier.ops; - env->prog->aux->offload->dev_state = true; -exit_unlock: - rtnl_unlock(); - return err; + return ret; } int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, -- cgit From b07ade27e93360197e453e5ca80eebdc9099dcb5 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:29 +0000 Subject: bpf: pass translate() as a callback and remove its ndo_bpf subcommand As part of the transition from ndo_bpf() to callbacks attached to struct bpf_offload_dev for some of the eBPF offload operations, move the functions related to code translation to the struct and remove the subcommand that was used to call them through the NDO. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 11 +++-------- drivers/net/netdevsim/bpf.c | 14 +++++++++----- include/linux/bpf.h | 1 + include/linux/netdevice.h | 3 +-- kernel/bpf/offload.c | 14 +++++++------- 5 files changed, 21 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 16a3a9c55852..8653a2189c19 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -33,9 +33,6 @@ nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, struct nfp_bpf_neutral_map *record; int err; - /* Map record paths are entered via ndo, update side is protected. */ - ASSERT_RTNL(); - /* Reuse path - other offloaded program is already tracking this map. */ record = rhashtable_lookup_fast(&bpf->maps_neutral, &map->id, nfp_bpf_maps_neutral_params); @@ -84,8 +81,6 @@ nfp_map_ptrs_forget(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog) bool freed = false; int i; - ASSERT_RTNL(); - for (i = 0; i < nfp_prog->map_records_cnt; i++) { if (--nfp_prog->map_records[i]->count) { nfp_prog->map_records[i] = NULL; @@ -219,9 +214,10 @@ err_free: return ret; } -static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog) +static int nfp_bpf_translate(struct net_device *netdev, struct bpf_prog *prog) { struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv; + struct nfp_net *nn = netdev_priv(netdev); unsigned int max_instr; int err; @@ -422,8 +418,6 @@ nfp_bpf_map_free(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap) int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf) { switch (bpf->command) { - case BPF_OFFLOAD_TRANSLATE: - return nfp_bpf_translate(nn, bpf->offload.prog); case BPF_OFFLOAD_DESTROY: return nfp_bpf_destroy(nn, bpf->offload.prog); case BPF_OFFLOAD_MAP_ALLOC: @@ -604,4 +598,5 @@ const struct bpf_prog_offload_ops nfp_bpf_dev_ops = { .insn_hook = nfp_verify_insn, .finalize = nfp_bpf_finalize, .prepare = nfp_bpf_verifier_prep, + .translate = nfp_bpf_translate, }; diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index d045b7d666d9..30c2cd516d1c 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -269,6 +269,14 @@ nsim_bpf_verifier_prep(struct net_device *dev, struct bpf_verifier_env *env) return nsim_bpf_create_prog(ns, env->prog); } +static int nsim_bpf_translate(struct net_device *dev, struct bpf_prog *prog) +{ + struct nsim_bpf_bound_prog *state = prog->aux->offload->dev_priv; + + state->state = "xlated"; + return 0; +} + static void nsim_bpf_destroy_prog(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state; @@ -285,6 +293,7 @@ static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = { .insn_hook = nsim_bpf_verify_insn, .finalize = nsim_bpf_finalize, .prepare = nsim_bpf_verifier_prep, + .translate = nsim_bpf_translate, }; static int nsim_setup_prog_checks(struct netdevsim *ns, struct netdev_bpf *bpf) @@ -551,11 +560,6 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) ASSERT_RTNL(); switch (bpf->command) { - case BPF_OFFLOAD_TRANSLATE: - state = bpf->offload.prog->aux->offload->dev_priv; - - state->state = "xlated"; - return 0; case BPF_OFFLOAD_DESTROY: nsim_bpf_destroy_prog(bpf->offload.prog); return 0; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f250494a4f56..d1eb3c8a3fa9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -269,6 +269,7 @@ struct bpf_prog_offload_ops { int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); int (*prepare)(struct net_device *netdev, struct bpf_verifier_env *env); + int (*translate)(struct net_device *netdev, struct bpf_prog *prog); }; struct bpf_prog_offload { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0fa2c2744928..27499127e038 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -863,7 +863,6 @@ enum bpf_netdev_command { XDP_QUERY_PROG, XDP_QUERY_PROG_HW, /* BPF program for offload callbacks, invoked at program load time. */ - BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY, BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, @@ -890,7 +889,7 @@ struct netdev_bpf { /* flags with which program was installed */ u32 prog_flags; }; - /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */ + /* BPF_OFFLOAD_DESTROY */ struct { struct bpf_prog *prog; } offload; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 1f7ac00a494d..ae0167366c12 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -219,14 +219,14 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) static int bpf_prog_offload_translate(struct bpf_prog *prog) { - struct netdev_bpf data = {}; - int ret; - - data.offload.prog = prog; + struct bpf_prog_offload *offload; + int ret = -ENODEV; - rtnl_lock(); - ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); - rtnl_unlock(); + down_read(&bpf_devs_lock); + offload = prog->aux->offload; + if (offload) + ret = offload->offdev->ops->translate(offload->netdev, prog); + up_read(&bpf_devs_lock); return ret; } -- cgit From eb9119471efbf730c8f830f706026b486eb701dd Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:30 +0000 Subject: bpf: pass destroy() as a callback and remove its ndo_bpf subcommand As part of the transition from ndo_bpf() to callbacks attached to struct bpf_offload_dev for some of the eBPF offload operations, move the functions related to program destruction to the struct and remove the subcommand that was used to call them through the NDO. Remove function __bpf_offload_ndo(), which is no longer used. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 7 ++----- drivers/net/netdevsim/bpf.c | 4 +--- include/linux/bpf.h | 1 + include/linux/netdevice.h | 5 ----- kernel/bpf/offload.c | 24 +----------------------- 5 files changed, 5 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 8653a2189c19..91085cc3c843 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -238,15 +238,13 @@ static int nfp_bpf_translate(struct net_device *netdev, struct bpf_prog *prog) return nfp_map_ptrs_record(nfp_prog->bpf, nfp_prog, prog); } -static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog) +static void nfp_bpf_destroy(struct bpf_prog *prog) { struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv; kvfree(nfp_prog->prog); nfp_map_ptrs_forget(nfp_prog->bpf, nfp_prog); nfp_prog_free(nfp_prog); - - return 0; } /* Atomic engine requires values to be in big endian, we need to byte swap @@ -418,8 +416,6 @@ nfp_bpf_map_free(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap) int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf) { switch (bpf->command) { - case BPF_OFFLOAD_DESTROY: - return nfp_bpf_destroy(nn, bpf->offload.prog); case BPF_OFFLOAD_MAP_ALLOC: return nfp_bpf_map_alloc(app->priv, bpf->offmap); case BPF_OFFLOAD_MAP_FREE: @@ -599,4 +595,5 @@ const struct bpf_prog_offload_ops nfp_bpf_dev_ops = { .finalize = nfp_bpf_finalize, .prepare = nfp_bpf_verifier_prep, .translate = nfp_bpf_translate, + .destroy = nfp_bpf_destroy, }; diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 30c2cd516d1c..33e3d54c3a0a 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -294,6 +294,7 @@ static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = { .finalize = nsim_bpf_finalize, .prepare = nsim_bpf_verifier_prep, .translate = nsim_bpf_translate, + .destroy = nsim_bpf_destroy_prog, }; static int nsim_setup_prog_checks(struct netdevsim *ns, struct netdev_bpf *bpf) @@ -560,9 +561,6 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) ASSERT_RTNL(); switch (bpf->command) { - case BPF_OFFLOAD_DESTROY: - nsim_bpf_destroy_prog(bpf->offload.prog); - return 0; case XDP_QUERY_PROG: return xdp_attachment_query(&ns->xdp, bpf); case XDP_QUERY_PROG_HW: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d1eb3c8a3fa9..867d2801db64 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -270,6 +270,7 @@ struct bpf_prog_offload_ops { int (*finalize)(struct bpf_verifier_env *env); int (*prepare)(struct net_device *netdev, struct bpf_verifier_env *env); int (*translate)(struct net_device *netdev, struct bpf_prog *prog); + void (*destroy)(struct bpf_prog *prog); }; struct bpf_prog_offload { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 27499127e038..17d52a647fe5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -863,7 +863,6 @@ enum bpf_netdev_command { XDP_QUERY_PROG, XDP_QUERY_PROG_HW, /* BPF program for offload callbacks, invoked at program load time. */ - BPF_OFFLOAD_DESTROY, BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, XDP_QUERY_XSK_UMEM, @@ -889,10 +888,6 @@ struct netdev_bpf { /* flags with which program was installed */ u32 prog_flags; }; - /* BPF_OFFLOAD_DESTROY */ - struct { - struct bpf_prog *prog; - } offload; /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */ struct { struct bpf_offloaded_map *offmap; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index ae0167366c12..d665e75a0ac3 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -123,23 +123,6 @@ err_maybe_put: return err; } -static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, - struct netdev_bpf *data) -{ - struct bpf_prog_offload *offload = prog->aux->offload; - struct net_device *netdev; - - ASSERT_RTNL(); - - if (!offload) - return -ENODEV; - netdev = offload->netdev; - - data->command = cmd; - - return netdev->netdev_ops->ndo_bpf(netdev, data); -} - int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) { struct bpf_prog_offload *offload; @@ -192,12 +175,9 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env) static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_prog_offload *offload = prog->aux->offload; - struct netdev_bpf data = {}; - - data.offload.prog = prog; if (offload->dev_state) - WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + offload->offdev->ops->destroy(prog); /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ bpf_prog_free_id(prog, true); @@ -209,12 +189,10 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) void bpf_prog_offload_destroy(struct bpf_prog *prog) { - rtnl_lock(); down_write(&bpf_devs_lock); if (prog->aux->offload) __bpf_prog_offload_destroy(prog); up_write(&bpf_devs_lock); - rtnl_unlock(); } static int bpf_prog_offload_translate(struct bpf_prog *prog) -- cgit From a40a26322a83d4a26a99ad2616cbd77394c19587 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:31 +0000 Subject: bpf: pass prog instead of env to bpf_prog_offload_verifier_prep() Function bpf_prog_offload_verifier_prep(), called from the kernel BPF verifier to run a driver-specific callback for preparing for the verification step for offloaded programs, takes a pointer to a struct bpf_verifier_env object. However, no driver callback needs the whole structure at this time: the two drivers supporting this, nfp and netdevsim, only need a pointer to the struct bpf_prog instance held by env. Update the callback accordingly, on kernel side and in these two drivers. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 3 +-- drivers/net/netdevsim/bpf.c | 4 ++-- include/linux/bpf.h | 2 +- include/linux/bpf_verifier.h | 2 +- kernel/bpf/offload.c | 6 +++--- kernel/bpf/verifier.c | 2 +- 6 files changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 91085cc3c843..e6b26d2f651d 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -183,10 +183,9 @@ static void nfp_prog_free(struct nfp_prog *nfp_prog) } static int -nfp_bpf_verifier_prep(struct net_device *netdev, struct bpf_verifier_env *env) +nfp_bpf_verifier_prep(struct net_device *netdev, struct bpf_prog *prog) { struct nfp_net *nn = netdev_priv(netdev); - struct bpf_prog *prog = env->prog; struct nfp_app *app = nn->app; struct nfp_prog *nfp_prog; int ret; diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 33e3d54c3a0a..560bdaf1c98b 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -259,14 +259,14 @@ static int nsim_bpf_create_prog(struct netdevsim *ns, struct bpf_prog *prog) } static int -nsim_bpf_verifier_prep(struct net_device *dev, struct bpf_verifier_env *env) +nsim_bpf_verifier_prep(struct net_device *dev, struct bpf_prog *prog) { struct netdevsim *ns = netdev_priv(dev); if (!ns->bpf_bind_accept) return -EOPNOTSUPP; - return nsim_bpf_create_prog(ns, env->prog); + return nsim_bpf_create_prog(ns, prog); } static int nsim_bpf_translate(struct net_device *dev, struct bpf_prog *prog) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 867d2801db64..888111350d0e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -268,7 +268,7 @@ struct bpf_prog_offload_ops { int (*insn_hook)(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); - int (*prepare)(struct net_device *netdev, struct bpf_verifier_env *env); + int (*prepare)(struct net_device *netdev, struct bpf_prog *prog); int (*translate)(struct net_device *netdev, struct bpf_prog *prog); void (*destroy)(struct bpf_prog *prog); }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d93e89761a8b..11f5df1092d9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -245,7 +245,7 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) return cur_func(env)->regs; } -int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); +int bpf_prog_offload_verifier_prep(struct bpf_prog *prog); int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int bpf_prog_offload_finalize(struct bpf_verifier_env *env); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index d665e75a0ac3..397d206e184b 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -123,15 +123,15 @@ err_maybe_put: return err; } -int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); - offload = env->prog->aux->offload; + offload = prog->aux->offload; if (offload) - ret = offload->offdev->ops->prepare(offload->netdev, env); + ret = offload->offdev->ops->prepare(offload->netdev, prog); offload->dev_state = !ret; up_read(&bpf_devs_lock); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 75dab40b19a3..8d0977980cfa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6368,7 +6368,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) goto skip_full_check; if (bpf_prog_is_dev_bound(env->prog->aux)) { - ret = bpf_prog_offload_verifier_prep(env); + ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) goto skip_full_check; } -- cgit From 16a8cb5cffd0a2929ae97bc258d2d9c92a4e7f6d Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:32 +0000 Subject: bpf: do not pass netdev to translate() and prepare() offload callbacks The kernel functions to prepare verifier and translate for offloaded program retrieve "offload" from "prog", and "netdev" from "offload". Then both "prog" and "netdev" are passed to the callbacks. Simplify this by letting the drivers retrieve the net device themselves from the offload object attached to prog - if they need it at all. There is currently no need to pass the netdev as an argument to those functions. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 9 ++++----- drivers/net/netdevsim/bpf.c | 7 +++---- include/linux/bpf.h | 4 ++-- kernel/bpf/offload.c | 4 ++-- 4 files changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index e6b26d2f651d..f0283854fade 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -182,10 +182,9 @@ static void nfp_prog_free(struct nfp_prog *nfp_prog) kfree(nfp_prog); } -static int -nfp_bpf_verifier_prep(struct net_device *netdev, struct bpf_prog *prog) +static int nfp_bpf_verifier_prep(struct bpf_prog *prog) { - struct nfp_net *nn = netdev_priv(netdev); + struct nfp_net *nn = netdev_priv(prog->aux->offload->netdev); struct nfp_app *app = nn->app; struct nfp_prog *nfp_prog; int ret; @@ -213,10 +212,10 @@ err_free: return ret; } -static int nfp_bpf_translate(struct net_device *netdev, struct bpf_prog *prog) +static int nfp_bpf_translate(struct bpf_prog *prog) { + struct nfp_net *nn = netdev_priv(prog->aux->offload->netdev); struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv; - struct nfp_net *nn = netdev_priv(netdev); unsigned int max_instr; int err; diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 560bdaf1c98b..6a5b7bd9a1f9 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -258,10 +258,9 @@ static int nsim_bpf_create_prog(struct netdevsim *ns, struct bpf_prog *prog) return 0; } -static int -nsim_bpf_verifier_prep(struct net_device *dev, struct bpf_prog *prog) +static int nsim_bpf_verifier_prep(struct bpf_prog *prog) { - struct netdevsim *ns = netdev_priv(dev); + struct netdevsim *ns = netdev_priv(prog->aux->offload->netdev); if (!ns->bpf_bind_accept) return -EOPNOTSUPP; @@ -269,7 +268,7 @@ nsim_bpf_verifier_prep(struct net_device *dev, struct bpf_prog *prog) return nsim_bpf_create_prog(ns, prog); } -static int nsim_bpf_translate(struct net_device *dev, struct bpf_prog *prog) +static int nsim_bpf_translate(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state = prog->aux->offload->dev_priv; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 888111350d0e..987815152629 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -268,8 +268,8 @@ struct bpf_prog_offload_ops { int (*insn_hook)(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); - int (*prepare)(struct net_device *netdev, struct bpf_prog *prog); - int (*translate)(struct net_device *netdev, struct bpf_prog *prog); + int (*prepare)(struct bpf_prog *prog); + int (*translate)(struct bpf_prog *prog); void (*destroy)(struct bpf_prog *prog); }; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 397d206e184b..52c5617e3716 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -131,7 +131,7 @@ int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) down_read(&bpf_devs_lock); offload = prog->aux->offload; if (offload) - ret = offload->offdev->ops->prepare(offload->netdev, prog); + ret = offload->offdev->ops->prepare(prog); offload->dev_state = !ret; up_read(&bpf_devs_lock); @@ -203,7 +203,7 @@ static int bpf_prog_offload_translate(struct bpf_prog *prog) down_read(&bpf_devs_lock); offload = prog->aux->offload; if (offload) - ret = offload->offdev->ops->translate(offload->netdev, prog); + ret = offload->offdev->ops->translate(prog); up_read(&bpf_devs_lock); return ret; -- cgit From 46f53a65d2de3e1591636c22b626b09d8684fd71 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Sat, 10 Nov 2018 22:15:13 -0800 Subject: bpf: Allow narrow loads with offset > 0 Currently BPF verifier allows narrow loads for a context field only with offset zero. E.g. if there is a __u32 field then only the following loads are permitted: * off=0, size=1 (narrow); * off=0, size=2 (narrow); * off=0, size=4 (full). On the other hand LLVM can generate a load with offset different than zero that make sense from program logic point of view, but verifier doesn't accept it. E.g. tools/testing/selftests/bpf/sendmsg4_prog.c has code: #define DST_IP4 0xC0A801FEU /* 192.168.1.254 */ ... if ((ctx->user_ip4 >> 24) == (bpf_htonl(DST_IP4) >> 24) && where ctx is struct bpf_sock_addr. Some versions of LLVM can produce the following byte code for it: 8: 71 12 07 00 00 00 00 00 r2 = *(u8 *)(r1 + 7) 9: 67 02 00 00 18 00 00 00 r2 <<= 24 10: 18 03 00 00 00 00 00 fe 00 00 00 00 00 00 00 00 r3 = 4261412864 ll 12: 5d 32 07 00 00 00 00 00 if r2 != r3 goto +7 where `*(u8 *)(r1 + 7)` means narrow load for ctx->user_ip4 with size=1 and offset=3 (7 - sizeof(ctx->user_family) = 3). This load is currently rejected by verifier. Verifier code that rejects such loads is in bpf_ctx_narrow_access_ok() what means any is_valid_access implementation, that uses the function, works this way, e.g. bpf_skb_is_valid_access() for __sk_buff or sock_addr_is_valid_access() for bpf_sock_addr. The patch makes such loads supported. Offset can be in [0; size_default) but has to be multiple of load size. E.g. for __u32 field the following loads are supported now: * off=0, size=1 (narrow); * off=1, size=1 (narrow); * off=2, size=1 (narrow); * off=3, size=1 (narrow); * off=0, size=2 (narrow); * off=2, size=2 (narrow); * off=0, size=4 (full). Reported-by: Yonghong Song Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 16 +--------------- kernel/bpf/verifier.c | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index de629b706d1d..cc17f5f32fbb 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -668,24 +668,10 @@ static inline u32 bpf_ctx_off_adjust_machine(u32 size) return size; } -static inline bool bpf_ctx_narrow_align_ok(u32 off, u32 size_access, - u32 size_default) -{ - size_default = bpf_ctx_off_adjust_machine(size_default); - size_access = bpf_ctx_off_adjust_machine(size_access); - -#ifdef __LITTLE_ENDIAN - return (off & (size_default - 1)) == 0; -#else - return (off & (size_default - 1)) + size_access == size_default; -#endif -} - static inline bool bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) { - return bpf_ctx_narrow_align_ok(off, size, size_default) && - size <= size_default && (size & (size - 1)) == 0; + return size <= size_default && (size & (size - 1)) == 0; } #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d0977980cfa..b5222aa61d54 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5718,10 +5718,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; + u32 target_size, size_default, off; struct bpf_prog *new_prog; enum bpf_access_type type; bool is_narrower_load; - u32 target_size; if (ops->gen_prologue || env->seen_direct_write) { if (!ops->gen_prologue) { @@ -5814,9 +5814,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * we will apply proper mask to the result. */ is_narrower_load = size < ctx_field_size; + size_default = bpf_ctx_off_adjust_machine(ctx_field_size); + off = insn->off; if (is_narrower_load) { - u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size); - u32 off = insn->off; u8 size_code; if (type == BPF_WRITE) { @@ -5844,12 +5844,23 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { - if (ctx_field_size <= 4) + u8 shift = (off & (size_default - 1)) * 8; + + if (ctx_field_size <= 4) { + if (shift) + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, + insn->dst_reg, + shift); insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, (1 << size * 8) - 1); - else + } else { + if (shift) + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, + insn->dst_reg, + shift); insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, (1 << size * 8) - 1); + } } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); -- cgit From 9cac83a57e99e4692315b7a91a81bab787961d97 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Sep 2018 08:57:48 -0700 Subject: rcu: Stop expedited grace periods from relying on stop-machine The CPU-selection code in sync_rcu_exp_select_cpus() disables preemption to prevent the cpu_online_mask from changing. However, this relies on the stop-machine mechanism in the CPU-hotplug offline code, which is not desirable (it would be good to someday remove the stop-machine mechanism). This commit therefore instead uses the relevant leaf rcu_node structure's ->ffmask, which has a bit set for all CPUs that are fully functional. A given CPU's bit is cleared very early during offline processing by rcutree_offline_cpu() and set very late during online processing by rcutree_online_cpu(). Therefore, if a CPU's bit is set in this mask, and preemption is disabled, we have to be before the synchronize_sched() in the CPU-hotplug offline code, which means that the CPU is guaranteed to be workqueue-ready throughout the duration of the enclosing preempt_disable() region of code. This also has the side-effect of using WORK_CPU_UNBOUND if all the CPUs for this leaf rcu_node structure are offline, which is an acceptable difference in behavior. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8d18c1014e2b..e669ccf3751b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -450,10 +450,12 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) } INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); preempt_disable(); - cpu = cpumask_next(rnp->grplo - 1, cpu_online_mask); + cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi)) + if (unlikely(cpu > rnp->grphi - rnp->grplo)) cpu = WORK_CPU_UNBOUND; + else + cpu += rnp->grplo; queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); preempt_enable(); rnp->exp_need_flush = true; -- cgit From 92a801e5d5b7a893881c1676b15dd246727ccd16 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 5 Nov 2018 14:53:59 +0000 Subject: sched/fair: Mask UTIL_AVG_UNCHANGED usages The _task_util_est() is mainly used to add/remove the task contribution to/from the rq's estimated utilization at task enqueue/dequeue time. In both cases we ensure the UTIL_AVG_UNCHANGED flag is set to keep consistency between enqueue and dequeue time while still being transparent to update_load_avg calls which will eventually reset the flag. Let's move the flag forcing within _task_util_est() itself so that we can simplify calling code by hiding that estimated utilization implementation detail into one of its internal functions. This will affect also the "public" API task_util_est() but we know that the flag will (eventually) impact just on the LSB of the estimated utilization, thus it's certainly acceptable. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Quentin Perret Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20181105145400.935-3-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a1ccf1ddd37a..28ee60cabba1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3604,7 +3604,7 @@ static inline unsigned long _task_util_est(struct task_struct *p) { struct util_est ue = READ_ONCE(p->se.avg.util_est); - return max(ue.ewma, ue.enqueued); + return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED); } static inline unsigned long task_util_est(struct task_struct *p) @@ -3622,7 +3622,7 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, /* Update root cfs_rq's estimated utilization */ enqueued = cfs_rq->avg.util_est.enqueued; - enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED); + enqueued += _task_util_est(p); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); } @@ -3650,8 +3650,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) /* Update root cfs_rq's estimated utilization */ ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, - (_task_util_est(p) | UTIL_AVG_UNCHANGED)); + ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); /* @@ -6292,7 +6291,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) */ if (unlikely(task_on_rq_queued(p) || current == p)) { estimated -= min_t(unsigned int, estimated, - (_task_util_est(p) | UTIL_AVG_UNCHANGED)); + _task_util_est(p)); } util = max(util, estimated); } -- cgit From b5c0ce7bd1848892e2930f481828b6d7750231ed Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 5 Nov 2018 14:54:00 +0000 Subject: sched/fair: Add lsub_positive() and use it consistently The following pattern: var -= min_t(typeof(var), var, val); is used multiple times in fair.c. The existing sub_positive() already captures that pattern, but it also adds an explicit load-store to properly support lockless observations. In other cases the pattern above is used to update local, and/or not concurrently accessed, variables. Let's add a simpler version of sub_positive(), targeted at local variables updates, which gives the same readability benefits at calling sites, without enforcing {READ,WRITE}_ONCE() barriers. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Quentin Perret Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Link: https://lore.kernel.org/lkml/20181031184527.GA3178@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28ee60cabba1..2cac9a469df4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2734,6 +2734,17 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) WRITE_ONCE(*ptr, res); \ } while (0) +/* + * Remove and clamp on negative, from a local variable. + * + * A variant of sub_positive(), which does not use explicit load-store + * and is thus optimized for local variable updates. + */ +#define lsub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + *ptr -= min_t(typeof(*ptr), *ptr, _val); \ +} while (0) + #ifdef CONFIG_SMP static inline void enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -4639,7 +4650,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - cfs_b->runtime -= min(runtime, cfs_b->runtime); + lsub_positive(&cfs_b->runtime, runtime); } /* @@ -4773,7 +4784,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) - cfs_b->runtime -= min(runtime, cfs_b->runtime); + lsub_positive(&cfs_b->runtime, runtime); cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } @@ -6240,7 +6251,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) util = READ_ONCE(cfs_rq->avg.util_avg); /* Discount task's util from CPU's util */ - util -= min_t(unsigned int, util, task_util(p)); + lsub_positive(&util, task_util(p)); /* * Covered cases: @@ -6289,10 +6300,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) * properly fix the execl regression and it helps in further * reducing the chances for the above race. */ - if (unlikely(task_on_rq_queued(p) || current == p)) { - estimated -= min_t(unsigned int, estimated, - _task_util_est(p)); - } + if (unlikely(task_on_rq_queued(p) || current == p)) + lsub_positive(&estimated, _task_util_est(p)); + util = max(util, estimated); } -- cgit From 1da1843f9f0334e2428308945d396ffecc2acfe1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 5 Nov 2018 16:51:55 +0530 Subject: sched/core: Create task_has_idle_policy() helper We already have task_has_rt_policy() and task_has_dl_policy() helpers, create task_has_idle_policy() as well and update sched core to start using it. While at it, use task_has_dl_policy() at one more place. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Acked-by: Daniel Lezcano Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/ce3915d5b490fc81af926a3b6bfb775e7188e005.1541416894.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- kernel/sched/debug.c | 2 +- kernel/sched/fair.c | 10 +++++----- kernel/sched/sched.h | 5 +++++ 4 files changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 02a20ef196a6..5afb868f7339 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -697,7 +697,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) /* * SCHED_IDLE tasks get minimal weight: */ - if (idle_policy(p->policy)) { + if (task_has_idle_policy(p)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; p->se.runnable_weight = load->weight; @@ -4199,7 +4199,7 @@ recheck: * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (idle_policy(p->policy) && !idle_policy(policy)) { + if (task_has_idle_policy(p) && !idle_policy(policy)) { if (!can_nice(p, task_nice(p))) return -EPERM; } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6383aa6a60ca..02bd5f969b21 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -974,7 +974,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, #endif P(policy); P(prio); - if (p->policy == SCHED_DEADLINE) { + if (task_has_dl_policy(p)) { P(dl.runtime); P(dl.deadline); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2cac9a469df4..d1f91e6efe51 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6529,7 +6529,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) return; for_each_sched_entity(se) { @@ -6541,7 +6541,7 @@ static void set_last_buddy(struct sched_entity *se) static void set_next_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) return; for_each_sched_entity(se) { @@ -6599,8 +6599,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* Idle tasks are by definition preempted by non-idle tasks. */ - if (unlikely(curr->policy == SCHED_IDLE) && - likely(p->policy != SCHED_IDLE)) + if (unlikely(task_has_idle_policy(curr)) && + likely(!task_has_idle_policy(p))) goto preempt; /* @@ -7021,7 +7021,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (p->sched_class != &fair_sched_class) return 0; - if (unlikely(p->policy == SCHED_IDLE)) + if (unlikely(task_has_idle_policy(p))) return 0; /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 618577fc9aa8..b7a3147874e3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -176,6 +176,11 @@ static inline bool valid_policy(int policy) rt_policy(policy) || dl_policy(policy); } +static inline int task_has_idle_policy(struct task_struct *p) +{ + return idle_policy(p->policy); +} + static inline int task_has_rt_policy(struct task_struct *p) { return rt_policy(p->policy); -- cgit From ed8885a14433aec04067463493051eaaeef3255f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sat, 10 Nov 2018 15:52:02 +0800 Subject: sched/fair: Make some variables static The variables are local to the source and do not need to be in global scope, so make them static. Signed-off-by: Muchun Song Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181110075202.61172-1-smuchun@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d1f91e6efe51..e30dea59d215 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -38,7 +38,7 @@ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_latency = 6000000ULL; -unsigned int normalized_sysctl_sched_latency = 6000000ULL; +static unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable @@ -58,8 +58,8 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +unsigned int sysctl_sched_min_granularity = 750000ULL; +static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity @@ -81,8 +81,8 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; * * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -116,7 +116,7 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; * * (default: ~20%) */ -unsigned int capacity_margin = 1280; +static unsigned int capacity_margin = 1280; static inline void update_load_add(struct load_weight *lw, unsigned long inc) { -- cgit From 3e184501083c38fa091f640acb13af17a21fd228 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 6 Nov 2018 11:12:57 +0530 Subject: sched/core: Clean up the #ifdef block in add_nr_running() There is no point in keeping the conditional statement of the #if block outside of the #ifdef block, while all of its body is contained within the #ifdef block. Move the conditional statement under the #ifdef block as well. Signed-off-by: Viresh Kumar Cc: Daniel Lezcano Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/78cbd78a615d6f9fdcd3327f1ead68470f92593e.1541482935.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b7a3147874e3..e0e052a50fcd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1801,12 +1801,12 @@ static inline void add_nr_running(struct rq *rq, unsigned count) rq->nr_running = prev_nr + count; - if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP + if (prev_nr < 2 && rq->nr_running >= 2) { if (!READ_ONCE(rq->rd->overload)) WRITE_ONCE(rq->rd->overload, 1); -#endif } +#endif sched_update_tick_dependency(rq); } -- cgit From 9f16d2e6241b2fc664523f17d74adda7489f496b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 17 Oct 2018 12:14:52 +0200 Subject: audit_tree: Remove mark->lock locking Currently, audit_tree code uses mark->lock to protect against detaching of mark from an inode. In most places it however also uses mark->group->mark_mutex (as we need to atomically replace attached marks) and this provides protection against mark detaching as well. So just remove protection with mark->lock from audit tree code and replace it with mark->group->mark_mutex protection in all the places. It simplifies the code and gets rid of some ugly catches like calling fsnotify_add_mark_locked() with mark->lock held (which cannot sleep only because we hold a reference to another mark attached to the same inode). Reviewed-by: Richard Guy Briggs Signed-off-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_tree.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index ea43181cde4a..1b55b1026a36 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -193,7 +193,7 @@ static inline struct list_head *chunk_hash(unsigned long key) return chunk_hash_heads + n % HASH_SIZE; } -/* hash_lock & entry->lock is held by caller */ +/* hash_lock & entry->group->mark_mutex is held by caller */ static void insert_hash(struct audit_chunk *chunk) { unsigned long key = chunk_to_key(chunk); @@ -256,13 +256,11 @@ static void untag_chunk(struct node *p) new = alloc_chunk(size); mutex_lock(&entry->group->mark_mutex); - spin_lock(&entry->lock); /* * mark_mutex protects mark from getting detached and thus also from * mark->connector->obj getting NULL. */ if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { - spin_unlock(&entry->lock); mutex_unlock(&entry->group->mark_mutex); if (new) fsnotify_put_mark(&new->mark); @@ -280,7 +278,6 @@ static void untag_chunk(struct node *p) list_del_init(&p->list); list_del_rcu(&chunk->hash); spin_unlock(&hash_lock); - spin_unlock(&entry->lock); mutex_unlock(&entry->group->mark_mutex); fsnotify_destroy_mark(entry, audit_tree_group); goto out; @@ -323,7 +320,6 @@ static void untag_chunk(struct node *p) list_for_each_entry(owner, &new->trees, same_root) owner->root = new; spin_unlock(&hash_lock); - spin_unlock(&entry->lock); mutex_unlock(&entry->group->mark_mutex); fsnotify_destroy_mark(entry, audit_tree_group); fsnotify_put_mark(&new->mark); /* drop initial reference */ @@ -340,7 +336,6 @@ Fallback: p->owner = NULL; put_tree(owner); spin_unlock(&hash_lock); - spin_unlock(&entry->lock); mutex_unlock(&entry->group->mark_mutex); out: fsnotify_put_mark(entry); @@ -360,12 +355,12 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) return -ENOSPC; } - spin_lock(&entry->lock); + mutex_lock(&entry->group->mark_mutex); spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; - spin_unlock(&entry->lock); + mutex_unlock(&entry->group->mark_mutex); fsnotify_destroy_mark(entry, audit_tree_group); fsnotify_put_mark(entry); return 0; @@ -380,7 +375,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) } insert_hash(chunk); spin_unlock(&hash_lock); - spin_unlock(&entry->lock); + mutex_unlock(&entry->group->mark_mutex); fsnotify_put_mark(entry); /* drop initial reference */ return 0; } @@ -421,14 +416,12 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk_entry = &chunk->mark; mutex_lock(&old_entry->group->mark_mutex); - spin_lock(&old_entry->lock); /* * mark_mutex protects mark from getting detached and thus also from * mark->connector->obj getting NULL. */ if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { /* old_entry is being shot, lets just lie */ - spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(old_entry); fsnotify_put_mark(&chunk->mark); @@ -437,23 +430,16 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->obj, FSNOTIFY_OBJ_TYPE_INODE, 1)) { - spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); return -ENOSPC; } - /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */ - spin_lock(&chunk_entry->lock); spin_lock(&hash_lock); - - /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */ if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; - spin_unlock(&chunk_entry->lock); - spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_destroy_mark(chunk_entry, audit_tree_group); @@ -485,8 +471,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) list_add(&tree->same_root, &chunk->trees); } spin_unlock(&hash_lock); - spin_unlock(&chunk_entry->lock); - spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_destroy_mark(old_entry, audit_tree_group); fsnotify_put_mark(chunk_entry); /* drop initial reference */ -- cgit From a5789b07b35aa56569dff762bfc063303a9ccb95 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:54:48 -0500 Subject: audit: Fix possible spurious -ENOSPC error When an inode is tagged with a tree, tag_chunk() checks whether there is audit_tree_group mark attached to the inode and adds one if not. However nothing protects another tag_chunk() to add the mark between we've checked and try to add the fsnotify mark thus resulting in an error from fsnotify_add_mark() and consequently an ENOSPC error from tag_chunk(). Fix the problem by holding mark_mutex over the whole check-insert code sequence. Reviewed-by: Richard Guy Briggs Signed-off-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_tree.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 1b55b1026a36..8a74b468b666 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -342,25 +342,29 @@ out: spin_lock(&hash_lock); } +/* Call with group->mark_mutex held, releases it */ static int create_chunk(struct inode *inode, struct audit_tree *tree) { struct fsnotify_mark *entry; struct audit_chunk *chunk = alloc_chunk(1); - if (!chunk) + + if (!chunk) { + mutex_unlock(&audit_tree_group->mark_mutex); return -ENOMEM; + } entry = &chunk->mark; - if (fsnotify_add_inode_mark(entry, inode, 0)) { + if (fsnotify_add_inode_mark_locked(entry, inode, 0)) { + mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_put_mark(entry); return -ENOSPC; } - mutex_lock(&entry->group->mark_mutex); spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; - mutex_unlock(&entry->group->mark_mutex); + mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_destroy_mark(entry, audit_tree_group); fsnotify_put_mark(entry); return 0; @@ -375,7 +379,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) } insert_hash(chunk); spin_unlock(&hash_lock); - mutex_unlock(&entry->group->mark_mutex); + mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_put_mark(entry); /* drop initial reference */ return 0; } @@ -389,6 +393,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) struct node *p; int n; + mutex_lock(&audit_tree_group->mark_mutex); old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group); if (!old_entry) @@ -401,6 +406,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); + mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_put_mark(old_entry); return 0; } @@ -409,20 +415,20 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk = alloc_chunk(old->count + 1); if (!chunk) { + mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_put_mark(old_entry); return -ENOMEM; } chunk_entry = &chunk->mark; - mutex_lock(&old_entry->group->mark_mutex); /* * mark_mutex protects mark from getting detached and thus also from * mark->connector->obj getting NULL. */ if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { /* old_entry is being shot, lets just lie */ - mutex_unlock(&old_entry->group->mark_mutex); + mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_put_mark(old_entry); fsnotify_put_mark(&chunk->mark); return -ENOENT; @@ -430,7 +436,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->obj, FSNOTIFY_OBJ_TYPE_INODE, 1)) { - mutex_unlock(&old_entry->group->mark_mutex); + mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); return -ENOSPC; @@ -440,7 +446,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; - mutex_unlock(&old_entry->group->mark_mutex); + mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_destroy_mark(chunk_entry, audit_tree_group); @@ -471,7 +477,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) list_add(&tree->same_root, &chunk->trees); } spin_unlock(&hash_lock); - mutex_unlock(&old_entry->group->mark_mutex); + mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_destroy_mark(old_entry, audit_tree_group); fsnotify_put_mark(chunk_entry); /* drop initial reference */ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ -- cgit From b1e4603b92d8aef8776e5673dc13fedb68d32ea4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:54:48 -0500 Subject: audit: Fix possible tagging failures Audit tree code is replacing marks attached to inodes in non-atomic way. Thus fsnotify_find_mark() in tag_chunk() may find a mark that belongs to a chunk that is no longer valid one and will soon be destroyed. Tags added to such chunk will be simply lost. Fix the problem by making sure old mark is marked as going away (through fsnotify_detach_mark()) before dropping mark_mutex and thus in an atomic way wrt tag_chunk(). Note that this does not fix the problem completely as if tag_chunk() finds a mark that is going away, it fails with -ENOENT. But at least the failure is not silent and currently there's no way to search for another fsnotify mark attached to the inode. We'll fix this problem in later patch. Reviewed-by: Richard Guy Briggs Signed-off-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_tree.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 8a74b468b666..c194dbd53753 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -278,8 +278,9 @@ static void untag_chunk(struct node *p) list_del_init(&p->list); list_del_rcu(&chunk->hash); spin_unlock(&hash_lock); + fsnotify_detach_mark(entry); mutex_unlock(&entry->group->mark_mutex); - fsnotify_destroy_mark(entry, audit_tree_group); + fsnotify_free_mark(entry); goto out; } @@ -320,8 +321,9 @@ static void untag_chunk(struct node *p) list_for_each_entry(owner, &new->trees, same_root) owner->root = new; spin_unlock(&hash_lock); + fsnotify_detach_mark(entry); mutex_unlock(&entry->group->mark_mutex); - fsnotify_destroy_mark(entry, audit_tree_group); + fsnotify_free_mark(entry); fsnotify_put_mark(&new->mark); /* drop initial reference */ goto out; @@ -364,8 +366,9 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; + fsnotify_detach_mark(entry); mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_destroy_mark(entry, audit_tree_group); + fsnotify_free_mark(entry); fsnotify_put_mark(entry); return 0; } @@ -446,10 +449,9 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; + fsnotify_detach_mark(chunk_entry); mutex_unlock(&audit_tree_group->mark_mutex); - - fsnotify_destroy_mark(chunk_entry, audit_tree_group); - + fsnotify_free_mark(chunk_entry); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); return 0; @@ -477,8 +479,9 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) list_add(&tree->same_root, &chunk->trees); } spin_unlock(&hash_lock); + fsnotify_detach_mark(old_entry); mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_destroy_mark(old_entry, audit_tree_group); + fsnotify_free_mark(old_entry); fsnotify_put_mark(chunk_entry); /* drop initial reference */ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ return 0; -- cgit From 8d20d6e9301d7b3777d66d47dd5b89acd645cd39 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:54:48 -0500 Subject: audit: Embed key into chunk Currently chunk hash key (which is in fact pointer to the inode) is derived as chunk->mark.conn->obj. It is tricky to make this dereference reliable for hash table lookups only under RCU as mark can get detached from the connector and connector gets freed independently of the running lookup. Thus there is a possible use after free / NULL ptr dereference issue: CPU1 CPU2 untag_chunk() ... audit_tree_lookup() list_for_each_entry_rcu(p, list, hash) { list_del_rcu(&chunk->hash); fsnotify_destroy_mark(entry); fsnotify_put_mark(entry) chunk_to_key(p) if (!chunk->mark.connector) ... hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { inode = fsnotify_detach_connector_from_object(conn); mark->connector = NULL; ... frees connector from workqueue chunk->mark.connector->obj This race is probably impossible to hit in practice as the race window on CPU1 is very narrow and CPU2 has a lot of code to execute. Still it's better to have this fixed. Since the inode the chunk is attached to is constant during chunk's lifetime it is easy to cache the key in the chunk itself and thus avoid these issues. Reviewed-by: Richard Guy Briggs Signed-off-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_tree.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index c194dbd53753..bac5dd90c629 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -24,6 +24,7 @@ struct audit_tree { struct audit_chunk { struct list_head hash; + unsigned long key; struct fsnotify_mark mark; struct list_head trees; /* with root here */ int dead; @@ -172,21 +173,6 @@ static unsigned long inode_to_key(const struct inode *inode) return (unsigned long)&inode->i_fsnotify_marks; } -/* - * Function to return search key in our hash from chunk. Key 0 is special and - * should never be present in the hash. - */ -static unsigned long chunk_to_key(struct audit_chunk *chunk) -{ - /* - * We have a reference to the mark so it should be attached to a - * connector. - */ - if (WARN_ON_ONCE(!chunk->mark.connector)) - return 0; - return (unsigned long)chunk->mark.connector->obj; -} - static inline struct list_head *chunk_hash(unsigned long key) { unsigned long n = key / L1_CACHE_BYTES; @@ -196,12 +182,12 @@ static inline struct list_head *chunk_hash(unsigned long key) /* hash_lock & entry->group->mark_mutex is held by caller */ static void insert_hash(struct audit_chunk *chunk) { - unsigned long key = chunk_to_key(chunk); struct list_head *list; if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED)) return; - list = chunk_hash(key); + WARN_ON_ONCE(!chunk->key); + list = chunk_hash(chunk->key); list_add_rcu(&chunk->hash, list); } @@ -213,7 +199,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) struct audit_chunk *p; list_for_each_entry_rcu(p, list, hash) { - if (chunk_to_key(p) == key) { + if (p->key == key) { atomic_long_inc(&p->refs); return p; } @@ -295,6 +281,7 @@ static void untag_chunk(struct node *p) chunk->dead = 1; spin_lock(&hash_lock); + new->key = chunk->key; list_replace_init(&chunk->trees, &new->trees); if (owner->root == chunk) { list_del_init(&owner->same_root); @@ -380,6 +367,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) tree->root = chunk; list_add(&tree->same_root, &chunk->trees); } + chunk->key = inode_to_key(inode); insert_hash(chunk); spin_unlock(&hash_lock); mutex_unlock(&audit_tree_group->mark_mutex); @@ -456,6 +444,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) fsnotify_put_mark(old_entry); return 0; } + chunk->key = old->key; list_replace_init(&old->trees, &chunk->trees); for (n = 0, p = chunk->owners; n < old->count; n++, p++) { struct audit_tree *s = old->owners[n].owner; @@ -654,7 +643,7 @@ void audit_trim_trees(void) /* this could be NULL if the watch is dying else where... */ node->index |= 1U<<31; if (iterate_mounts(compare_root, - (void *)chunk_to_key(chunk), + (void *)(chunk->key), root_mnt)) node->index &= ~(1U<<31); } -- cgit From 1635e5722350597b6a149bdb131358fcd7e34906 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:54:48 -0500 Subject: audit: Make hash table insertion safe against concurrent lookups Currently, the audit tree code does not make sure that when a chunk is inserted into the hash table, it is fully initialized. So in theory a user of RCU lookup could see uninitialized structure in the hash table and crash. Add appropriate barriers between initialization of the structure and its insertion into hash table. Reviewed-by: Richard Guy Briggs Signed-off-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_tree.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index bac5dd90c629..307749d6773c 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -186,6 +186,12 @@ static void insert_hash(struct audit_chunk *chunk) if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED)) return; + /* + * Make sure chunk is fully initialized before making it visible in the + * hash. Pairs with a data dependency barrier in READ_ONCE() in + * audit_tree_lookup(). + */ + smp_wmb(); WARN_ON_ONCE(!chunk->key); list = chunk_hash(chunk->key); list_add_rcu(&chunk->hash, list); @@ -199,7 +205,11 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) struct audit_chunk *p; list_for_each_entry_rcu(p, list, hash) { - if (p->key == key) { + /* + * We use a data dependency barrier in READ_ONCE() to make sure + * the chunk we see is fully initialized. + */ + if (READ_ONCE(p->key) == key) { atomic_long_inc(&p->refs); return p; } @@ -304,9 +314,15 @@ static void untag_chunk(struct node *p) list_replace_init(&chunk->owners[j].list, &new->owners[i].list); } - list_replace_rcu(&chunk->hash, &new->hash); list_for_each_entry(owner, &new->trees, same_root) owner->root = new; + /* + * Make sure chunk is fully initialized before making it visible in the + * hash. Pairs with a data dependency barrier in READ_ONCE() in + * audit_tree_lookup(). + */ + smp_wmb(); + list_replace_rcu(&chunk->hash, &new->hash); spin_unlock(&hash_lock); fsnotify_detach_mark(entry); mutex_unlock(&entry->group->mark_mutex); @@ -368,6 +384,10 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) list_add(&tree->same_root, &chunk->trees); } chunk->key = inode_to_key(inode); + /* + * Inserting into the hash table has to go last as once we do that RCU + * readers can see the chunk. + */ insert_hash(chunk); spin_unlock(&hash_lock); mutex_unlock(&audit_tree_group->mark_mutex); @@ -459,7 +479,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) p->owner = tree; get_tree(tree); list_add(&p->list, &tree->chunks); - list_replace_rcu(&old->hash, &chunk->hash); list_for_each_entry(owner, &chunk->trees, same_root) owner->root = chunk; old->dead = 1; @@ -467,6 +486,13 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) tree->root = chunk; list_add(&tree->same_root, &chunk->trees); } + /* + * Make sure chunk is fully initialized before making it visible in the + * hash. Pairs with a data dependency barrier in READ_ONCE() in + * audit_tree_lookup(). + */ + smp_wmb(); + list_replace_rcu(&old->hash, &chunk->hash); spin_unlock(&hash_lock); fsnotify_detach_mark(old_entry); mutex_unlock(&audit_tree_group->mark_mutex); -- cgit From d31b326d3ce7b1ff2ec36470dfcccb14a6c3e04e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:54:48 -0500 Subject: audit: Factor out chunk replacement code Chunk replacement code is very similar for the cases where we grow or shrink chunk. Factor the code out into a common helper function. Signed-off-by: Jan Kara Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit_tree.c | 86 +++++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 307749d6773c..d8f6cfa0005b 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -235,6 +235,38 @@ static struct audit_chunk *find_chunk(struct node *p) return container_of(p, struct audit_chunk, owners[0]); } +static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old, + struct node *skip) +{ + struct audit_tree *owner; + int i, j; + + new->key = old->key; + list_splice_init(&old->trees, &new->trees); + list_for_each_entry(owner, &new->trees, same_root) + owner->root = new; + for (i = j = 0; j < old->count; i++, j++) { + if (&old->owners[j] == skip) { + i--; + continue; + } + owner = old->owners[j].owner; + new->owners[i].owner = owner; + new->owners[i].index = old->owners[j].index - j + i; + if (!owner) /* result of earlier fallback */ + continue; + get_tree(owner); + list_replace_init(&old->owners[j].list, &new->owners[i].list); + } + /* + * Make sure chunk is fully initialized before making it visible in the + * hash. Pairs with a data dependency barrier in READ_ONCE() in + * audit_tree_lookup(). + */ + smp_wmb(); + list_replace_rcu(&old->hash, &new->hash); +} + static void untag_chunk(struct node *p) { struct audit_chunk *chunk = find_chunk(p); @@ -242,7 +274,6 @@ static void untag_chunk(struct node *p) struct audit_chunk *new = NULL; struct audit_tree *owner; int size = chunk->count - 1; - int i, j; fsnotify_get_mark(entry); @@ -291,38 +322,16 @@ static void untag_chunk(struct node *p) chunk->dead = 1; spin_lock(&hash_lock); - new->key = chunk->key; - list_replace_init(&chunk->trees, &new->trees); if (owner->root == chunk) { list_del_init(&owner->same_root); owner->root = NULL; } - - for (i = j = 0; j <= size; i++, j++) { - struct audit_tree *s; - if (&chunk->owners[j] == p) { - list_del_init(&p->list); - i--; - continue; - } - s = chunk->owners[j].owner; - new->owners[i].owner = s; - new->owners[i].index = chunk->owners[j].index - j + i; - if (!s) /* result of earlier fallback */ - continue; - get_tree(s); - list_replace_init(&chunk->owners[j].list, &new->owners[i].list); - } - - list_for_each_entry(owner, &new->trees, same_root) - owner->root = new; + list_del_init(&p->list); /* - * Make sure chunk is fully initialized before making it visible in the - * hash. Pairs with a data dependency barrier in READ_ONCE() in - * audit_tree_lookup(). + * This has to go last when updating chunk as once replace_chunk() is + * called, new RCU readers can see the new chunk. */ - smp_wmb(); - list_replace_rcu(&chunk->hash, &new->hash); + replace_chunk(new, chunk, p); spin_unlock(&hash_lock); fsnotify_detach_mark(entry); mutex_unlock(&entry->group->mark_mutex); @@ -399,7 +408,6 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) static int tag_chunk(struct inode *inode, struct audit_tree *tree) { struct fsnotify_mark *old_entry, *chunk_entry; - struct audit_tree *owner; struct audit_chunk *chunk, *old; struct node *p; int n; @@ -464,35 +472,21 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) fsnotify_put_mark(old_entry); return 0; } - chunk->key = old->key; - list_replace_init(&old->trees, &chunk->trees); - for (n = 0, p = chunk->owners; n < old->count; n++, p++) { - struct audit_tree *s = old->owners[n].owner; - p->owner = s; - p->index = old->owners[n].index; - if (!s) /* result of fallback in untag */ - continue; - get_tree(s); - list_replace_init(&old->owners[n].list, &p->list); - } + p = &chunk->owners[chunk->count - 1]; p->index = (chunk->count - 1) | (1U<<31); p->owner = tree; get_tree(tree); list_add(&p->list, &tree->chunks); - list_for_each_entry(owner, &chunk->trees, same_root) - owner->root = chunk; old->dead = 1; if (!tree->root) { tree->root = chunk; list_add(&tree->same_root, &chunk->trees); } /* - * Make sure chunk is fully initialized before making it visible in the - * hash. Pairs with a data dependency barrier in READ_ONCE() in - * audit_tree_lookup(). + * This has to go last when updating chunk as once replace_chunk() is + * called, new RCU readers can see the new chunk. */ - smp_wmb(); - list_replace_rcu(&old->hash, &chunk->hash); + replace_chunk(chunk, old, NULL); spin_unlock(&hash_lock); fsnotify_detach_mark(old_entry); mutex_unlock(&audit_tree_group->mark_mutex); -- cgit From 8cd0feb5234ccda3c15de35b40c8010a406dfc03 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:54:48 -0500 Subject: audit: Remove pointless check in insert_hash() The audit_tree_group->mark_mutex is held all the time while we create the fsnotify mark, add it to the inode, and insert chunk into the hash. Hence mark cannot get detached during this time and so the check whether the mark is attached in insert_hash() is pointless. Reviewed-by: Richard Guy Briggs Signed-off-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index d8f6cfa0005b..d150514ff15e 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -184,8 +184,6 @@ static void insert_hash(struct audit_chunk *chunk) { struct list_head *list; - if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED)) - return; /* * Make sure chunk is fully initialized before making it visible in the * hash. Pairs with a data dependency barrier in READ_ONCE() in -- cgit From a8375713fb1ff28ec718b601895958f1db775774 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:54:49 -0500 Subject: audit: Provide helper for dropping mark's chunk reference Provide a helper function audit_mark_put_chunk() for dropping mark's reference (which has to happen only after RCU grace period expires). Currently that happens only from a single place but in later patches we introduce more callers. Reviewed-by: Richard Guy Briggs Signed-off-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_tree.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index d150514ff15e..35c031ebcc12 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -132,10 +132,20 @@ static void __put_chunk(struct rcu_head *rcu) audit_put_chunk(chunk); } +/* + * Drop reference to the chunk that was held by the mark. This is the reference + * that gets dropped after we've removed the chunk from the hash table and we + * use it to make sure chunk cannot be freed before RCU grace period expires. + */ +static void audit_mark_put_chunk(struct audit_chunk *chunk) +{ + call_rcu(&chunk->head, __put_chunk); +} + static void audit_tree_destroy_watch(struct fsnotify_mark *entry) { struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); - call_rcu(&chunk->head, __put_chunk); + audit_mark_put_chunk(chunk); } static struct audit_chunk *alloc_chunk(int count) -- cgit From 5f5161300d7bd530e062428ac694824832960cf5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:54:49 -0500 Subject: audit: Allocate fsnotify mark independently of chunk Allocate fsnotify mark independently instead of embedding it inside chunk. This will allow us to just replace chunk attached to mark when growing / shrinking chunk instead of replacing mark attached to inode which is a more complex operation. Reviewed-by: Richard Guy Briggs Signed-off-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_tree.c | 64 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 35c031ebcc12..c98ab2d68a1c 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -25,7 +25,7 @@ struct audit_tree { struct audit_chunk { struct list_head hash; unsigned long key; - struct fsnotify_mark mark; + struct fsnotify_mark *mark; struct list_head trees; /* with root here */ int dead; int count; @@ -38,6 +38,11 @@ struct audit_chunk { } owners[]; }; +struct audit_tree_mark { + struct fsnotify_mark mark; + struct audit_chunk *chunk; +}; + static LIST_HEAD(tree_list); static LIST_HEAD(prune_list); static struct task_struct *prune_thread; @@ -73,6 +78,7 @@ static struct task_struct *prune_thread; */ static struct fsnotify_group *audit_tree_group; +static struct kmem_cache *audit_tree_mark_cachep __read_mostly; static struct audit_tree *alloc_tree(const char *s) { @@ -142,10 +148,33 @@ static void audit_mark_put_chunk(struct audit_chunk *chunk) call_rcu(&chunk->head, __put_chunk); } +static inline struct audit_tree_mark *audit_mark(struct fsnotify_mark *entry) +{ + return container_of(entry, struct audit_tree_mark, mark); +} + +static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark) +{ + return audit_mark(mark)->chunk; +} + static void audit_tree_destroy_watch(struct fsnotify_mark *entry) { - struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); + struct audit_chunk *chunk = mark_chunk(entry); audit_mark_put_chunk(chunk); + kmem_cache_free(audit_tree_mark_cachep, audit_mark(entry)); +} + +static struct fsnotify_mark *alloc_mark(void) +{ + struct audit_tree_mark *amark; + + amark = kmem_cache_zalloc(audit_tree_mark_cachep, GFP_KERNEL); + if (!amark) + return NULL; + fsnotify_init_mark(&amark->mark, audit_tree_group); + amark->mark.mask = FS_IN_IGNORED; + return &amark->mark; } static struct audit_chunk *alloc_chunk(int count) @@ -159,6 +188,13 @@ static struct audit_chunk *alloc_chunk(int count) if (!chunk) return NULL; + chunk->mark = alloc_mark(); + if (!chunk->mark) { + kfree(chunk); + return NULL; + } + audit_mark(chunk->mark)->chunk = chunk; + INIT_LIST_HEAD(&chunk->hash); INIT_LIST_HEAD(&chunk->trees); chunk->count = count; @@ -167,8 +203,6 @@ static struct audit_chunk *alloc_chunk(int count) INIT_LIST_HEAD(&chunk->owners[i].list); chunk->owners[i].index = i; } - fsnotify_init_mark(&chunk->mark, audit_tree_group); - chunk->mark.mask = FS_IN_IGNORED; return chunk; } @@ -278,7 +312,7 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old, static void untag_chunk(struct node *p) { struct audit_chunk *chunk = find_chunk(p); - struct fsnotify_mark *entry = &chunk->mark; + struct fsnotify_mark *entry = chunk->mark; struct audit_chunk *new = NULL; struct audit_tree *owner; int size = chunk->count - 1; @@ -298,7 +332,7 @@ static void untag_chunk(struct node *p) if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { mutex_unlock(&entry->group->mark_mutex); if (new) - fsnotify_put_mark(&new->mark); + fsnotify_put_mark(new->mark); goto out; } @@ -322,9 +356,9 @@ static void untag_chunk(struct node *p) if (!new) goto Fallback; - if (fsnotify_add_mark_locked(&new->mark, entry->connector->obj, + if (fsnotify_add_mark_locked(new->mark, entry->connector->obj, FSNOTIFY_OBJ_TYPE_INODE, 1)) { - fsnotify_put_mark(&new->mark); + fsnotify_put_mark(new->mark); goto Fallback; } @@ -344,7 +378,7 @@ static void untag_chunk(struct node *p) fsnotify_detach_mark(entry); mutex_unlock(&entry->group->mark_mutex); fsnotify_free_mark(entry); - fsnotify_put_mark(&new->mark); /* drop initial reference */ + fsnotify_put_mark(new->mark); /* drop initial reference */ goto out; Fallback: @@ -375,7 +409,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) return -ENOMEM; } - entry = &chunk->mark; + entry = chunk->mark; if (fsnotify_add_inode_mark_locked(entry, inode, 0)) { mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_put_mark(entry); @@ -426,7 +460,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) if (!old_entry) return create_chunk(inode, tree); - old = container_of(old_entry, struct audit_chunk, mark); + old = mark_chunk(old_entry); /* are we already there? */ spin_lock(&hash_lock); @@ -447,7 +481,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return -ENOMEM; } - chunk_entry = &chunk->mark; + chunk_entry = chunk->mark; /* * mark_mutex protects mark from getting detached and thus also from @@ -457,7 +491,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) /* old_entry is being shot, lets just lie */ mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_put_mark(old_entry); - fsnotify_put_mark(&chunk->mark); + fsnotify_put_mark(chunk->mark); return -ENOENT; } @@ -1011,7 +1045,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) { - struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); + struct audit_chunk *chunk = mark_chunk(entry); evict_chunk(chunk); @@ -1032,6 +1066,8 @@ static int __init audit_tree_init(void) { int i; + audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC); + audit_tree_group = fsnotify_alloc_group(&audit_tree_ops); if (IS_ERR(audit_tree_group)) audit_panic("cannot initialize fsnotify group for rectree watches"); -- cgit From 49a4ee7d98dbe34cfed90b930664c8a9fa73b24c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:54:49 -0500 Subject: audit: Guarantee forward progress of chunk untagging When removing chunk from a tree, we do shrink the chunk. This can fail for various reasons (due to races, ENOMEM, etc.) and in some cases we just bail from untag_chunk() relying on someone else to cleanup. Although this currently works, later we will need to add new failure situation which would break. Also this simplifies the code and will allow us to make locking around untag_chunk() less awkward. Signed-off-by: Jan Kara Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit_tree.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index c98ab2d68a1c..ca2b6baff7aa 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -309,16 +309,28 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old, list_replace_rcu(&old->hash, &new->hash); } +static void remove_chunk_node(struct audit_chunk *chunk, struct node *p) +{ + struct audit_tree *owner = p->owner; + + if (owner->root == chunk) { + list_del_init(&owner->same_root); + owner->root = NULL; + } + list_del_init(&p->list); + p->owner = NULL; + put_tree(owner); +} + static void untag_chunk(struct node *p) { struct audit_chunk *chunk = find_chunk(p); struct fsnotify_mark *entry = chunk->mark; struct audit_chunk *new = NULL; - struct audit_tree *owner; int size = chunk->count - 1; + remove_chunk_node(chunk, p); fsnotify_get_mark(entry); - spin_unlock(&hash_lock); if (size) @@ -336,15 +348,10 @@ static void untag_chunk(struct node *p) goto out; } - owner = p->owner; - if (!size) { chunk->dead = 1; spin_lock(&hash_lock); list_del_init(&chunk->trees); - if (owner->root == chunk) - owner->root = NULL; - list_del_init(&p->list); list_del_rcu(&chunk->hash); spin_unlock(&hash_lock); fsnotify_detach_mark(entry); @@ -354,21 +361,16 @@ static void untag_chunk(struct node *p) } if (!new) - goto Fallback; + goto out_mutex; if (fsnotify_add_mark_locked(new->mark, entry->connector->obj, FSNOTIFY_OBJ_TYPE_INODE, 1)) { fsnotify_put_mark(new->mark); - goto Fallback; + goto out_mutex; } chunk->dead = 1; spin_lock(&hash_lock); - if (owner->root == chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - list_del_init(&p->list); /* * This has to go last when updating chunk as once replace_chunk() is * called, new RCU readers can see the new chunk. @@ -381,17 +383,7 @@ static void untag_chunk(struct node *p) fsnotify_put_mark(new->mark); /* drop initial reference */ goto out; -Fallback: - // do the best we can - spin_lock(&hash_lock); - if (owner->root == chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - list_del_init(&p->list); - p->owner = NULL; - put_tree(owner); - spin_unlock(&hash_lock); +out_mutex: mutex_unlock(&entry->group->mark_mutex); out: fsnotify_put_mark(entry); -- cgit From c22fcde775dcc9f46d73d694061441efdc7bdaad Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:54:49 -0500 Subject: audit: Drop all unused chunk nodes during deletion When deleting chunk from a tree, drop all unused nodes in a chunk instead of just the one used by the tree. This gets rid of possibly lingering unused nodes (created due to fallback path in untag_chunk()) and also removes some special cases and will allow us to simplify locking in untag_chunk(). Signed-off-by: Jan Kara Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit_tree.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index ca2b6baff7aa..145e8c92dd31 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -277,8 +277,7 @@ static struct audit_chunk *find_chunk(struct node *p) return container_of(p, struct audit_chunk, owners[0]); } -static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old, - struct node *skip) +static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old) { struct audit_tree *owner; int i, j; @@ -288,7 +287,7 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old, list_for_each_entry(owner, &new->trees, same_root) owner->root = new; for (i = j = 0; j < old->count; i++, j++) { - if (&old->owners[j] == skip) { + if (!old->owners[j].owner) { i--; continue; } @@ -322,20 +321,28 @@ static void remove_chunk_node(struct audit_chunk *chunk, struct node *p) put_tree(owner); } +static int chunk_count_trees(struct audit_chunk *chunk) +{ + int i; + int ret = 0; + + for (i = 0; i < chunk->count; i++) + if (chunk->owners[i].owner) + ret++; + return ret; +} + static void untag_chunk(struct node *p) { struct audit_chunk *chunk = find_chunk(p); struct fsnotify_mark *entry = chunk->mark; struct audit_chunk *new = NULL; - int size = chunk->count - 1; + int size; remove_chunk_node(chunk, p); fsnotify_get_mark(entry); spin_unlock(&hash_lock); - if (size) - new = alloc_chunk(size); - mutex_lock(&entry->group->mark_mutex); /* * mark_mutex protects mark from getting detached and thus also from @@ -348,6 +355,7 @@ static void untag_chunk(struct node *p) goto out; } + size = chunk_count_trees(chunk); if (!size) { chunk->dead = 1; spin_lock(&hash_lock); @@ -360,6 +368,7 @@ static void untag_chunk(struct node *p) goto out; } + new = alloc_chunk(size); if (!new) goto out_mutex; @@ -375,7 +384,7 @@ static void untag_chunk(struct node *p) * This has to go last when updating chunk as once replace_chunk() is * called, new RCU readers can see the new chunk. */ - replace_chunk(new, chunk, p); + replace_chunk(new, chunk); spin_unlock(&hash_lock); fsnotify_detach_mark(entry); mutex_unlock(&entry->group->mark_mutex); @@ -520,7 +529,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) * This has to go last when updating chunk as once replace_chunk() is * called, new RCU readers can see the new chunk. */ - replace_chunk(chunk, old, NULL); + replace_chunk(chunk, old); spin_unlock(&hash_lock); fsnotify_detach_mark(old_entry); mutex_unlock(&audit_tree_group->mark_mutex); -- cgit From 8432c70062978d9a57bde6715496d585ec520c3e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:54:56 -0500 Subject: audit: Simplify locking around untag_chunk() untag_chunk() has to be called with hash_lock, it drops it and reacquires it when returning. The unlocking of hash_lock is thus hidden from the callers of untag_chunk() with is rather error prone. Reorganize the code so that untag_chunk() is called without hash_lock, only with mark reference preventing the chunk from going away. Since this requires some more code in the caller of untag_chunk() to assure forward progress, factor out loop pruning tree from all chunks into a common helper function. Signed-off-by: Jan Kara Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit_tree.c | 77 ++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 145e8c92dd31..82b27da7031c 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -332,28 +332,18 @@ static int chunk_count_trees(struct audit_chunk *chunk) return ret; } -static void untag_chunk(struct node *p) +static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry) { - struct audit_chunk *chunk = find_chunk(p); - struct fsnotify_mark *entry = chunk->mark; - struct audit_chunk *new = NULL; + struct audit_chunk *new; int size; - remove_chunk_node(chunk, p); - fsnotify_get_mark(entry); - spin_unlock(&hash_lock); - - mutex_lock(&entry->group->mark_mutex); + mutex_lock(&audit_tree_group->mark_mutex); /* * mark_mutex protects mark from getting detached and thus also from * mark->connector->obj getting NULL. */ - if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { - mutex_unlock(&entry->group->mark_mutex); - if (new) - fsnotify_put_mark(new->mark); - goto out; - } + if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) + goto out_mutex; size = chunk_count_trees(chunk); if (!size) { @@ -363,9 +353,9 @@ static void untag_chunk(struct node *p) list_del_rcu(&chunk->hash); spin_unlock(&hash_lock); fsnotify_detach_mark(entry); - mutex_unlock(&entry->group->mark_mutex); + mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_free_mark(entry); - goto out; + return; } new = alloc_chunk(size); @@ -387,16 +377,13 @@ static void untag_chunk(struct node *p) replace_chunk(new, chunk); spin_unlock(&hash_lock); fsnotify_detach_mark(entry); - mutex_unlock(&entry->group->mark_mutex); + mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_free_mark(entry); fsnotify_put_mark(new->mark); /* drop initial reference */ - goto out; + return; out_mutex: - mutex_unlock(&entry->group->mark_mutex); -out: - fsnotify_put_mark(entry); - spin_lock(&hash_lock); + mutex_unlock(&audit_tree_group->mark_mutex); } /* Call with group->mark_mutex held, releases it */ @@ -579,22 +566,45 @@ static void kill_rules(struct audit_tree *tree) } /* - * finish killing struct audit_tree + * Remove tree from chunks. If 'tagged' is set, remove tree only from tagged + * chunks. The function expects tagged chunks are all at the beginning of the + * chunks list. */ -static void prune_one(struct audit_tree *victim) +static void prune_tree_chunks(struct audit_tree *victim, bool tagged) { spin_lock(&hash_lock); while (!list_empty(&victim->chunks)) { struct node *p; + struct audit_chunk *chunk; + struct fsnotify_mark *mark; + + p = list_first_entry(&victim->chunks, struct node, list); + /* have we run out of marked? */ + if (tagged && !(p->index & (1U<<31))) + break; + chunk = find_chunk(p); + mark = chunk->mark; + remove_chunk_node(chunk, p); + fsnotify_get_mark(mark); + spin_unlock(&hash_lock); - p = list_entry(victim->chunks.next, struct node, list); + untag_chunk(chunk, mark); + fsnotify_put_mark(mark); - untag_chunk(p); + spin_lock(&hash_lock); } spin_unlock(&hash_lock); put_tree(victim); } +/* + * finish killing struct audit_tree + */ +static void prune_one(struct audit_tree *victim) +{ + prune_tree_chunks(victim, false); +} + /* trim the uncommitted chunks from tree */ static void trim_marked(struct audit_tree *tree) @@ -614,18 +624,11 @@ static void trim_marked(struct audit_tree *tree) list_add(p, &tree->chunks); } } + spin_unlock(&hash_lock); - while (!list_empty(&tree->chunks)) { - struct node *node; - - node = list_entry(tree->chunks.next, struct node, list); - - /* have we run out of marked? */ - if (!(node->index & (1U<<31))) - break; + prune_tree_chunks(tree, true); - untag_chunk(node); - } + spin_lock(&hash_lock); if (!tree->root && !tree->goner) { tree->goner = 1; spin_unlock(&hash_lock); -- cgit From 83d23bc8aedc51fc40078026e9fae6e349d83b2a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:55:16 -0500 Subject: audit: Replace chunk attached to mark instead of replacing mark Audit tree code currently associates new fsnotify mark with each new chunk. As chunk attached to an inode is replaced when new tag is added / removed, we also need to remove old fsnotify mark and add a new one on such occasion. This is cumbersome and makes locking rules somewhat difficult to follow. Fix these problems by allocating fsnotify mark independently of chunk and keeping it all the time while there is some chunk attached to an inode. Also add documentation about the locking rules so that things are easier to follow. Signed-off-by: Jan Kara Reviewed-by: Richard Guy Briggs [PM: minor merge fuzz due to updated patches previously in the series] Signed-off-by: Paul Moore --- kernel/audit_tree.c | 160 +++++++++++++++++++++++++++------------------------- 1 file changed, 83 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 82b27da7031c..1d8dc20296fb 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -27,7 +27,6 @@ struct audit_chunk { unsigned long key; struct fsnotify_mark *mark; struct list_head trees; /* with root here */ - int dead; int count; atomic_long_t refs; struct rcu_head head; @@ -48,8 +47,15 @@ static LIST_HEAD(prune_list); static struct task_struct *prune_thread; /* - * One struct chunk is attached to each inode of interest. - * We replace struct chunk on tagging/untagging. + * One struct chunk is attached to each inode of interest through + * audit_tree_mark (fsnotify mark). We replace struct chunk on tagging / + * untagging, the mark is stable as long as there is chunk attached. The + * association between mark and chunk is protected by hash_lock and + * audit_tree_group->mark_mutex. Thus as long as we hold + * audit_tree_group->mark_mutex and check that the mark is alive by + * FSNOTIFY_MARK_FLAG_ATTACHED flag check, we are sure the mark points to + * the current chunk. + * * Rules have pointer to struct audit_tree. * Rules have struct list_head rlist forming a list of rules over * the same tree. @@ -68,8 +74,12 @@ static struct task_struct *prune_thread; * tree is refcounted; one reference for "some rules on rules_list refer to * it", one for each chunk with pointer to it. * - * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount - * of watch contributes 1 to .refs). + * chunk is refcounted by embedded .refs. Mark associated with the chunk holds + * one chunk reference. This reference is dropped either when a mark is going + * to be freed (corresponding inode goes away) or when chunk attached to the + * mark gets replaced. This reference must be dropped using + * audit_mark_put_chunk() to make sure the reference is dropped only after RCU + * grace period as it protects RCU readers of the hash table. * * node.index allows to get from node.list to containing chunk. * MSB of that sucker is stolen to mark taggings that we might have to @@ -160,8 +170,6 @@ static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark) static void audit_tree_destroy_watch(struct fsnotify_mark *entry) { - struct audit_chunk *chunk = mark_chunk(entry); - audit_mark_put_chunk(chunk); kmem_cache_free(audit_tree_mark_cachep, audit_mark(entry)); } @@ -188,13 +196,6 @@ static struct audit_chunk *alloc_chunk(int count) if (!chunk) return NULL; - chunk->mark = alloc_mark(); - if (!chunk->mark) { - kfree(chunk); - return NULL; - } - audit_mark(chunk->mark)->chunk = chunk; - INIT_LIST_HEAD(&chunk->hash); INIT_LIST_HEAD(&chunk->trees); chunk->count = count; @@ -277,6 +278,20 @@ static struct audit_chunk *find_chunk(struct node *p) return container_of(p, struct audit_chunk, owners[0]); } +static void replace_mark_chunk(struct fsnotify_mark *entry, + struct audit_chunk *chunk) +{ + struct audit_chunk *old; + + assert_spin_locked(&hash_lock); + old = mark_chunk(entry); + audit_mark(entry)->chunk = chunk; + if (chunk) + chunk->mark = entry; + if (old) + old->mark = NULL; +} + static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old) { struct audit_tree *owner; @@ -299,6 +314,7 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old) get_tree(owner); list_replace_init(&old->owners[j].list, &new->owners[i].list); } + replace_mark_chunk(old->mark, new); /* * Make sure chunk is fully initialized before making it visible in the * hash. Pairs with a data dependency barrier in READ_ONCE() in @@ -339,21 +355,23 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry) mutex_lock(&audit_tree_group->mark_mutex); /* - * mark_mutex protects mark from getting detached and thus also from - * mark->connector->obj getting NULL. + * mark_mutex stabilizes chunk attached to the mark so we can check + * whether it didn't change while we've dropped hash_lock. */ - if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) + if (!(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED) || + mark_chunk(entry) != chunk) goto out_mutex; size = chunk_count_trees(chunk); if (!size) { - chunk->dead = 1; spin_lock(&hash_lock); list_del_init(&chunk->trees); list_del_rcu(&chunk->hash); + replace_mark_chunk(entry, NULL); spin_unlock(&hash_lock); fsnotify_detach_mark(entry); mutex_unlock(&audit_tree_group->mark_mutex); + audit_mark_put_chunk(chunk); fsnotify_free_mark(entry); return; } @@ -362,13 +380,6 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry) if (!new) goto out_mutex; - if (fsnotify_add_mark_locked(new->mark, entry->connector->obj, - FSNOTIFY_OBJ_TYPE_INODE, 1)) { - fsnotify_put_mark(new->mark); - goto out_mutex; - } - - chunk->dead = 1; spin_lock(&hash_lock); /* * This has to go last when updating chunk as once replace_chunk() is @@ -376,10 +387,8 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry) */ replace_chunk(new, chunk); spin_unlock(&hash_lock); - fsnotify_detach_mark(entry); mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_free_mark(entry); - fsnotify_put_mark(new->mark); /* drop initial reference */ + audit_mark_put_chunk(chunk); return; out_mutex: @@ -397,23 +406,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) return -ENOMEM; } - entry = chunk->mark; + entry = alloc_mark(); + if (!entry) { + mutex_unlock(&audit_tree_group->mark_mutex); + kfree(chunk); + return -ENOMEM; + } + if (fsnotify_add_inode_mark_locked(entry, inode, 0)) { mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_put_mark(entry); + kfree(chunk); return -ENOSPC; } spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); - chunk->dead = 1; fsnotify_detach_mark(entry); mutex_unlock(&audit_tree_group->mark_mutex); fsnotify_free_mark(entry); fsnotify_put_mark(entry); + kfree(chunk); return 0; } + replace_mark_chunk(entry, chunk); chunk->owners[0].index = (1U << 31); chunk->owners[0].owner = tree; get_tree(tree); @@ -430,33 +447,41 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) insert_hash(chunk); spin_unlock(&hash_lock); mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_put_mark(entry); /* drop initial reference */ + /* + * Drop our initial reference. When mark we point to is getting freed, + * we get notification through ->freeing_mark callback and cleanup + * chunk pointing to this mark. + */ + fsnotify_put_mark(entry); return 0; } /* the first tagged inode becomes root of tree */ static int tag_chunk(struct inode *inode, struct audit_tree *tree) { - struct fsnotify_mark *old_entry, *chunk_entry; + struct fsnotify_mark *entry; struct audit_chunk *chunk, *old; struct node *p; int n; mutex_lock(&audit_tree_group->mark_mutex); - old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks, - audit_tree_group); - if (!old_entry) + entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group); + if (!entry) return create_chunk(inode, tree); - old = mark_chunk(old_entry); - + /* + * Found mark is guaranteed to be attached and mark_mutex protects mark + * from getting detached and thus it makes sure there is chunk attached + * to the mark. + */ /* are we already there? */ spin_lock(&hash_lock); + old = mark_chunk(entry); for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_put_mark(old_entry); + fsnotify_put_mark(entry); return 0; } } @@ -465,41 +490,16 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk = alloc_chunk(old->count + 1); if (!chunk) { mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_put_mark(old_entry); + fsnotify_put_mark(entry); return -ENOMEM; } - chunk_entry = chunk->mark; - - /* - * mark_mutex protects mark from getting detached and thus also from - * mark->connector->obj getting NULL. - */ - if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { - /* old_entry is being shot, lets just lie */ - mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_put_mark(old_entry); - fsnotify_put_mark(chunk->mark); - return -ENOENT; - } - - if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->obj, - FSNOTIFY_OBJ_TYPE_INODE, 1)) { - mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_put_mark(chunk_entry); - fsnotify_put_mark(old_entry); - return -ENOSPC; - } - spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); - chunk->dead = 1; - fsnotify_detach_mark(chunk_entry); mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_free_mark(chunk_entry); - fsnotify_put_mark(chunk_entry); - fsnotify_put_mark(old_entry); + fsnotify_put_mark(entry); + kfree(chunk); return 0; } p = &chunk->owners[chunk->count - 1]; @@ -507,7 +507,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) p->owner = tree; get_tree(tree); list_add(&p->list, &tree->chunks); - old->dead = 1; if (!tree->root) { tree->root = chunk; list_add(&tree->same_root, &chunk->trees); @@ -518,11 +517,10 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) */ replace_chunk(chunk, old); spin_unlock(&hash_lock); - fsnotify_detach_mark(old_entry); mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_free_mark(old_entry); - fsnotify_put_mark(chunk_entry); /* drop initial reference */ - fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ + fsnotify_put_mark(entry); /* pair to fsnotify_find mark_entry */ + audit_mark_put_chunk(old); + return 0; } @@ -585,6 +583,9 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged) chunk = find_chunk(p); mark = chunk->mark; remove_chunk_node(chunk, p); + /* Racing with audit_tree_freeing_mark()? */ + if (!mark) + continue; fsnotify_get_mark(mark); spin_unlock(&hash_lock); @@ -1007,10 +1008,6 @@ static void evict_chunk(struct audit_chunk *chunk) int need_prune = 0; int n; - if (chunk->dead) - return; - - chunk->dead = 1; mutex_lock(&audit_filter_mutex); spin_lock(&hash_lock); while (!list_empty(&chunk->trees)) { @@ -1049,9 +1046,18 @@ static int audit_tree_handle_event(struct fsnotify_group *group, static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) { - struct audit_chunk *chunk = mark_chunk(entry); + struct audit_chunk *chunk; - evict_chunk(chunk); + mutex_lock(&entry->group->mark_mutex); + spin_lock(&hash_lock); + chunk = mark_chunk(entry); + replace_mark_chunk(entry, NULL); + spin_unlock(&hash_lock); + mutex_unlock(&entry->group->mark_mutex); + if (chunk) { + evict_chunk(chunk); + audit_mark_put_chunk(chunk); + } /* * We are guaranteed to have at least one reference to the mark from -- cgit From f905c2fc3980a41aeccb8673ab10ed5e616391fd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Nov 2018 09:55:16 -0500 Subject: audit: Use 'mark' name for fsnotify_mark variables Variables pointing to fsnotify_mark are sometimes called 'entry' and sometimes 'mark'. Use 'mark' in all places. Reviewed-by: Richard Guy Briggs Signed-off-by: Jan Kara [PM: minor merge fuzz due to updated patches previously in the series] Signed-off-by: Paul Moore --- kernel/audit_tree.c | 79 +++++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 1d8dc20296fb..58e84eb5d826 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -158,9 +158,9 @@ static void audit_mark_put_chunk(struct audit_chunk *chunk) call_rcu(&chunk->head, __put_chunk); } -static inline struct audit_tree_mark *audit_mark(struct fsnotify_mark *entry) +static inline struct audit_tree_mark *audit_mark(struct fsnotify_mark *mark) { - return container_of(entry, struct audit_tree_mark, mark); + return container_of(mark, struct audit_tree_mark, mark); } static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark) @@ -168,9 +168,9 @@ static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark) return audit_mark(mark)->chunk; } -static void audit_tree_destroy_watch(struct fsnotify_mark *entry) +static void audit_tree_destroy_watch(struct fsnotify_mark *mark) { - kmem_cache_free(audit_tree_mark_cachep, audit_mark(entry)); + kmem_cache_free(audit_tree_mark_cachep, audit_mark(mark)); } static struct fsnotify_mark *alloc_mark(void) @@ -224,7 +224,7 @@ static inline struct list_head *chunk_hash(unsigned long key) return chunk_hash_heads + n % HASH_SIZE; } -/* hash_lock & entry->group->mark_mutex is held by caller */ +/* hash_lock & mark->group->mark_mutex is held by caller */ static void insert_hash(struct audit_chunk *chunk) { struct list_head *list; @@ -278,16 +278,16 @@ static struct audit_chunk *find_chunk(struct node *p) return container_of(p, struct audit_chunk, owners[0]); } -static void replace_mark_chunk(struct fsnotify_mark *entry, +static void replace_mark_chunk(struct fsnotify_mark *mark, struct audit_chunk *chunk) { struct audit_chunk *old; assert_spin_locked(&hash_lock); - old = mark_chunk(entry); - audit_mark(entry)->chunk = chunk; + old = mark_chunk(mark); + audit_mark(mark)->chunk = chunk; if (chunk) - chunk->mark = entry; + chunk->mark = mark; if (old) old->mark = NULL; } @@ -348,7 +348,7 @@ static int chunk_count_trees(struct audit_chunk *chunk) return ret; } -static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry) +static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark) { struct audit_chunk *new; int size; @@ -358,8 +358,8 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry) * mark_mutex stabilizes chunk attached to the mark so we can check * whether it didn't change while we've dropped hash_lock. */ - if (!(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED) || - mark_chunk(entry) != chunk) + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) || + mark_chunk(mark) != chunk) goto out_mutex; size = chunk_count_trees(chunk); @@ -367,12 +367,12 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *entry) spin_lock(&hash_lock); list_del_init(&chunk->trees); list_del_rcu(&chunk->hash); - replace_mark_chunk(entry, NULL); + replace_mark_chunk(mark, NULL); spin_unlock(&hash_lock); - fsnotify_detach_mark(entry); + fsnotify_detach_mark(mark); mutex_unlock(&audit_tree_group->mark_mutex); audit_mark_put_chunk(chunk); - fsnotify_free_mark(entry); + fsnotify_free_mark(mark); return; } @@ -398,7 +398,7 @@ out_mutex: /* Call with group->mark_mutex held, releases it */ static int create_chunk(struct inode *inode, struct audit_tree *tree) { - struct fsnotify_mark *entry; + struct fsnotify_mark *mark; struct audit_chunk *chunk = alloc_chunk(1); if (!chunk) { @@ -406,16 +406,16 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) return -ENOMEM; } - entry = alloc_mark(); - if (!entry) { + mark = alloc_mark(); + if (!mark) { mutex_unlock(&audit_tree_group->mark_mutex); kfree(chunk); return -ENOMEM; } - if (fsnotify_add_inode_mark_locked(entry, inode, 0)) { + if (fsnotify_add_inode_mark_locked(mark, inode, 0)) { mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_put_mark(entry); + fsnotify_put_mark(mark); kfree(chunk); return -ENOSPC; } @@ -423,14 +423,14 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); - fsnotify_detach_mark(entry); + fsnotify_detach_mark(mark); mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_free_mark(entry); - fsnotify_put_mark(entry); + fsnotify_free_mark(mark); + fsnotify_put_mark(mark); kfree(chunk); return 0; } - replace_mark_chunk(entry, chunk); + replace_mark_chunk(mark, chunk); chunk->owners[0].index = (1U << 31); chunk->owners[0].owner = tree; get_tree(tree); @@ -452,21 +452,21 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) * we get notification through ->freeing_mark callback and cleanup * chunk pointing to this mark. */ - fsnotify_put_mark(entry); + fsnotify_put_mark(mark); return 0; } /* the first tagged inode becomes root of tree */ static int tag_chunk(struct inode *inode, struct audit_tree *tree) { - struct fsnotify_mark *entry; + struct fsnotify_mark *mark; struct audit_chunk *chunk, *old; struct node *p; int n; mutex_lock(&audit_tree_group->mark_mutex); - entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group); - if (!entry) + mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group); + if (!mark) return create_chunk(inode, tree); /* @@ -476,12 +476,12 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) */ /* are we already there? */ spin_lock(&hash_lock); - old = mark_chunk(entry); + old = mark_chunk(mark); for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_put_mark(entry); + fsnotify_put_mark(mark); return 0; } } @@ -490,7 +490,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk = alloc_chunk(old->count + 1); if (!chunk) { mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_put_mark(entry); + fsnotify_put_mark(mark); return -ENOMEM; } @@ -498,7 +498,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) if (tree->goner) { spin_unlock(&hash_lock); mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_put_mark(entry); + fsnotify_put_mark(mark); kfree(chunk); return 0; } @@ -518,7 +518,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) replace_chunk(chunk, old); spin_unlock(&hash_lock); mutex_unlock(&audit_tree_group->mark_mutex); - fsnotify_put_mark(entry); /* pair to fsnotify_find mark_entry */ + fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */ audit_mark_put_chunk(old); return 0; @@ -1044,16 +1044,17 @@ static int audit_tree_handle_event(struct fsnotify_group *group, return 0; } -static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) +static void audit_tree_freeing_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group) { struct audit_chunk *chunk; - mutex_lock(&entry->group->mark_mutex); + mutex_lock(&mark->group->mark_mutex); spin_lock(&hash_lock); - chunk = mark_chunk(entry); - replace_mark_chunk(entry, NULL); + chunk = mark_chunk(mark); + replace_mark_chunk(mark, NULL); spin_unlock(&hash_lock); - mutex_unlock(&entry->group->mark_mutex); + mutex_unlock(&mark->group->mark_mutex); if (chunk) { evict_chunk(chunk); audit_mark_put_chunk(chunk); @@ -1063,7 +1064,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify * We are guaranteed to have at least one reference to the mark from * either the inode or the caller of fsnotify_destroy_mark(). */ - BUG_ON(refcount_read(&entry->refcnt) < 1); + BUG_ON(refcount_read(&mark->refcnt) < 1); } static const struct fsnotify_ops audit_tree_ops = { -- cgit From 9213784b48f8ba666b4695ca3f5d34f583daab83 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Oct 2018 08:26:00 -0700 Subject: rcu: Eliminate BUG_ON() for kernel/rcu/tree_plugin.h The tree_plugin.h file has a number of calls to BUG_ON(), which panics the kernel, which is not a good strategy for devices (like embedded) that don't have a way to capture console output. This commit therefore converts these BUG_ON() calls to WARN_ON_ONCE() and WARN_ONCE(). Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney [ paulmck: Fix typo: s/rcuo/rcub/. ] --- kernel/rcu/tree_plugin.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 05915e536336..adda608874a8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1464,7 +1464,8 @@ static void __init rcu_spawn_boost_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); + if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__)) + return; rcu_for_each_leaf_node(rnp) (void)rcu_spawn_one_boost_kthread(rnp); } @@ -2322,7 +2323,8 @@ static int rcu_nocb_kthread(void *arg) tail = rdp->nocb_follower_tail; rdp->nocb_follower_tail = &rdp->nocb_follower_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - BUG_ON(!list); + if (WARN_ON_ONCE(!list)) + continue; trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty")); /* Each pass through the following loop invokes a callback. */ @@ -2495,7 +2497,8 @@ static void rcu_spawn_one_nocb_kthread(int cpu) /* Spawn the kthread for this CPU. */ t = kthread_run(rcu_nocb_kthread, rdp_spawn, "rcuo%c/%d", rcu_state.abbr, cpu); - BUG_ON(IS_ERR(t)); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__)) + return; WRITE_ONCE(rdp_spawn->nocb_kthread, t); } -- cgit From f0ad56e876cdd67730065274625edbcfe0cca278 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Oct 2018 08:33:06 -0700 Subject: rcu: Eliminate BUG_ON() for kernel/rcu/update.c The update.c file has a number of calls to BUG_ON(), which panics the kernel, which is not a good strategy for devices (like embedded) that don't have a way to capture console output. This commit therefore converts these BUG_ON() calls to WARN_ON_ONCE() and WARN_ONCE(). Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f203b94f6b5b..cb26de25658a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -822,7 +822,8 @@ static int __init rcu_spawn_tasks_kthread(void) struct task_struct *t; t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); - BUG_ON(IS_ERR(t)); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) + return 0; smp_mb(); /* Ensure others see full kthread. */ WRITE_ONCE(rcu_tasks_kthread_ptr, t); return 0; -- cgit From b3c1d9ec7c59feadd22693f43145d8285bd35b04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Oct 2018 13:25:32 -0700 Subject: rcu: Avoid double multiply by HZ The rcu_check_gp_start_stall() function multiplies the return value from rcu_jiffies_till_stall_check() by HZ, but the units are already in jiffies. This commit therefore avoids the need for introduction of a jiffies-squared unit by removing the extraneous multiplication. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 121f833acd04..4933f5d9d992 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2603,7 +2603,7 @@ static void force_quiescent_state(void) static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp) { - const unsigned long gpssdelay = rcu_jiffies_till_stall_check() * HZ; + const unsigned long gpssdelay = rcu_jiffies_till_stall_check(); unsigned long flags; unsigned long j; struct rcu_node *rnp_root = rcu_get_root(); -- cgit From 791416c47153b45f640d52baaf30995d9d396a08 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Oct 2018 15:42:44 -0700 Subject: rcu: Parameterize rcu_check_gp_start_stall() In order to debug forward-progress stalls, it is necessary to check for excessively delayed grace-period starts. This is currently done for RCU CPU stall warnings by rcu_check_gp_start_stall(), which checks to see if the start of a requested grace period has been delayed by an RCU CPU stall warning period. Because rcutorture will need to check for the time consumed by an RCU forward-progress delay, this commit promotes gpssdelay from a local variable to a formal parameter. It is not necessary to export rcu_check_gp_start_stall() because rcutorture will access it via a wrapper function. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4933f5d9d992..36e30150e1e9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2600,10 +2600,10 @@ static void force_quiescent_state(void) * This function checks for grace-period requests that fail to motivate * RCU to come out of its idle mode. */ -static void -rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp) +void +rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay) { - const unsigned long gpssdelay = rcu_jiffies_till_stall_check(); unsigned long flags; unsigned long j; struct rcu_node *rnp_root = rcu_get_root(); @@ -2690,7 +2690,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused local_irq_restore(flags); } - rcu_check_gp_start_stall(rnp, rdp); + rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ if (rcu_segcblist_ready_cbs(&rdp->cblist)) -- cgit From 691960197e8daa39bf89886ba2e39de1e33f1ce4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Oct 2018 11:24:08 -0700 Subject: rcu: Add state name to show_rcu_gp_kthreads() output This commit adds the name of the RCU grace-period state to the show_rcu_gp_kthreads() output in order to ease debugging. This commit also moves gp_state_getname() up in the code so that show_rcu_gp_kthreads() can use it. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 36e30150e1e9..ea78532183ac 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -499,6 +499,16 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +/* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + /* * Show the state of the grace-period kthreads. */ @@ -508,8 +518,9 @@ void show_rcu_gp_kthreads(void) struct rcu_data *rdp; struct rcu_node *rnp; - pr_info("%s: wait state: %d ->state: %#lx\n", rcu_state.name, - rcu_state.gp_state, rcu_state.gp_kthread->state); + pr_info("%s: wait state: %s(%d) ->state: %#lx\n", rcu_state.name, + gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, + rcu_state.gp_kthread->state); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) continue; @@ -1142,16 +1153,6 @@ static void record_gp_stall_check_time(void) rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); } -/* - * Convert a ->gp_state value to a character string. - */ -static const char *gp_state_getname(short gs) -{ - if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) - return "???"; - return gp_state_names[gs]; -} - /* * Complain about starvation of grace-period kthread. */ -- cgit From c669c014d1dae9c7cdbfff049c798722f8650829 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Oct 2018 12:42:21 -0700 Subject: rcu: Add jiffies-since-GP-activity to show_rcu_gp_kthreads() This commit adds a printout of the number of jiffies since the last time that the RCU grace-period kthread did any processing. This can be useful when tracking down forward-progress issues. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ea78532183ac..e7c9848d1e1b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -515,12 +515,14 @@ static const char *gp_state_getname(short gs) void show_rcu_gp_kthreads(void) { int cpu; + unsigned long j; struct rcu_data *rdp; struct rcu_node *rnp; - pr_info("%s: wait state: %s(%d) ->state: %#lx\n", rcu_state.name, - gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - rcu_state.gp_kthread->state); + j = jiffies - READ_ONCE(rcu_state.gp_activity); + pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %ld\n", + rcu_state.name, gp_state_getname(rcu_state.gp_state), + rcu_state.gp_state, rcu_state.gp_kthread->state, j); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) continue; -- cgit From 2320bda26df75c02f86c9fa42c3355ebcd16d8ed Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Mon, 8 Oct 2018 06:50:41 +0000 Subject: rcu: Adjust the comment of function rcu_is_watching Because RCU avoids interrupting idle CPUs, rcu_is_watching() is used to test whether or not it is currently legal to run RCU read-side critical sections on this CPU. However, the first sentence and last sentences of current comment for rcu_is_watching have opposite meaning of what is expected. This commit therefore fixes this header comment. Signed-off-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e7c9848d1e1b..429a46fb087a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -904,12 +904,12 @@ void rcu_irq_enter_irqson(void) } /** - * rcu_is_watching - see if RCU thinks that the current CPU is idle + * rcu_is_watching - see if RCU thinks that the current CPU is not idle * * Return true if RCU is watching the running CPU, which means that this * CPU can safely enter RCU read-side critical sections. In other words, - * if the current CPU is in its idle loop and is neither in an interrupt - * or NMI handler, return true. + * if the current CPU is not in its idle loop or is in an interrupt or + * NMI handler, return true. */ bool notrace rcu_is_watching(void) { -- cgit From 0a89e5a402e957a97129423599a6ef6342da2764 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Oct 2018 10:00:58 -0700 Subject: rcu: Trace end of grace period before end of grace period Currently, rcu_gp_cleanup() traces the end of the old grace period after the old grace period has officially ended. This might make intuitive sense, but it also makes for confusing event-trace output because the "end" trace displays not the old but instead the new grace-period number. This commit therefore traces the end of an old grace period just before that grace period officially ends. Reported-by: Aravinda Prasad Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 429a46fb087a..61bae41b1afe 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2035,9 +2035,9 @@ static void rcu_gp_cleanup(void) rnp = rcu_get_root(); raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */ - /* Declare grace period done. */ - rcu_seq_end(&rcu_state.gp_seq); + /* Declare grace period done, trace first to use old GP number. */ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); + rcu_seq_end(&rcu_state.gp_seq); rcu_state.gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); -- cgit From 05f415715ce45da07a0b1a5eac842765b733157f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Oct 2018 04:12:58 -0700 Subject: rcu: Speed up expedited GPs when interrupting RCU reader In PREEMPT kernels, an expedited grace period might send an IPI to a CPU that is executing an RCU read-side critical section. In that case, it would be nice if the rcu_read_unlock() directly interacted with the RCU core code to immediately report the quiescent state. And this does happen in the case where the reader has been preempted. But it would also be a nice performance optimization if immediate reporting also happened in the preemption-free case. This commit therefore adds an ->exp_hint field to the task_struct structure's ->rcu_read_unlock_special field. The IPI handler sets this hint when it has interrupted an RCU read-side critical section, and this causes the outermost rcu_read_unlock() call to invoke rcu_read_unlock_special(), which, if preemption is enabled, reports the quiescent state immediately. If preemption is disabled, then the report is required to be deferred until preemption (or bottom halves or interrupts or whatever) is re-enabled. Because this is a hint, it does nothing for more complicated cases. For example, if the IPI interrupts an RCU reader, but interrupts are disabled across the rcu_read_unlock(), but another rcu_read_lock() is executed before interrupts are re-enabled, the hint will already have been cleared. If you do crazy things like this, reporting will be deferred until some later RCU_SOFTIRQ handler, context switch, cond_resched(), or similar. Reported-by: Joel Fernandes Signed-off-by: Paul E. McKenney Acked-by: Joel Fernandes (Google) --- include/linux/sched.h | 4 +++- kernel/rcu/tree_exp.h | 4 +++- kernel/rcu/tree_plugin.h | 14 +++++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index a51c13c2b1a0..e4c7b6241088 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -572,8 +572,10 @@ union rcu_special { struct { u8 blocked; u8 need_qs; + u8 exp_hint; /* Hint for performance. */ + u8 pad; /* No garbage from compiler! */ } b; /* Bits. */ - u16 s; /* Set of bits. */ + u32 s; /* Set of bits. */ }; enum perf_event_task_context { diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e669ccf3751b..928fe5893a57 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -692,8 +692,10 @@ static void sync_rcu_exp_handler(void *unused) */ if (t->rcu_read_lock_nesting > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->expmask & rdp->grpmask) + if (rnp->expmask & rdp->grpmask) { rdp->deferred_qs = true; + WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); + } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 05915e536336..618956cc7a55 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -642,13 +642,21 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); - if ((preempt_bh_were_disabled || irqs_were_disabled) && - t->rcu_read_unlock_special.b.blocked) { + if (preempt_bh_were_disabled || irqs_were_disabled) { + WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); /* Need to defer quiescent state until everything is enabled. */ - raise_softirq_irqoff(RCU_SOFTIRQ); + if (irqs_were_disabled) { + /* Enabling irqs does not reschedule, so... */ + raise_softirq_irqoff(RCU_SOFTIRQ); + } else { + /* Enabling BH or preempt does reschedule, so... */ + set_tsk_need_resched(current); + set_preempt_need_resched(); + } local_irq_restore(flags); return; } + WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); rcu_preempt_deferred_qs_irqrestore(t, flags); } -- cgit From 117f683c6e0104e1d6dfe8f143ea9c24ab069044 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Nov 2018 14:20:57 -0800 Subject: rcu: Replace this_cpu_ptr() with __this_cpu_read() Because __this_cpu_read() can be lighter weight than equivalent uses of this_cpu_ptr(), this commit replaces the latter with the former. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 618956cc7a55..0bb1c1593ca4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -597,7 +597,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { - return (this_cpu_ptr(&rcu_data)->deferred_qs || + return (__this_cpu_read(rcu_data.deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && t->rcu_read_lock_nesting <= 0; } -- cgit From 5f1a6ef3746f536157922197d98676fa21154549 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Oct 2018 07:36:50 -0700 Subject: rcu: Avoid signed integer overflow in rcu_preempt_deferred_qs() Subtracting INT_MIN can be interpreted as unconditional signed integer overflow, which according to the C standard is undefined behavior. Therefore, kernel build arguments notwithstanding, it would be good to future-proof the code. This commit therefore substitutes INT_MAX for INT_MIN in order to avoid undefined behavior. While in the neighborhood, this commit also creates some meaningful names for INT_MAX and friends in order to improve readability, as suggested by Joel Fernandes. Reported-by: Ran Rozenstein Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0bb1c1593ca4..3ed43f8cb029 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -397,6 +397,11 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return rnp->gp_tasks != NULL; } +/* Bias and limit values for ->rcu_read_lock_nesting. */ +#define RCU_NEST_BIAS INT_MAX +#define RCU_NEST_NMAX (-INT_MAX / 2) +#define RCU_NEST_PMAX (INT_MAX / 2) + /* * Preemptible RCU implementation for rcu_read_lock(). * Just increment ->rcu_read_lock_nesting, shared state will be updated @@ -405,6 +410,8 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) void __rcu_read_lock(void) { current->rcu_read_lock_nesting++; + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -424,20 +431,18 @@ void __rcu_read_unlock(void) --t->rcu_read_lock_nesting; } else { barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = INT_MIN; + t->rcu_read_lock_nesting = -RCU_NEST_BIAS; barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ t->rcu_read_lock_nesting = 0; } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = READ_ONCE(t->rcu_read_lock_nesting); + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + int rrln = t->rcu_read_lock_nesting; - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -617,11 +622,11 @@ static void rcu_preempt_deferred_qs(struct task_struct *t) if (!rcu_preempt_need_deferred_qs(t)) return; if (couldrecurse) - t->rcu_read_lock_nesting -= INT_MIN; + t->rcu_read_lock_nesting -= RCU_NEST_BIAS; local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); if (couldrecurse) - t->rcu_read_lock_nesting += INT_MIN; + t->rcu_read_lock_nesting += RCU_NEST_BIAS; } /* -- cgit From 04547728b7b775333a4f6fbb3c55102a79dc4596 Mon Sep 17 00:00:00 2001 From: Lance Roy Date: Thu, 4 Oct 2018 23:45:46 -0700 Subject: locking/mutex: Replace spin_is_locked() with lockdep lockdep_assert_held() is better suited to checking locking requirements, since it only checks if the current thread holds the lock regardless of whether someone else does. This is also a step towards possibly removing spin_is_locked(). Signed-off-by: Lance Roy Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Signed-off-by: Paul E. McKenney --- kernel/locking/mutex-debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 9aa713629387..771d4ca96dda 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -36,7 +36,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) { - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + lockdep_assert_held(&lock->wait_lock); DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); @@ -51,7 +51,7 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + lockdep_assert_held(&lock->wait_lock); /* Mark the current thread as blocked on the lock: */ task->blocked_on = waiter; -- cgit From b1e3aeb11c5e86ee0988a038c4e7682d6beaa977 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Nov 2018 12:03:33 -0800 Subject: cpuset: Minor cgroup2 interface updates * Rename the partition file from "cpuset.sched.partition" to "cpuset.cpus.partition". * When writing to the partition file, drop "0" and "1" and only accept "member" and "root". Signed-off-by: Tejun Heo Cc: Peter Zijlstra (Intel) Cc: Waiman Long --- Documentation/admin-guide/cgroup-v2.rst | 6 +++--- kernel/cgroup/cpuset.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index f83a5231bbe3..07e06136a550 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1708,15 +1708,15 @@ Cpuset Interface Files Its value will be affected by memory nodes hotplug events. - cpuset.sched.partition + cpuset.cpus.partition A read-write single value file which exists on non-root cpuset-enabled cgroups. This flag is owned by the parent cgroup and is not delegatable. It accepts only the following input values when written to. - "root" or "1" - a paritition root - "member" or "0" - a non-root member of a partition + "root" - a paritition root + "member" - a non-root member of a partition When set to be a partition root, the current cgroup is the root of a new partition or scheduling domain that comprises diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b897314bab53..1151e93d71b6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2468,11 +2468,11 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, buf = strstrip(buf); /* - * Convert "root"/"1" to 1, and convert "member"/"0" to 0. + * Convert "root" to ENABLED, and convert "member" to DISABLED. */ - if (!strcmp(buf, "root") || !strcmp(buf, "1")) + if (!strcmp(buf, "root")) val = PRS_ENABLED; - else if (!strcmp(buf, "member") || !strcmp(buf, "0")) + else if (!strcmp(buf, "member")) val = PRS_DISABLED; else return -EINVAL; @@ -2631,7 +2631,7 @@ static struct cftype dfl_files[] = { }, { - .name = "sched.partition", + .name = "cpus.partition", .seq_show = sched_partition_show, .write = sched_partition_write, .private = FILE_PARTITION_ROOT, -- cgit From c1bbd933e5fae83f96acd3f748bb01a0300b315d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Nov 2018 12:06:41 -0800 Subject: cgroup: Add .__DEBUG__. prefix to debug file names Clearly mark the debug files and hide them by default by prefixing ".__DEBUG__.". Signed-off-by: Tejun Heo Cc: Peter Zijlstra (Intel) Cc: Waiman Long --- kernel/cgroup/cgroup.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ed7f0bfe6429..e06994fd4e34 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1400,12 +1400,15 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, struct cgroup_subsys *ss = cft->ss; if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && - !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) - snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", - cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { + const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : ""; + + snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", + dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); - else + } else { strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + } return buf; } -- cgit From 8ddab428730d66e86ebe90aef15d5f7c5c45fe0d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 9 Nov 2018 13:16:39 +0000 Subject: padata: clean an indentation issue, remove extraneous space Trivial fix to clean up an indentation issue Signed-off-by: Colin Ian King Signed-off-by: Herbert Xu --- kernel/padata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index d568cc56405f..3e2633ae3bca 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -720,7 +720,7 @@ int padata_start(struct padata_instance *pinst) if (pinst->flags & PADATA_INVALID) err = -EINVAL; - __padata_start(pinst); + __padata_start(pinst); mutex_unlock(&pinst->lock); -- cgit From 592ee43faf860c1f2c0a4c11838db6fdb974bb78 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 13 Nov 2018 09:29:26 +0000 Subject: bpf: fix null pointer dereference on pointer offload Pointer offload is being null checked however the following statement dereferences the potentially null pointer offload when assigning offload->dev_state. Fix this by only assigning it if offload is not null. Detected by CoverityScan, CID#1475437 ("Dereference after null check") Fixes: 00db12c3d141 ("bpf: call verifier_prep from its callback in struct bpf_offload_dev") Signed-off-by: Colin Ian King Acked-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- kernel/bpf/offload.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 52c5617e3716..54cf2b9c44a4 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -130,9 +130,10 @@ int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) down_read(&bpf_devs_lock); offload = prog->aux->offload; - if (offload) + if (offload) { ret = offload->offdev->ops->prepare(prog); - offload->dev_state = !ret; + offload->dev_state = !ret; + } up_read(&bpf_devs_lock); return ret; -- cgit From 0fe3c7fceb500de2d0adfb9dcf292580cd43ea38 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 16 Nov 2018 12:16:35 -0500 Subject: audit: localize audit_log_session_info prototype The audit_log_session_info() function is only used in kernel/audit*, so move its prototype to kernel/audit.h Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 2 -- kernel/audit.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 9334fbef7bae..58cf665f597e 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -115,8 +115,6 @@ extern int audit_classify_compat_syscall(int abi, unsigned syscall); struct filename; -extern void audit_log_session_info(struct audit_buffer *ab); - #define AUDIT_OFF 0 #define AUDIT_ON 1 #define AUDIT_LOCKED 2 diff --git a/kernel/audit.h b/kernel/audit.h index 214e14948370..9a3828bd387b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -210,6 +210,8 @@ struct audit_context { extern bool audit_ever_enabled; +extern void audit_log_session_info(struct audit_buffer *ab); + extern void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, struct inode *inode); -- cgit From a2c97da11cdb973b752dd434aee9636ce10ee737 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 16 Nov 2018 16:30:10 -0500 Subject: audit: use session_info helper There are still a couple of places (mark and watch config changes) that open code auid and ses fields in sequence in records instead of using the audit_log_session_info() helper. Use the helper. Adjust the helper to accommodate being the first fields. Passes audit-testsuite. Signed-off-by: Richard Guy Briggs [PM: fixed misspellings in the description] Signed-off-by: Paul Moore --- kernel/audit.c | 6 +++--- kernel/audit_fsnotify.c | 5 ++--- kernel/audit_watch.c | 5 ++--- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 2a8058764aa6..6c53e373b828 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -400,7 +400,7 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; - audit_log_format(ab, "%s=%u old=%u", function_name, new, old); + audit_log_format(ab, "%s=%u old=%u ", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) @@ -1067,7 +1067,7 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return; - audit_log_format(*ab, "pid=%d uid=%u", pid, uid); + audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); } @@ -2042,7 +2042,7 @@ void audit_log_session_info(struct audit_buffer *ab) unsigned int sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); - audit_log_format(ab, " auid=%u ses=%u", auid, sessionid); + audit_log_format(ab, "auid=%u ses=%u", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index fba78047fb37..f90ffa699e5b 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -130,9 +130,8 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "auid=%u ses=%u op=%s", - from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current), op); + audit_log_session_info(ab); + audit_log_format(ab, " op=%s", op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, audit_mark->path); audit_log_key(ab, rule->filterkey); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 787c7afdf829..568e48d1d0ab 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -245,9 +245,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u ses=%u op=%s", - from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current), op); + audit_log_session_info(ab); + audit_log_format(ab, "op=%s", op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, w->path); audit_log_key(ab, r->filterkey); -- cgit From c8fc5d49c341805fee7fc295f2ea8a709f78aec4 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 16 Nov 2018 12:17:35 -0500 Subject: audit: remove WATCH and TREE config options Remove the CONFIG_AUDIT_WATCH and CONFIG_AUDIT_TREE config options since they are both dependent on CONFIG_AUDITSYSCALL and force CONFIG_FSNOTIFY. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- init/Kconfig | 9 --------- kernel/Makefile | 4 +--- kernel/audit.h | 6 +++--- kernel/auditsc.c | 10 ---------- 4 files changed, 4 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index a4112e95724a..7eb2538e6ca0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -335,15 +335,6 @@ config HAVE_ARCH_AUDITSYSCALL config AUDITSYSCALL def_bool y depends on AUDIT && HAVE_ARCH_AUDITSYSCALL - -config AUDIT_WATCH - def_bool y - depends on AUDITSYSCALL - select FSNOTIFY - -config AUDIT_TREE - def_bool y - depends on AUDITSYSCALL select FSNOTIFY source "kernel/irq/Kconfig" diff --git a/kernel/Makefile b/kernel/Makefile index 7343b3a9bff0..9dc7f519129d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -76,9 +76,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o -obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o -obj-$(CONFIG_AUDIT_TREE) += audit_tree.o +obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o diff --git a/kernel/audit.h b/kernel/audit.h index 9a3828bd387b..0b5295aeaebb 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -268,7 +268,7 @@ extern struct tty_struct *audit_get_tty(struct task_struct *tsk); extern void audit_put_tty(struct tty_struct *tty); /* audit watch functions */ -#ifdef CONFIG_AUDIT_WATCH +#ifdef CONFIG_AUDITSYSCALL extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); @@ -301,9 +301,9 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark #define audit_mark_compare(m, i, d) 0 #define audit_exe_compare(t, m) (-EINVAL) #define audit_dupe_exe(n, o) (-EINVAL) -#endif /* CONFIG_AUDIT_WATCH */ +#endif /* CONFIG_AUDITSYSCALL */ -#ifdef CONFIG_AUDIT_TREE +#ifdef CONFIG_AUDITSYSCALL extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); extern void audit_put_chunk(struct audit_chunk *chunk); extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1513873e23bd..605f2d825204 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -200,7 +200,6 @@ static int audit_match_filetype(struct audit_context *ctx, int val) * References in it _are_ dropped - at the same time we free/drop aux stuff. */ -#ifdef CONFIG_AUDIT_TREE static void audit_set_auditable(struct audit_context *ctx) { if (!ctx->prio) { @@ -245,12 +244,10 @@ static int grow_tree_refs(struct audit_context *ctx) ctx->tree_count = 31; return 1; } -#endif static void unroll_tree_refs(struct audit_context *ctx, struct audit_tree_refs *p, int count) { -#ifdef CONFIG_AUDIT_TREE struct audit_tree_refs *q; int n; if (!p) { @@ -274,7 +271,6 @@ static void unroll_tree_refs(struct audit_context *ctx, } ctx->trees = p; ctx->tree_count = count; -#endif } static void free_tree_refs(struct audit_context *ctx) @@ -288,7 +284,6 @@ static void free_tree_refs(struct audit_context *ctx) static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) { -#ifdef CONFIG_AUDIT_TREE struct audit_tree_refs *p; int n; if (!tree) @@ -305,7 +300,6 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) if (audit_tree_match(p->c[n], tree)) return 1; } -#endif return 0; } @@ -1602,7 +1596,6 @@ void __audit_syscall_exit(int success, long return_code) static inline void handle_one(const struct inode *inode) { -#ifdef CONFIG_AUDIT_TREE struct audit_context *context; struct audit_tree_refs *p; struct audit_chunk *chunk; @@ -1627,12 +1620,10 @@ static inline void handle_one(const struct inode *inode) return; } put_tree_ref(context, chunk); -#endif } static void handle_path(const struct dentry *dentry) { -#ifdef CONFIG_AUDIT_TREE struct audit_context *context; struct audit_tree_refs *p; const struct dentry *d, *parent; @@ -1685,7 +1676,6 @@ retry: return; } rcu_read_unlock(); -#endif } static struct audit_names *audit_alloc_name(struct audit_context *context, -- cgit From 96b3b6c9091d23289721350e32c63cc8749686be Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 16 Nov 2018 11:41:08 +0000 Subject: bpf: allow zero-initializing hash map seed Add a new flag BPF_F_ZERO_SEED, which forces a hash map to initialize the seed to zero. This is useful when doing performance analysis both on individual BPF programs, as well as the kernel's hash table implementation. Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/hashtab.c | 13 +++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 47d606d744cc..8c01b89a4cb4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -269,6 +269,9 @@ enum bpf_attach_type { /* Flag for stack_map, store build_id+offset instead of pointer */ #define BPF_F_STACK_BUILD_ID (1U << 5) +/* Zero-initialize hash function seed. This should only be used for testing. */ +#define BPF_F_ZERO_SEED (1U << 6) + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2c1790288138..4b7c76765d9d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -23,7 +23,7 @@ #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY) + BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED) struct bucket { struct hlist_nulls_head head; @@ -244,6 +244,7 @@ static int htab_map_alloc_check(union bpf_attr *attr) */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); int numa_node = bpf_map_attr_numa_node(attr); BUILD_BUG_ON(offsetof(struct htab_elem, htab) != @@ -257,6 +258,10 @@ static int htab_map_alloc_check(union bpf_attr *attr) */ return -EPERM; + if (zero_seed && !capable(CAP_SYS_ADMIN)) + /* Guard against local DoS, and discourage production use. */ + return -EPERM; + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) /* reserved bits should not be used */ return -EINVAL; @@ -373,7 +378,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_htab; - htab->hashrnd = get_random_int(); + if (htab->map.map_flags & BPF_F_ZERO_SEED) + htab->hashrnd = 0; + else + htab->hashrnd = get_random_int(); + for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); raw_spin_lock_init(&htab->buckets[i].lock); -- cgit From e9d81a1bc2c48ea9782e3e8b53875f419766ef47 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Nov 2018 12:15:15 -0800 Subject: cgroup: fix CSS_TASK_ITER_PROCS CSS_TASK_ITER_PROCS implements process-only iteration by making css_task_iter_advance() skip tasks which aren't threadgroup leaders; however, when an iteration is started css_task_iter_start() calls the inner helper function css_task_iter_advance_css_set() instead of css_task_iter_advance(). As the helper doesn't have the skip logic, when the first task to visit is a non-leader thread, it doesn't get skipped correctly as shown in the following example. # ps -L 2030 PID LWP TTY STAT TIME COMMAND 2030 2030 pts/0 Sl+ 0:00 ./test-thread 2030 2031 pts/0 Sl+ 0:00 ./test-thread # mkdir -p /sys/fs/cgroup/x/a/b # echo threaded > /sys/fs/cgroup/x/a/cgroup.type # echo threaded > /sys/fs/cgroup/x/a/b/cgroup.type # echo 2030 > /sys/fs/cgroup/x/a/cgroup.procs # cat /sys/fs/cgroup/x/a/cgroup.threads 2030 2031 # cat /sys/fs/cgroup/x/cgroup.procs 2030 # echo 2030 > /sys/fs/cgroup/x/a/b/cgroup.threads # cat /sys/fs/cgroup/x/cgroup.procs 2031 2030 The last read of cgroup.procs is incorrectly showing non-leader 2031 in cgroup.procs output. This can be fixed by updating css_task_iter_advance() to handle the first advance and css_task_iters_tart() to call css_task_iter_advance() instead of the inner helper. After the fix, the same commands result in the following (correct) result: # ps -L 2062 PID LWP TTY STAT TIME COMMAND 2062 2062 pts/0 Sl+ 0:00 ./test-thread 2062 2063 pts/0 Sl+ 0:00 ./test-thread # mkdir -p /sys/fs/cgroup/x/a/b # echo threaded > /sys/fs/cgroup/x/a/cgroup.type # echo threaded > /sys/fs/cgroup/x/a/b/cgroup.type # echo 2062 > /sys/fs/cgroup/x/a/cgroup.procs # cat /sys/fs/cgroup/x/a/cgroup.threads 2062 2063 # cat /sys/fs/cgroup/x/cgroup.procs 2062 # echo 2062 > /sys/fs/cgroup/x/a/b/cgroup.threads # cat /sys/fs/cgroup/x/cgroup.procs 2062 Signed-off-by: Tejun Heo Reported-by: "Michael Kerrisk (man-pages)" Fixes: 8cfd8147df67 ("cgroup: implement cgroup v2 thread support") Cc: stable@vger.kernel.org # v4.14+ --- kernel/cgroup/cgroup.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6aaf5dd5383b..1f84977fab47 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4202,20 +4202,25 @@ static void css_task_iter_advance(struct css_task_iter *it) lockdep_assert_held(&css_set_lock); repeat: - /* - * Advance iterator to find next entry. cset->tasks is consumed - * first and then ->mg_tasks. After ->mg_tasks, we move onto the - * next cset. - */ - next = it->task_pos->next; + if (it->task_pos) { + /* + * Advance iterator to find next entry. cset->tasks is + * consumed first and then ->mg_tasks. After ->mg_tasks, + * we move onto the next cset. + */ + next = it->task_pos->next; - if (next == it->tasks_head) - next = it->mg_tasks_head->next; + if (next == it->tasks_head) + next = it->mg_tasks_head->next; - if (next == it->mg_tasks_head) + if (next == it->mg_tasks_head) + css_task_iter_advance_css_set(it); + else + it->task_pos = next; + } else { + /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); - else - it->task_pos = next; + } /* if PROCS, skip over tasks which aren't group leaders */ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && @@ -4255,7 +4260,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, it->cset_head = it->cset_pos; - css_task_iter_advance_css_set(it); + css_task_iter_advance(it); spin_unlock_irq(&css_set_lock); } -- cgit From b47a0bd23e34022aa0d4b812fcebe85cb0c54d49 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 19 Nov 2018 15:29:06 -0800 Subject: bpf: btf: Break up btf_type_is_void() This patch breaks up btf_type_is_void() into btf_type_is_void() and btf_type_is_fwd(). It also adds btf_type_nosize() to better describe it is testing a type has nosize info. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ee4c82667d65..2a50d87de485 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -306,15 +306,22 @@ static bool btf_type_is_modifier(const struct btf_type *t) static bool btf_type_is_void(const struct btf_type *t) { - /* void => no type and size info. - * Hence, FWD is also treated as void. - */ - return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD; + return t == &btf_void; +} + +static bool btf_type_is_fwd(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; +} + +static bool btf_type_nosize(const struct btf_type *t) +{ + return btf_type_is_void(t) || btf_type_is_fwd(t); } -static bool btf_type_is_void_or_null(const struct btf_type *t) +static bool btf_type_nosize_or_null(const struct btf_type *t) { - return !t || btf_type_is_void(t); + return !t || btf_type_nosize(t); } /* union is only a special case of struct: @@ -826,7 +833,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, u32 size = 0; size_type = btf_type_by_id(btf, size_type_id); - if (btf_type_is_void_or_null(size_type)) + if (btf_type_nosize_or_null(size_type)) return NULL; if (btf_type_has_size(size_type)) { @@ -842,7 +849,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, size = btf->resolved_sizes[size_type_id]; size_type_id = btf->resolved_ids[size_type_id]; size_type = btf_type_by_id(btf, size_type_id); - if (btf_type_is_void(size_type)) + if (btf_type_nosize_or_null(size_type)) return NULL; } @@ -1164,7 +1171,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, } /* "typedef void new_void", "const void"...etc */ - if (btf_type_is_void(next_type)) + if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type)) goto resolved; if (!env_type_is_resolve_sink(env, next_type) && @@ -1178,7 +1185,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, * pretty print). */ if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && - !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { + !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1205,7 +1212,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, } /* "void *" */ - if (btf_type_is_void(next_type)) + if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type)) goto resolved; if (!env_type_is_resolve_sink(env, next_type) && @@ -1235,7 +1242,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, } if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && - !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { + !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1396,7 +1403,7 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_is_void_or_null(index_type)) { + if (btf_type_nosize_or_null(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1415,7 +1422,7 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_is_void_or_null(elem_type)) { + if (btf_type_nosize_or_null(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -1615,7 +1622,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_is_void_or_null(member_type)) { + if (btf_type_nosize_or_null(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; -- cgit From 2667a2626f4da370409c2830552f6e8c8b8c41e2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 19 Nov 2018 15:29:08 -0800 Subject: bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO This patch adds BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO to support the function debug info. BTF_KIND_FUNC_PROTO must not have a name (i.e. !t->name_off) and it is followed by >= 0 'struct bpf_param' objects to describe the function arguments. The BTF_KIND_FUNC must have a valid name and it must refer back to a BTF_KIND_FUNC_PROTO. The above is the conclusion after the discussion between Edward Cree, Alexei, Daniel, Yonghong and Martin. By combining BTF_KIND_FUNC and BTF_LIND_FUNC_PROTO, a complete function signature can be obtained. It will be used in the later patches to learn the function signature of a running bpf program. Signed-off-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/uapi/linux/btf.h | 18 ++- kernel/bpf/btf.c | 389 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 354 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 972265f32871..14f66948fc95 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -40,7 +40,8 @@ struct btf_type { /* "size" is used by INT, ENUM, STRUCT and UNION. * "size" tells the size of the type it is describing. * - * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT. + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC and FUNC_PROTO. * "type" is a type_id referring to another type. */ union { @@ -64,8 +65,10 @@ struct btf_type { #define BTF_KIND_VOLATILE 9 /* Volatile */ #define BTF_KIND_CONST 10 /* Const */ #define BTF_KIND_RESTRICT 11 /* Restrict */ -#define BTF_KIND_MAX 11 -#define NR_BTF_KINDS 12 +#define BTF_KIND_FUNC 12 /* Function */ +#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ +#define BTF_KIND_MAX 13 +#define NR_BTF_KINDS 14 /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. @@ -110,4 +113,13 @@ struct btf_member { __u32 offset; /* offset in bits */ }; +/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". + * The exact number of btf_param is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_param { + __u32 name_off; + __u32 type; +}; + #endif /* _UAPI__LINUX_BTF_H__ */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2a50d87de485..6a2be79b73fc 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -259,6 +260,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_VOLATILE] = "VOLATILE", [BTF_KIND_CONST] = "CONST", [BTF_KIND_RESTRICT] = "RESTRICT", + [BTF_KIND_FUNC] = "FUNC", + [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", }; struct btf_kind_operations { @@ -281,6 +284,9 @@ struct btf_kind_operations { static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; static struct btf_type btf_void; +static int btf_resolve(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id); + static bool btf_type_is_modifier(const struct btf_type *t) { /* Some of them is not strictly a C modifier @@ -314,9 +320,20 @@ static bool btf_type_is_fwd(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; } +static bool btf_type_is_func(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; +} + +static bool btf_type_is_func_proto(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; +} + static bool btf_type_nosize(const struct btf_type *t) { - return btf_type_is_void(t) || btf_type_is_fwd(t); + return btf_type_is_void(t) || btf_type_is_fwd(t) || + btf_type_is_func(t) || btf_type_is_func_proto(t); } static bool btf_type_nosize_or_null(const struct btf_type *t) @@ -433,6 +450,30 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) offset < btf->hdr.str_len; } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ + /* offset must be valid */ + const char *src = &btf->strings[offset]; + const char *src_limit; + + if (!isalpha(*src) && *src != '_') + return false; + + /* set a limit on identifier length */ + src_limit = src + KSYM_NAME_LEN; + src++; + while (*src && src < src_limit) { + if (!isalnum(*src) && *src != '_') + return false; + src++; + } + + return !*src; +} + static const char *btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) @@ -747,11 +788,15 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, /* int, enum or void is a sink */ return !btf_type_needs_resolve(next_type); case RESOLVE_PTR: - /* int, enum, void, struct or array is a sink for ptr */ + /* int, enum, void, struct, array, func or func_proto is a sink + * for ptr + */ return !btf_type_is_modifier(next_type) && !btf_type_is_ptr(next_type); case RESOLVE_STRUCT_OR_ARRAY: - /* int, enum, void or ptr is a sink for struct and array */ + /* int, enum, void, ptr, func or func_proto is a sink + * for struct and array + */ return !btf_type_is_modifier(next_type) && !btf_type_is_array(next_type) && !btf_type_is_struct(next_type); @@ -1170,10 +1215,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, return -EINVAL; } - /* "typedef void new_void", "const void"...etc */ - if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type)) - goto resolved; - if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); @@ -1184,13 +1225,18 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, * save us a few type-following when we use it later (e.g. in * pretty print). */ - if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && - !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) { - btf_verifier_log_type(env, v->t, "Invalid type_id"); - return -EINVAL; + if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + if (env_type_is_resolved(env, next_type_id)) + next_type = btf_type_id_resolve(btf, &next_type_id); + + /* "typedef void new_void", "const void"...etc */ + if (!btf_type_is_void(next_type) && + !btf_type_is_fwd(next_type)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } } -resolved: env_stack_pop_resolved(env, next_type_id, next_type_size); return 0; @@ -1203,7 +1249,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; - u32 next_type_size = 0; next_type = btf_type_by_id(btf, next_type_id); if (!next_type) { @@ -1211,10 +1256,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, return -EINVAL; } - /* "void *" */ - if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type)) - goto resolved; - if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); @@ -1241,13 +1282,18 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, resolved_type_id); } - if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && - !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) { - btf_verifier_log_type(env, v->t, "Invalid type_id"); - return -EINVAL; + if (!btf_type_id_size(btf, &next_type_id, NULL)) { + if (env_type_is_resolved(env, next_type_id)) + next_type = btf_type_id_resolve(btf, &next_type_id); + + if (!btf_type_is_void(next_type) && + !btf_type_is_fwd(next_type) && + !btf_type_is_func_proto(next_type)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } } -resolved: env_stack_pop_resolved(env, next_type_id, 0); return 0; @@ -1787,6 +1833,232 @@ static struct btf_kind_operations enum_ops = { .seq_show = btf_enum_seq_show, }; +static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_func_proto_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_param *args = (const struct btf_param *)(t + 1); + u16 nr_args = btf_type_vlen(t), i; + + btf_verifier_log(env, "return=%u args=(", t->type); + if (!nr_args) { + btf_verifier_log(env, "void"); + goto done; + } + + if (nr_args == 1 && !args[0].type) { + /* Only one vararg */ + btf_verifier_log(env, "vararg"); + goto done; + } + + btf_verifier_log(env, "%u %s", args[0].type, + btf_name_by_offset(env->btf, + args[0].name_off)); + for (i = 1; i < nr_args - 1; i++) + btf_verifier_log(env, ", %u %s", args[i].type, + btf_name_by_offset(env->btf, + args[i].name_off)); + + if (nr_args > 1) { + const struct btf_param *last_arg = &args[nr_args - 1]; + + if (last_arg->type) + btf_verifier_log(env, ", %u %s", last_arg->type, + btf_name_by_offset(env->btf, + last_arg->name_off)); + else + btf_verifier_log(env, ", vararg"); + } + +done: + btf_verifier_log(env, ")"); +} + +static struct btf_kind_operations func_proto_ops = { + .check_meta = btf_func_proto_check_meta, + .resolve = btf_df_resolve, + /* + * BTF_KIND_FUNC_PROTO cannot be directly referred by + * a struct's member. + * + * It should be a funciton pointer instead. + * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO) + * + * Hence, there is no btf_func_check_member(). + */ + .check_member = btf_df_check_member, + .log_details = btf_func_proto_log, + .seq_show = btf_df_seq_show, +}; + +static s32 btf_func_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static struct btf_kind_operations func_ops = { + .check_meta = btf_func_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_df_check_member, + .log_details = btf_ref_type_log, + .seq_show = btf_df_seq_show, +}; + +static int btf_func_proto_check(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_type *ret_type; + const struct btf_param *args; + const struct btf *btf; + u16 nr_args, i; + int err; + + btf = env->btf; + args = (const struct btf_param *)(t + 1); + nr_args = btf_type_vlen(t); + + /* Check func return type which could be "void" (t->type == 0) */ + if (t->type) { + u32 ret_type_id = t->type; + + ret_type = btf_type_by_id(btf, ret_type_id); + if (!ret_type) { + btf_verifier_log_type(env, t, "Invalid return type"); + return -EINVAL; + } + + if (btf_type_needs_resolve(ret_type) && + !env_type_is_resolved(env, ret_type_id)) { + err = btf_resolve(env, ret_type, ret_type_id); + if (err) + return err; + } + + /* Ensure the return type is a type that has a size */ + if (!btf_type_id_size(btf, &ret_type_id, NULL)) { + btf_verifier_log_type(env, t, "Invalid return type"); + return -EINVAL; + } + } + + if (!nr_args) + return 0; + + /* Last func arg type_id could be 0 if it is a vararg */ + if (!args[nr_args - 1].type) { + if (args[nr_args - 1].name_off) { + btf_verifier_log_type(env, t, "Invalid arg#%u", + nr_args); + return -EINVAL; + } + nr_args--; + } + + err = 0; + for (i = 0; i < nr_args; i++) { + const struct btf_type *arg_type; + u32 arg_type_id; + + arg_type_id = args[i].type; + arg_type = btf_type_by_id(btf, arg_type_id); + if (!arg_type) { + btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + err = -EINVAL; + break; + } + + if (args[i].name_off && + (!btf_name_offset_valid(btf, args[i].name_off) || + !btf_name_valid_identifier(btf, args[i].name_off))) { + btf_verifier_log_type(env, t, + "Invalid arg#%u", i + 1); + err = -EINVAL; + break; + } + + if (btf_type_needs_resolve(arg_type) && + !env_type_is_resolved(env, arg_type_id)) { + err = btf_resolve(env, arg_type, arg_type_id); + if (err) + break; + } + + if (!btf_type_id_size(btf, &arg_type_id, NULL)) { + btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + err = -EINVAL; + break; + } + } + + return err; +} + +static int btf_func_check(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_type *proto_type; + const struct btf_param *args; + const struct btf *btf; + u16 nr_args, i; + + btf = env->btf; + proto_type = btf_type_by_id(btf, t->type); + + if (!proto_type || !btf_type_is_func_proto(proto_type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + args = (const struct btf_param *)(proto_type + 1); + nr_args = btf_type_vlen(proto_type); + for (i = 0; i < nr_args; i++) { + if (!args[i].name_off && args[i].type) { + btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + return -EINVAL; + } + } + + return 0; +} + static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_INT] = &int_ops, [BTF_KIND_PTR] = &ptr_ops, @@ -1799,6 +2071,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_VOLATILE] = &modifier_ops, [BTF_KIND_CONST] = &modifier_ops, [BTF_KIND_RESTRICT] = &modifier_ops, + [BTF_KIND_FUNC] = &func_ops, + [BTF_KIND_FUNC_PROTO] = &func_proto_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -1870,30 +2144,6 @@ static int btf_check_all_metas(struct btf_verifier_env *env) return 0; } -static int btf_resolve(struct btf_verifier_env *env, - const struct btf_type *t, u32 type_id) -{ - const struct resolve_vertex *v; - int err = 0; - - env->resolve_mode = RESOLVE_TBD; - env_stack_push(env, t, type_id); - while (!err && (v = env_stack_peak(env))) { - env->log_type_id = v->type_id; - err = btf_type_ops(v->t)->resolve(env, v); - } - - env->log_type_id = type_id; - if (err == -E2BIG) - btf_verifier_log_type(env, t, - "Exceeded max resolving depth:%u", - MAX_RESOLVE_DEPTH); - else if (err == -EEXIST) - btf_verifier_log_type(env, t, "Loop detected"); - - return err; -} - static bool btf_resolve_valid(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) @@ -1927,6 +2177,39 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, return false; } +static int btf_resolve(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + u32 save_log_type_id = env->log_type_id; + const struct resolve_vertex *v; + int err = 0; + + env->resolve_mode = RESOLVE_TBD; + env_stack_push(env, t, type_id); + while (!err && (v = env_stack_peak(env))) { + env->log_type_id = v->type_id; + err = btf_type_ops(v->t)->resolve(env, v); + } + + env->log_type_id = type_id; + if (err == -E2BIG) { + btf_verifier_log_type(env, t, + "Exceeded max resolving depth:%u", + MAX_RESOLVE_DEPTH); + } else if (err == -EEXIST) { + btf_verifier_log_type(env, t, "Loop detected"); + } + + /* Final sanity check */ + if (!err && !btf_resolve_valid(env, t, type_id)) { + btf_verifier_log_type(env, t, "Invalid resolve state"); + err = -EINVAL; + } + + env->log_type_id = save_log_type_id; + return err; +} + static int btf_check_all_types(struct btf_verifier_env *env) { struct btf *btf = env->btf; @@ -1949,10 +2232,16 @@ static int btf_check_all_types(struct btf_verifier_env *env) return err; } - if (btf_type_needs_resolve(t) && - !btf_resolve_valid(env, t, type_id)) { - btf_verifier_log_type(env, t, "Invalid resolve state"); - return -EINVAL; + if (btf_type_is_func_proto(t)) { + err = btf_func_proto_check(env, t); + if (err) + return err; + } + + if (btf_type_is_func(t)) { + err = btf_func_check(env, t); + if (err) + return err; } } -- cgit From 838e96904ff3fc6c30e5ebbc611474669856e3c0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 19 Nov 2018 15:29:11 -0800 Subject: bpf: Introduce bpf_func_info This patch added interface to load a program with the following additional information: . prog_btf_fd . func_info, func_info_rec_size and func_info_cnt where func_info will provide function range and type_id corresponding to each function. The func_info_rec_size is introduced in the UAPI to specify struct bpf_func_info size passed from user space. This intends to make bpf_func_info structure growable in the future. If the kernel gets a different bpf_func_info size from userspace, it will try to handle user request with part of bpf_func_info it can understand. In this patch, kernel can understand struct bpf_func_info { __u32 insn_offset; __u32 type_id; }; If user passed a bpf func_info record size of 16 bytes, the kernel can still handle part of records with the above definition. If verifier agrees with function range provided by the user, the bpf_prog ksym for each function will use the func name provided in the type_id, which is supposed to provide better encoding as it is not limited by 16 bytes program name limitation and this is better for bpf program which contains multiple subprograms. The bpf_prog_info interface is also extended to return btf_id, func_info, func_info_rec_size and func_info_cnt to userspace, so userspace can print out the function prototype for each xlated function. The insn_offset in the returned func_info corresponds to the insn offset for xlated functions. With other jit related fields in bpf_prog_info, userspace can also print out function prototypes for each jited function. Signed-off-by: Yonghong Song Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +- include/linux/bpf_verifier.h | 1 + include/linux/btf.h | 2 + include/uapi/linux/bpf.h | 13 +++++ kernel/bpf/btf.c | 4 +- kernel/bpf/core.c | 13 +++++ kernel/bpf/syscall.c | 59 +++++++++++++++++++-- kernel/bpf/verifier.c | 120 ++++++++++++++++++++++++++++++++++++++++++- 8 files changed, 209 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 987815152629..7f0e225bf630 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -316,6 +316,8 @@ struct bpf_prog_aux { void *security; #endif struct bpf_prog_offload *offload; + struct btf *btf; + u32 type_id; /* type id for this prog/func */ union { struct work_struct work; struct rcu_head rcu; @@ -527,7 +529,8 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) } /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, + union bpf_attr __user *uattr); void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); /* Map specifics */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 11f5df1092d9..204382f46fd8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -204,6 +204,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u16 stack_depth; /* max. stack depth used by this function */ + u32 type_id; /* btf type_id for this subprog */ }; /* single container for all structs diff --git a/include/linux/btf.h b/include/linux/btf.h index e076c4697049..7f2c0a4a45ea 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -46,5 +46,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); +const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); +const char *btf_name_by_offset(const struct btf *btf, u32 offset); #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 05d95290b848..c1554aa07465 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -338,6 +338,10 @@ union bpf_attr { * (context accesses, allowed helpers, etc). */ __u32 expected_attach_type; + __u32 prog_btf_fd; /* fd pointing to BTF type data */ + __u32 func_info_rec_size; /* userspace bpf_func_info size */ + __aligned_u64 func_info; /* func info */ + __u32 func_info_cnt; /* number of bpf_func_info records */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -2638,6 +2642,10 @@ struct bpf_prog_info { __u32 nr_jited_func_lens; __aligned_u64 jited_ksyms; __aligned_u64 jited_func_lens; + __u32 btf_id; + __u32 func_info_rec_size; + __aligned_u64 func_info; + __u32 func_info_cnt; } __attribute__((aligned(8))); struct bpf_map_info { @@ -2949,4 +2957,9 @@ struct bpf_flow_keys { }; }; +struct bpf_func_info { + __u32 insn_offset; + __u32 type_id; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6a2be79b73fc..69da9169819a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -474,7 +474,7 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return !*src; } -static const char *btf_name_by_offset(const struct btf *btf, u32 offset) +const char *btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) return "(anon)"; @@ -484,7 +484,7 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset) return "(invalid-name-offset)"; } -static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) +const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { if (type_id > btf->nr_types) return NULL; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1a796e0799ec..16d77012ad3e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -21,12 +21,14 @@ * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ +#include #include #include #include #include #include #include +#include #include #include #include @@ -390,6 +392,8 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { const char *end = sym + KSYM_NAME_LEN; + const struct btf_type *type; + const char *func_name; BUILD_BUG_ON(sizeof("bpf_prog_") + sizeof(prog->tag) * 2 + @@ -404,6 +408,15 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); + + /* prog->aux->name will be ignored if full btf name is available */ + if (prog->aux->btf) { + type = btf_type_by_id(prog->aux->btf, prog->aux->type_id); + func_name = btf_name_by_offset(prog->aux->btf, type->name_off); + snprintf(sym, (size_t)(end - sym), "_%s", func_name); + return; + } + if (prog->aux->name[0]) snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); else diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf5040fd5434..998377808102 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1213,6 +1213,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del_all(prog); + btf_put(prog->aux->btf); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -1437,9 +1438,9 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type +#define BPF_PROG_LOAD_LAST_FIELD func_info_cnt -static int bpf_prog_load(union bpf_attr *attr) +static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog; @@ -1525,7 +1526,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - err = bpf_check(&prog, attr); + err = bpf_check(&prog, attr, uattr); if (err < 0) goto free_used_maps; @@ -2079,6 +2080,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; info.nr_jited_func_lens = 0; + info.func_info_cnt = 0; goto done; } @@ -2216,6 +2218,55 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } } + if (prog->aux->btf) { + u32 ucnt, urec_size; + + info.btf_id = btf_id(prog->aux->btf); + + ucnt = info.func_info_cnt; + info.func_info_cnt = prog->aux->func_cnt ? : 1; + urec_size = info.func_info_rec_size; + info.func_info_rec_size = sizeof(struct bpf_func_info); + if (ucnt) { + /* expect passed-in urec_size is what the kernel expects */ + if (urec_size != info.func_info_rec_size) + return -EINVAL; + + if (bpf_dump_raw_ok()) { + struct bpf_func_info kern_finfo; + char __user *user_finfo; + u32 i, insn_offset; + + user_finfo = u64_to_user_ptr(info.func_info); + if (prog->aux->func_cnt) { + ucnt = min_t(u32, info.func_info_cnt, ucnt); + insn_offset = 0; + for (i = 0; i < ucnt; i++) { + kern_finfo.insn_offset = insn_offset; + kern_finfo.type_id = prog->aux->func[i]->aux->type_id; + if (copy_to_user(user_finfo, &kern_finfo, + sizeof(kern_finfo))) + return -EFAULT; + + /* func[i]->len holds the prog len */ + insn_offset += prog->aux->func[i]->len; + user_finfo += urec_size; + } + } else { + kern_finfo.insn_offset = 0; + kern_finfo.type_id = prog->aux->type_id; + if (copy_to_user(user_finfo, &kern_finfo, + sizeof(kern_finfo))) + return -EFAULT; + } + } else { + info.func_info_cnt = 0; + } + } + } else { + info.func_info_cnt = 0; + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) @@ -2501,7 +2552,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = map_get_next_key(&attr); break; case BPF_PROG_LOAD: - err = bpf_prog_load(&attr); + err = bpf_prog_load(&attr, uattr); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b5222aa61d54..f102c4fd0c5a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11,10 +11,12 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ +#include #include #include #include #include +#include #include #include #include @@ -4639,6 +4641,114 @@ err_free: return ret; } +/* The minimum supported BTF func info size */ +#define MIN_BPF_FUNCINFO_SIZE 8 +#define MAX_FUNCINFO_REC_SIZE 252 + +static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, + union bpf_attr *attr, union bpf_attr __user *uattr) +{ + u32 i, nfuncs, urec_size, min_size, prev_offset; + u32 krec_size = sizeof(struct bpf_func_info); + struct bpf_func_info krecord = {}; + const struct btf_type *type; + void __user *urecord; + struct btf *btf; + int ret = 0; + + nfuncs = attr->func_info_cnt; + if (!nfuncs) + return 0; + + if (nfuncs != env->subprog_cnt) { + verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); + return -EINVAL; + } + + urec_size = attr->func_info_rec_size; + if (urec_size < MIN_BPF_FUNCINFO_SIZE || + urec_size > MAX_FUNCINFO_REC_SIZE || + urec_size % sizeof(u32)) { + verbose(env, "invalid func info rec size %u\n", urec_size); + return -EINVAL; + } + + btf = btf_get_by_fd(attr->prog_btf_fd); + if (IS_ERR(btf)) { + verbose(env, "unable to get btf from fd\n"); + return PTR_ERR(btf); + } + + urecord = u64_to_user_ptr(attr->func_info); + min_size = min_t(u32, krec_size, urec_size); + + for (i = 0; i < nfuncs; i++) { + ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); + if (ret) { + if (ret == -E2BIG) { + verbose(env, "nonzero tailing record in func info"); + /* set the size kernel expects so loader can zero + * out the rest of the record. + */ + if (put_user(min_size, &uattr->func_info_rec_size)) + ret = -EFAULT; + } + goto free_btf; + } + + if (copy_from_user(&krecord, urecord, min_size)) { + ret = -EFAULT; + goto free_btf; + } + + /* check insn_offset */ + if (i == 0) { + if (krecord.insn_offset) { + verbose(env, + "nonzero insn_offset %u for the first func info record", + krecord.insn_offset); + ret = -EINVAL; + goto free_btf; + } + } else if (krecord.insn_offset <= prev_offset) { + verbose(env, + "same or smaller insn offset (%u) than previous func info record (%u)", + krecord.insn_offset, prev_offset); + ret = -EINVAL; + goto free_btf; + } + + if (env->subprog_info[i].start != krecord.insn_offset) { + verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); + ret = -EINVAL; + goto free_btf; + } + + /* check type_id */ + type = btf_type_by_id(btf, krecord.type_id); + if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) { + verbose(env, "invalid type id %d in func info", + krecord.type_id); + ret = -EINVAL; + goto free_btf; + } + + if (i == 0) + prog->aux->type_id = krecord.type_id; + env->subprog_info[i].type_id = krecord.type_id; + + prev_offset = krecord.insn_offset; + urecord += urec_size; + } + + prog->aux->btf = btf; + return 0; + +free_btf: + btf_put(btf); + return ret; +} + /* check %cur's range satisfies %old's */ static bool range_within(struct bpf_reg_state *old, struct bpf_reg_state *cur) @@ -5939,6 +6049,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + /* the btf will be freed only at prog->aux */ + func[i]->aux->btf = prog->aux->btf; + func[i]->aux->type_id = env->subprog_info[i].type_id; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -6325,7 +6438,8 @@ static void free_states(struct bpf_verifier_env *env) kfree(env->explored_states); } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, + union bpf_attr __user *uattr) { struct bpf_verifier_env *env; struct bpf_verifier_log *log; @@ -6397,6 +6511,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; + ret = check_btf_func(env->prog, env, attr, uattr); + if (ret < 0) + goto skip_full_check; + ret = do_check(env); if (env->cur_state) { free_verifier_state(env->cur_state, true); -- cgit From 8d75839b843ae0ef8d9db97ed05b493e687e6b75 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Nov 2018 21:39:52 -0800 Subject: bpf, lpm: make longest_prefix_match() faster At LPC 2018 in Vancouver, Vlad Dumitrescu mentioned that longest_prefix_match() has a high cost [1]. One reason for that cost is a loop handling one byte at a time. We can handle more bytes at a time, if enough attention is paid to endianness. I was able to remove ~55 % of longest_prefix_match() cpu costs. [1] https://linuxplumbersconf.org/event/2/contributions/88/attachments/76/87/lpc-bpf-2018-shaping.pdf Signed-off-by: Eric Dumazet Cc: Vlad Dumitrescu Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 59 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 9058317ba9de..bfd4882e1106 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -168,20 +168,59 @@ static size_t longest_prefix_match(const struct lpm_trie *trie, const struct lpm_trie_node *node, const struct bpf_lpm_trie_key *key) { - size_t prefixlen = 0; - size_t i; + u32 limit = min(node->prefixlen, key->prefixlen); + u32 prefixlen = 0, i = 0; - for (i = 0; i < trie->data_size; i++) { - size_t b; + BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32)); + BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32)); - b = 8 - fls(node->data[i] ^ key->data[i]); - prefixlen += b; +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT) - if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen) - return min(node->prefixlen, key->prefixlen); + /* data_size >= 16 has very small probability. + * We do not use a loop for optimal code generation. + */ + if (trie->data_size >= 8) { + u64 diff = be64_to_cpu(*(__be64 *)node->data ^ + *(__be64 *)key->data); + + prefixlen = 64 - fls64(diff); + if (prefixlen >= limit) + return limit; + if (diff) + return prefixlen; + i = 8; + } +#endif + + while (trie->data_size >= i + 4) { + u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^ + *(__be32 *)&key->data[i]); + + prefixlen += 32 - fls(diff); + if (prefixlen >= limit) + return limit; + if (diff) + return prefixlen; + i += 4; + } - if (b < 8) - break; + if (trie->data_size >= i + 2) { + u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^ + *(__be16 *)&key->data[i]); + + prefixlen += 16 - fls(diff); + if (prefixlen >= limit) + return limit; + if (diff) + return prefixlen; + i += 2; + } + + if (trie->data_size >= i + 1) { + prefixlen += 8 - fls(node->data[i] ^ key->data[i]); + + if (prefixlen >= limit) + return limit; } return prefixlen; -- cgit From c7c3f05e341a9a2bd1a92993d4f996cfd6e7348e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 25 Oct 2018 19:10:36 +0900 Subject: panic: avoid deadlocks in re-entrant console drivers From printk()/serial console point of view panic() is special, because it may force CPU to re-enter printk() or/and serial console driver. Therefore, some of serial consoles drivers are re-entrant. E.g. 8250: serial8250_console_write() { if (port->sysrq) locked = 0; else if (oops_in_progress) locked = spin_trylock_irqsave(&port->lock, flags); else spin_lock_irqsave(&port->lock, flags); ... } panic() does set oops_in_progress via bust_spinlocks(1), so in theory we should be able to re-enter serial console driver from panic(): CPU0 uart_console_write() serial8250_console_write() // if (oops_in_progress) // spin_trylock_irqsave() call_console_drivers() console_unlock() console_flush_on_panic() bust_spinlocks(1) // oops_in_progress++ panic() spin_lock_irqsave(&port->lock, flags) // spin_lock_irqsave() serial8250_console_write() call_console_drivers() console_unlock() printk() ... However, this does not happen and we deadlock in serial console on port->lock spinlock. And the problem is that console_flush_on_panic() called after bust_spinlocks(0): void panic(const char *fmt, ...) { bust_spinlocks(1); ... bust_spinlocks(0); console_flush_on_panic(); ... } bust_spinlocks(0) decrements oops_in_progress, so oops_in_progress can go back to zero. Thus even re-entrant console drivers will simply spin on port->lock spinlock. Given that port->lock may already be locked either by a stopped CPU, or by the very same CPU we execute panic() on (for instance, NMI panic() on printing CPU) the system deadlocks and does not reboot. Fix this by removing bust_spinlocks(0), so oops_in_progress is always set in panic() now and, thus, re-entrant console drivers will trylock the port->lock instead of spinning on it forever, when we call them from console_flush_on_panic(). Link: http://lkml.kernel.org/r/20181025101036.6823-1-sergey.senozhatsky@gmail.com Cc: Steven Rostedt Cc: Daniel Wang Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Alan Cox Cc: Jiri Slaby Cc: Peter Feiner Cc: linux-serial@vger.kernel.org Cc: Sergey Senozhatsky Cc: stable@vger.kernel.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/panic.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 8b2e002d52eb..6a6df23acd1a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -233,7 +234,10 @@ void panic(const char *fmt, ...) if (_crash_kexec_post_notifiers) __crash_kexec(NULL); - bust_spinlocks(0); +#ifdef CONFIG_VT + unblank_screen(); +#endif + console_unblank(); /* * We may have ended up stopping the CPU holding the lock (in -- cgit From 58c5fc2b96e4ae65068d815a1c3ca81da92fa1c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Oct 2018 19:21:08 +0100 Subject: time: Remove useless filenames in top level comments Remove the pointless filenames in the top level comments. They have no value at all and just occupy space. While at it tidy up some of the comments and remove a stale one. Signed-off-by: Thomas Gleixner Acked-by: Nicolas Pitre Acked-by: Kees Cook Acked-by: Ingo Molnar Acked-by: John Stultz Acked-by: Corey Minyard Cc: Peter Zijlstra Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Peter Anvin Cc: Russell King Cc: Richard Cochran Cc: "Paul E. McKenney" Cc: David Riley Cc: Colin Cross Cc: Mark Brown Link: https://lkml.kernel.org/r/20181031182252.794898238@linutronix.de --- include/linux/hrtimer.h | 2 -- kernel/time/clockevents.c | 2 -- kernel/time/clocksource.c | 5 ----- kernel/time/hrtimer.c | 16 ++++------------ kernel/time/itimer.c | 2 -- kernel/time/jiffies.c | 2 -- kernel/time/posix-clock.c | 2 +- kernel/time/posix-timers.c | 4 ---- kernel/time/sched_clock.c | 4 ++-- kernel/time/tick-broadcast-hrtimer.c | 4 +--- kernel/time/tick-broadcast.c | 2 -- kernel/time/tick-common.c | 2 -- kernel/time/tick-oneshot.c | 2 -- kernel/time/tick-sched.c | 2 -- kernel/time/time.c | 12 ++++-------- kernel/time/timecounter.c | 6 +----- kernel/time/timekeeping.c | 10 ++-------- kernel/time/timer.c | 2 -- kernel/time/timer_list.c | 2 -- 19 files changed, 15 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 3892e9c8b2de..50ebe2ad43e0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -1,6 +1,4 @@ /* - * include/linux/hrtimer.h - * * hrtimers - High-resolution kernel timers * * Copyright(C) 2005, Thomas Gleixner diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index af58898d9ebf..9b8c7c0fd113 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -1,6 +1,4 @@ /* - * linux/kernel/time/clockevents.c - * * This file contains functions which manage clock event devices. * * Copyright(C) 2005-2006, Thomas Gleixner diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ffe081623aec..1c5273fbd500 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -1,6 +1,4 @@ /* - * linux/kernel/time/clocksource.c - * * This file contains the functions which manage clocksource drivers. * * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) @@ -18,9 +16,6 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * TODO WishList: - * o Allow clocksource drivers to be unregistered */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9cdd74bd2d27..223548bb81c6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1,26 +1,18 @@ /* - * linux/kernel/hrtimer.c - * * Copyright(C) 2005-2006, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * - * In contrast to the low-resolution timeout API implemented in - * kernel/timer.c, hrtimers provide finer resolution and accuracy - * depending on system configuration and capabilities. - * - * These timers are currently used for: - * - itimers - * - POSIX timers - * - nanosleep - * - precise in-kernel timing + * In contrast to the low-resolution timeout API, aka timer wheel, + * hrtimers provide finer resolution and accuracy depending on system + * configuration and capabilities. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: - * based on kernel/timer.c + * Based on the original timer wheel code * * Help, testing, suggestions, bugfixes, improvements were * provided by: diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 9a65713c8309..02068b2d5862 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/itimer.c - * * Copyright (C) 1992 Darren Senn */ diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 497719127bf9..9c3957fe9317 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -1,6 +1,4 @@ /*********************************************************************** -* linux/kernel/time/jiffies.c -* * This file contains the jiffies based clocksource. * * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index fe56c4e06c51..4959815f4fd7 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -1,5 +1,5 @@ /* - * posix-clock.c - support for dynamic clock devices + * Support for dynamic clock devices * * Copyright (C) 2010 OMICRON electronics GmbH * diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index bd62b5eeb5a0..c72307c119d9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1,10 +1,6 @@ /* - * linux/kernel/posix-timers.c - * - * * 2002-10-15 Posix Clocks & timers * by George Anzinger george@mvista.com - * * Copyright (C) 2002 2003 by MontaVista Software. * * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index cbc72c2c1fca..b38b6628f89b 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -1,6 +1,6 @@ /* - * sched_clock.c: Generic sched_clock() support, to extend low level - * hardware time counters to full 64-bit ns values. + * Generic sched_clock() support, to extend low level hardware time + * counters to full 64-bit ns values. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index a59641fb88b6..5be6154e2fd2 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -1,8 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/time/tick-broadcast-hrtimer.c - * This file emulates a local clock event device - * via a pseudo clock device. + * Emulate a local clock event device via a pseudo clock device. */ #include #include diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index be0aac2b4300..4f5abde2dfa7 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1,6 +1,4 @@ /* - * linux/kernel/time/tick-broadcast.c - * * This file contains functions which emulate a local clock-event * device via a broadcast event source. * diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 14de3727b18e..7b5008039c2d 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -1,6 +1,4 @@ /* - * linux/kernel/time/tick-common.c - * * This file contains the base functions to manage periodic tick * related events. * diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 6fe615d57ebb..77989efe13d2 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -1,6 +1,4 @@ /* - * linux/kernel/time/tick-oneshot.c - * * This file contains functions which manage high resolution tick * related events. * diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69e673b88474..cb557e56a19f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1,6 +1,4 @@ /* - * linux/kernel/time/tick-sched.c - * * Copyright(C) 2005-2006, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner diff --git a/kernel/time/time.c b/kernel/time/time.c index ad204cf6d001..13ffa9950ffc 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -1,14 +1,10 @@ /* - * linux/kernel/time.c - * * Copyright (C) 1991, 1992 Linus Torvalds * - * This file contains the interface functions for the various - * time related system calls: time, stime, gettimeofday, settimeofday, - * adjtime - */ -/* - * Modification history kernel/time.c + * This file contains the interface functions for the various time related + * system calls: time, stime, gettimeofday, settimeofday, adjtime + * + * Modification history: * * 1993-09-02 Philip Gladstone * Created file with time related functions from sched/core.c and adjtimex() diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index 8afd78932bdf..400f3456d564 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -1,8 +1,5 @@ /* - * linux/kernel/time/timecounter.c - * - * based on code that migrated away from - * linux/kernel/time/clocksource.c + * Based on clocksource code. See commit 74d23cc704d1 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -14,7 +11,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ - #include #include diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2d110c948805..30fdf48f50c2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1,13 +1,7 @@ /* - * linux/kernel/time/timekeeping.c - * - * Kernel timekeeping code and accessor functions - * - * This code was moved from linux/kernel/timer.c. - * Please see that file for copyright and history logs. - * + * Kernel timekeeping code and accessor functions. Based on code from + * timer.c, moved in commit 8524070b7982. */ - #include #include #include diff --git a/kernel/time/timer.c b/kernel/time/timer.c index fa49cd753dea..2f248bbedb4a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1,6 +1,4 @@ /* - * linux/kernel/timer.c - * * Kernel internal timers * * Copyright (C) 1991, 1992 Linus Torvalds diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index d647dabdac97..5d64fff384c8 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -1,6 +1,4 @@ /* - * kernel/time/timer_list.c - * * List pending timers * * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar -- cgit From 35728b8209ee7d25b6241a56304ee926469bd154 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Oct 2018 19:21:09 +0100 Subject: time: Add SPDX license identifiers Update the time(r) core files files with the correct SPDX license identifier based on the license text in the file itself. The SPDX identifier is a legally binding shorthand, which can be used instead of the full boiler plate text. This work is based on a script and data from Philippe Ombredanne, Kate Stewart and myself. The data has been created with two independent license scanners and manual inspection. The following files do not contain any direct license information and have been omitted from the big initial SPDX changes: timeconst.bc: The .bc files were not touched time.c, timer.c, timekeeping.c: Licence was deduced from EXPORT_SYMBOL_GPL As those files do not contain direct license references they fall under the project license, i.e. GPL V2 only. Signed-off-by: Thomas Gleixner Acked-by: Kees Cook Acked-by: Ingo Molnar Acked-by: John Stultz Acked-by: Corey Minyard Cc: Peter Zijlstra Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Russell King Cc: Richard Cochran Cc: Nicolas Pitre Cc: David Riley Cc: Colin Cross Cc: Mark Brown Cc: H. Peter Anvin Cc: Paul E. McKenney Link: https://lkml.kernel.org/r/20181031182252.879109557@linutronix.de --- include/linux/hrtimer.h | 1 + kernel/time/alarmtimer.c | 1 + kernel/time/clockevents.c | 1 + kernel/time/clocksource.c | 1 + kernel/time/hrtimer.c | 1 + kernel/time/jiffies.c | 1 + kernel/time/posix-clock.c | 1 + kernel/time/posix-stubs.c | 1 + kernel/time/posix-timers.c | 1 + kernel/time/sched_clock.c | 1 + kernel/time/test_udelay.c | 1 + kernel/time/tick-broadcast.c | 1 + kernel/time/tick-common.c | 1 + kernel/time/tick-oneshot.c | 1 + kernel/time/tick-sched.c | 1 + kernel/time/time.c | 1 + kernel/time/timeconst.bc | 2 ++ kernel/time/timeconv.c | 1 + kernel/time/timecounter.c | 1 + kernel/time/timekeeping.c | 1 + kernel/time/timekeeping_debug.c | 1 + kernel/time/timer.c | 1 + kernel/time/timer_list.c | 1 + 23 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 50ebe2ad43e0..851e4231d3ab 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * hrtimers - High-resolution kernel timers * diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index fa5de5e8de61..69070d399d70 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Alarmtimer interface * diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9b8c7c0fd113..0fdbdf17f8a2 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which manage clock event devices. * diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1c5273fbd500..b1abeac5f3f7 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * This file contains the functions which manage clocksource drivers. * diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 223548bb81c6..16dacc8d3ca2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 9c3957fe9317..0deb0be2c445 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /*********************************************************************** * This file contains the jiffies based clocksource. * diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 4959815f4fd7..339e35e4605f 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Support for dynamic clock devices * diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 989ccf028bde..b9f9f6f02e11 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Dummy stubs used when CONFIG_POSIX_TIMERS=n * diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index c72307c119d9..e8cd9aa6c9cf 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * 2002-10-15 Posix Clocks & timers * by George Anzinger george@mvista.com diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index b38b6628f89b..11570ba451cc 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generic sched_clock() support, to extend low level hardware time * counters to full 64-bit ns values. diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index b0928ab3270f..d6a87bb2040f 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * udelay() test kernel module * diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 4f5abde2dfa7..f4725f53d852 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which emulate a local clock-event * device via a broadcast event source. diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7b5008039c2d..455b8d65a2b7 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file contains the base functions to manage periodic tick * related events. diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 77989efe13d2..1c8ad0fb33c0 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which manage high resolution tick * related events. diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb557e56a19f..62ecb2a802ca 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar diff --git a/kernel/time/time.c b/kernel/time/time.c index 13ffa9950ffc..5aa0a156e331 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc index f83bbb81600b..7ed0e0fb5831 100644 --- a/kernel/time/timeconst.bc +++ b/kernel/time/timeconst.bc @@ -1,3 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + scale=0 define gcd(a,b) { diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 7142580ad94f..589e0a552129 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.0+ /* * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. * This file is part of the GNU C Library. diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index 400f3456d564..933462326489 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Based on clocksource code. See commit 74d23cc704d1 * diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 30fdf48f50c2..cd02bd38cf2d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Kernel timekeeping code and accessor functions. Based on code from * timer.c, moved in commit 8524070b7982. diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 238e4be60229..d06f09209fb7 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * debugfs file to track time spent in suspend * diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2f248bbedb4a..444156debfa0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Kernel internal timers * diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 5d64fff384c8..f81693cdf981 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * List pending timers * -- cgit From f49c174b5f431db9fa17315269e288d4548b651c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Oct 2018 19:21:10 +0100 Subject: hrtimers/tick/clockevents: Remove sloppy license references "For licencing details see kernel-base/COPYING" and similar license references have no value over the SPDX identifier. Remove them. Signed-off-by: Thomas Gleixner Acked-by: Kees Cook Acked-by: Ingo Molnar Acked-by: John Stultz Acked-by: Corey Minyard Cc: Peter Zijlstra Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Peter Anvin Cc: Russell King Cc: Richard Cochran Cc: "Paul E. McKenney" Cc: Nicolas Pitre Cc: David Riley Cc: Colin Cross Cc: Mark Brown Link: https://lkml.kernel.org/r/20181031182252.963632760@linutronix.de --- include/linux/hrtimer.h | 2 -- kernel/time/clockevents.c | 3 --- kernel/time/hrtimer.c | 2 -- kernel/time/tick-broadcast.c | 3 --- kernel/time/tick-common.c | 3 --- kernel/time/tick-oneshot.c | 3 --- kernel/time/tick-sched.c | 2 -- kernel/time/timer_list.c | 4 ---- 8 files changed, 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 851e4231d3ab..2e8957eac4d4 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -8,8 +8,6 @@ * data type definitions, declarations, prototypes * * Started by: Thomas Gleixner and Ingo Molnar - * - * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_HRTIMER_H #define _LINUX_HRTIMER_H diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0fdbdf17f8a2..5e77662dd2d9 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -5,9 +5,6 @@ * Copyright(C) 2005-2006, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. */ #include diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 16dacc8d3ca2..f5cfa1b73d6f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -20,8 +20,6 @@ * * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel * et. al. - * - * For licencing details see kernel-base/COPYING */ #include diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f4725f53d852..803fa67aace9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -6,9 +6,6 @@ * Copyright(C) 2005-2006, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. */ #include #include diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 455b8d65a2b7..529143b4c8d2 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -6,9 +6,6 @@ * Copyright(C) 2005-2006, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. */ #include #include diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 1c8ad0fb33c0..f9745d47425a 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -6,9 +6,6 @@ * Copyright(C) 2005-2006, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. */ #include #include diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 62ecb2a802ca..6fa52cd6df0b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -7,8 +7,6 @@ * No idle tick implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar - * - * Distribute under GPLv2. */ #include #include diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index f81693cdf981..98ba50dcb1b2 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -3,10 +3,6 @@ * List pending timers * * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include -- cgit From 9281a7857b91cf4d283be7c86d80e5d091bfb3d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Oct 2018 19:21:11 +0100 Subject: time/debug: Remove license boilerplate The SPDX identifier is enough. Remove the license boilerplate. Signed-off-by: Thomas Gleixner Acked-by: Kees Cook Acked-by: Ingo Molnar Acked-by: John Stultz Acked-by: Corey Minyard Cc: Peter Zijlstra Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Peter Anvin Cc: Russell King Cc: Richard Cochran Cc: "Paul E. McKenney" Cc: Nicolas Pitre Cc: David Riley Cc: Colin Cross Cc: Mark Brown Link: https://lkml.kernel.org/r/20181031182253.047449481@linutronix.de --- kernel/time/test_udelay.c | 9 --------- kernel/time/timekeeping_debug.c | 10 ---------- 2 files changed, 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index d6a87bb2040f..77c63005dc4e 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -8,15 +8,6 @@ * Specifying usecs of 0 or negative values will run multiples tests. * * Copyright (C) 2014 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #include diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index d06f09209fb7..f811882cfd13 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -3,16 +3,6 @@ * debugfs file to track time spent in suspend * * Copyright (c) 2011, Google, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include -- cgit From 6c7811c628a96fe00965622f3aa1cf272cb898f7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Oct 2018 19:21:12 +0100 Subject: time: Remove license boilerplate The SPDX identifier defines the license of the files already. No need for the boilerplates. Signed-off-by: Thomas Gleixner Acked-by: Kees Cook Acked-by: Ingo Molnar Acked-by: John Stultz Acked-by: Corey Minyard Acked-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Peter Anvin Cc: Russell King Cc: Richard Cochran Cc: Nicolas Pitre Cc: David Riley Cc: Colin Cross Cc: Mark Brown Cc: Paul E. McKenney Link: https://lkml.kernel.org/r/20181031182253.132458951@linutronix.de --- kernel/time/alarmtimer.c | 4 ---- kernel/time/clocksource.c | 14 -------------- kernel/time/jiffies.c | 25 +++++-------------------- kernel/time/timecounter.c | 10 ---------- 4 files changed, 5 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 69070d399d70..2c97e8c2d29f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -11,10 +11,6 @@ * Copyright (C) 2010 IBM Corperation * * Author: John Stultz - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b1abeac5f3f7..3bcc19ceb073 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -3,20 +3,6 @@ * This file contains the functions which manage clocksource drivers. * * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 0deb0be2c445..dc1b6f1929f9 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -1,24 +1,9 @@ // SPDX-License-Identifier: GPL-2.0+ -/*********************************************************************** -* This file contains the jiffies based clocksource. -* -* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) -* -* This program is free software; you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation; either version 2 of the License, or -* (at your option) any later version. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. -* -* You should have received a copy of the GNU General Public License -* along with this program; if not, write to the Free Software -* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -* -************************************************************************/ +/* + * This file contains the jiffies based clocksource. + * + * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + */ #include #include #include diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index 933462326489..85b98e727306 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -1,16 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* * Based on clocksource code. See commit 74d23cc704d1 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #include #include -- cgit From 3c8f2515ac0a986c8af6ba17c376f874306e71f4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Oct 2018 19:21:13 +0100 Subject: posix-timers/stubs: Remove license boilerplate The SPDX identifier defines the license of the file already. No need for the boilerplate. Signed-off-by: Thomas Gleixner Acked-by: Nicolas Pitre Acked-by: Kees Cook Acked-by: Ingo Molnar Acked-by: John Stultz Acked-by: Corey Minyard Cc: Peter Zijlstra Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Peter Anvin Cc: Russell King Cc: Richard Cochran Cc: "Paul E. McKenney" Cc: David Riley Cc: Colin Cross Cc: Mark Brown Cc: Arnd Bergmann Link: https://lkml.kernel.org/r/20181031182253.215825217@linutronix.de --- kernel/time/posix-stubs.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index b9f9f6f02e11..a51895486e5e 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -4,10 +4,6 @@ * * Created by: Nicolas Pitre, July 2016 * Copyright: (C) 2016 Linaro Limited - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include -- cgit From 2fa6d420c22268b30dc46c8cbfe8bec0e361e03e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Oct 2018 19:21:14 +0100 Subject: sched/clock: Remove license boilerplate The SPDX identifier defines the license of the file already. No need for the boilerplate. Signed-off-by: Thomas Gleixner Acked-by: Kees Cook Acked-by: Ingo Molnar Acked-by: John Stultz Acked-by: Corey Minyard Cc: Peter Zijlstra Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Peter Anvin Cc: Russell King Cc: Richard Cochran Cc: "Paul E. McKenney" Cc: Nicolas Pitre Cc: David Riley Cc: Colin Cross Cc: Mark Brown Link: https://lkml.kernel.org/r/20181031182253.300140921@linutronix.de --- kernel/time/sched_clock.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 11570ba451cc..094b82ca95e5 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -2,10 +2,6 @@ /* * Generic sched_clock() support, to extend low level hardware time * counters to full 64-bit ns values. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include -- cgit From c804efeb58229e6040b9a200cbab1fc8c150f99d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Oct 2018 19:21:15 +0100 Subject: posix-clocks: Remove license boiler plate The SPDX identifier defines the license of the file already. No need for the boilerplate. Signed-off-by: Thomas Gleixner Acked-by: Richard Cochran Acked-by: Kees Cook Acked-by: Ingo Molnar Acked-by: Manfred Rudigier Acked-by: John Stultz Acked-by: Corey Minyard Cc: Peter Zijlstra Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Peter Anvin Cc: Russell King Cc: "Paul E. McKenney" Cc: Nicolas Pitre Cc: David Riley Cc: Colin Cross Cc: Mark Brown Link: https://lkml.kernel.org/r/20181031182253.385909804@linutronix.de --- kernel/time/posix-clock.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 339e35e4605f..425bbfce6819 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -3,20 +3,6 @@ * Support for dynamic clock devices * * Copyright (C) 2010 OMICRON electronics GmbH - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include #include -- cgit From 0141de741e0710d5e2e68087577329606f59ed71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Oct 2018 19:21:16 +0100 Subject: posix-timers: Remove license boilerplate The SPDX identifier defines the license of the file already. No need for the boilerplate. Remove also the completely outdated Montavista snail mail address. Signed-off-by: Thomas Gleixner Acked-by: Kees Cook Acked-by: Ingo Molnar Acked-by: John Stultz Acked-by: Corey Minyard Cc: Peter Zijlstra Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Peter Anvin Cc: Russell King Cc: Richard Cochran Cc: "Paul E. McKenney" Cc: Nicolas Pitre Cc: David Riley Cc: Colin Cross Cc: Mark Brown Link: https://lkml.kernel.org/r/20181031182253.479792883@linutronix.de --- kernel/time/posix-timers.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index e8cd9aa6c9cf..dd70ced15a36 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -7,25 +7,7 @@ * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. * Copyright (C) 2004 Boris Hu * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA - */ - -/* These are all the functions necessary to implement - * POSIX clocks & timers + * These are all the functions necessary to implement POSIX clocks & timers */ #include #include -- cgit From cf0dd411e80f7066cabf69899724e48dd3192b99 Mon Sep 17 00:00:00 2001 From: Rustam Kovhaev Date: Fri, 23 Nov 2018 15:48:16 -0800 Subject: bpf, tags: Fix DEFINE_PER_CPU expansion Building tags produces warning: ctags: Warning: kernel/bpf/local_storage.c:10: null expansion of name pattern "\1" Let's use the same fix as in commit 25528213fe9f ("tags: Fix DEFINE_PER_CPU expansions"), even though it violates the usual code style. Signed-off-by: Rustam Kovhaev Signed-off-by: Daniel Borkmann --- kernel/bpf/local_storage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index c97a8f968638..9e94b1cc6cf2 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,8 +7,7 @@ #include #include -DEFINE_PER_CPU(struct bpf_cgroup_storage*, - bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF -- cgit From 311fe1a813324ea6d8172a3e9eefb1b274c72fea Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 25 Nov 2018 23:32:51 +0000 Subject: bpf: btf: fix spelling mistake "Memmber" -> "Member" There is a spelling mistake in a btf_verifier_log_member message, fix it. Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 69da9169819a..a09b2f94ab25 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1621,7 +1621,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { btf_verifier_log_member(env, t, member, - "Memmber bits_offset exceeds its struct size"); + "Member bits_offset exceeds its struct size"); return -EINVAL; } -- cgit From d0a3f18a70f2d9700bf9f5e9c4a505472388a7c1 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 2 Aug 2018 17:56:50 -0400 Subject: audit: minimize our use of audit_log_format() There are some cases where we are making multiple audit_log_format() calls in a row, for no apparent reason. Squash these down to a single audit_log_format() call whenever possible. Acked-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 11 +++++------ kernel/audit_fsnotify.c | 3 +-- kernel/audit_tree.c | 3 +-- kernel/audit_watch.c | 3 +-- kernel/auditsc.c | 7 +++---- 5 files changed, 11 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6c53e373b828..d09298d3c2d2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2177,22 +2177,21 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } /* log the audit_names record type */ - audit_log_format(ab, " nametype="); switch(n->type) { case AUDIT_TYPE_NORMAL: - audit_log_format(ab, "NORMAL"); + audit_log_format(ab, " nametype=NORMAL"); break; case AUDIT_TYPE_PARENT: - audit_log_format(ab, "PARENT"); + audit_log_format(ab, " nametype=PARENT"); break; case AUDIT_TYPE_CHILD_DELETE: - audit_log_format(ab, "DELETE"); + audit_log_format(ab, " nametype=DELETE"); break; case AUDIT_TYPE_CHILD_CREATE: - audit_log_format(ab, "CREATE"); + audit_log_format(ab, " nametype=CREATE"); break; default: - audit_log_format(ab, "UNKNOWN"); + audit_log_format(ab, " nametype=UNKNOWN"); break; } diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index f90ffa699e5b..cf4512a33675 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -131,8 +131,7 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c if (unlikely(!ab)) return; audit_log_session_info(ab); - audit_log_format(ab, " op=%s", op); - audit_log_format(ab, " path="); + audit_log_format(ab, " op=%s path=", op); audit_log_untrustedstring(ab, audit_mark->path); audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=1", rule->listnr); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 58e84eb5d826..d4af4d97f847 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -533,8 +533,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "op=remove_rule"); - audit_log_format(ab, " dir="); + audit_log_format(ab, "op=remove_rule dir="); audit_log_untrustedstring(ab, rule->tree->pathname); audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=1", rule->listnr); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 568e48d1d0ab..20ef9ba134b0 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -246,8 +246,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc if (!ab) return; audit_log_session_info(ab); - audit_log_format(ab, "op=%s", op); - audit_log_format(ab, " path="); + audit_log_format(ab, "op=%s path=", op); audit_log_untrustedstring(ab, w->path); audit_log_key(ab, r->filterkey); audit_log_format(ab, " list=%d res=1", r->listnr); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 605f2d825204..51e735aedf58 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2503,10 +2503,9 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names, if (unlikely(!ab)) return; - audit_log_format(ab, "op=seccomp-logging"); - audit_log_format(ab, " actions=%s", names); - audit_log_format(ab, " old-actions=%s", old_names); - audit_log_format(ab, " res=%d", res); + audit_log_format(ab, + "op=seccomp-logging actions=%s old-actions=%s res=%d", + names, old_names, res); audit_log_end(ab); } -- cgit From 2a1fe215e7300c7ebd6a7a24afcab71db5107bb0 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 26 Nov 2018 18:40:07 -0500 Subject: audit: use current whenever possible There are many places, notably audit_log_task_info() and audit_log_exit(), that take task_struct pointers but in reality they are always working on the current task. This patch eliminates the task_struct arguments and uses current directly which allows a number of cleanups as well. Acked-by: Richard Guy Briggs Signed-off-by: Paul Moore --- drivers/tty/tty_audit.c | 13 ++-- include/linux/audit.h | 6 +- kernel/audit.c | 34 +++++----- kernel/audit.h | 2 +- kernel/auditsc.c | 131 +++++++++++++++++++-------------------- security/integrity/ima/ima_api.c | 2 +- 6 files changed, 90 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/tty_audit.c b/drivers/tty/tty_audit.c index 50f567b6a66e..28f87fd6a28e 100644 --- a/drivers/tty/tty_audit.c +++ b/drivers/tty/tty_audit.c @@ -61,20 +61,19 @@ static void tty_audit_log(const char *description, dev_t dev, unsigned char *data, size_t size) { struct audit_buffer *ab; - struct task_struct *tsk = current; - pid_t pid = task_pid_nr(tsk); - uid_t uid = from_kuid(&init_user_ns, task_uid(tsk)); - uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(tsk)); - unsigned int sessionid = audit_get_sessionid(tsk); + pid_t pid = task_pid_nr(current); + uid_t uid = from_kuid(&init_user_ns, task_uid(current)); + uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); + unsigned int sessionid = audit_get_sessionid(current); ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_TTY); if (ab) { - char name[sizeof(tsk->comm)]; + char name[sizeof(current->comm)]; audit_log_format(ab, "%s pid=%u uid=%u auid=%u ses=%u major=%d" " minor=%d comm=", description, pid, uid, loginuid, sessionid, MAJOR(dev), MINOR(dev)); - get_task_comm(name, tsk); + get_task_comm(name, current); audit_log_untrustedstring(ab, name); audit_log_format(ab, " data="); audit_log_n_hex(ab, data, size); diff --git a/include/linux/audit.h b/include/linux/audit.h index 58cf665f597e..a625c29a2ea2 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -151,8 +151,7 @@ extern void audit_log_link_denied(const char *operation); extern void audit_log_lost(const char *message); extern int audit_log_task_context(struct audit_buffer *ab); -extern void audit_log_task_info(struct audit_buffer *ab, - struct task_struct *tsk); +extern void audit_log_task_info(struct audit_buffer *ab); extern int audit_update_lsm_rules(void); @@ -200,8 +199,7 @@ static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; } -static inline void audit_log_task_info(struct audit_buffer *ab, - struct task_struct *tsk) +static inline void audit_log_task_info(struct audit_buffer *ab) { } #define audit_enabled AUDIT_OFF #endif /* CONFIG_AUDIT */ diff --git a/kernel/audit.c b/kernel/audit.c index d09298d3c2d2..779671883349 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1096,10 +1096,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature if (audit_enabled == AUDIT_OFF) return; + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE); if (!ab) return; - audit_log_task_info(ab, current); + audit_log_task_info(ab); audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, !!old_lock, !!new_lock, res); @@ -2246,15 +2247,15 @@ out_null: audit_log_format(ab, " exe=(null)"); } -struct tty_struct *audit_get_tty(struct task_struct *tsk) +struct tty_struct *audit_get_tty(void) { struct tty_struct *tty = NULL; unsigned long flags; - spin_lock_irqsave(&tsk->sighand->siglock, flags); - if (tsk->signal) - tty = tty_kref_get(tsk->signal->tty); - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + spin_lock_irqsave(¤t->sighand->siglock, flags); + if (current->signal) + tty = tty_kref_get(current->signal->tty); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); return tty; } @@ -2263,25 +2264,24 @@ void audit_put_tty(struct tty_struct *tty) tty_kref_put(tty); } -void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) +void audit_log_task_info(struct audit_buffer *ab) { const struct cred *cred; - char comm[sizeof(tsk->comm)]; + char comm[sizeof(current->comm)]; struct tty_struct *tty; if (!ab) return; - /* tsk == current */ cred = current_cred(); - tty = audit_get_tty(tsk); + tty = audit_get_tty(); audit_log_format(ab, " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", - task_ppid_nr(tsk), - task_tgid_nr(tsk), - from_kuid(&init_user_ns, audit_get_loginuid(tsk)), + task_ppid_nr(current), + task_tgid_nr(current), + from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), from_kuid(&init_user_ns, cred->euid), @@ -2291,11 +2291,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), tty ? tty_name(tty) : "(none)", - audit_get_sessionid(tsk)); + audit_get_sessionid(current)); audit_put_tty(tty); audit_log_format(ab, " comm="); - audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); - audit_log_d_path_exe(ab, tsk->mm); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); + audit_log_d_path_exe(ab, current->mm); audit_log_task_context(ab); } EXPORT_SYMBOL(audit_log_task_info); @@ -2316,7 +2316,7 @@ void audit_log_link_denied(const char *operation) if (!ab) return; audit_log_format(ab, "op=%s", operation); - audit_log_task_info(ab, current); + audit_log_task_info(ab); audit_log_format(ab, " res=0"); audit_log_end(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index 0b5295aeaebb..91421679a168 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -264,7 +264,7 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); extern void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm); -extern struct tty_struct *audit_get_tty(struct task_struct *tsk); +extern struct tty_struct *audit_get_tty(void); extern void audit_put_tty(struct tty_struct *tty); /* audit watch functions */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 51e735aedf58..6593a5207fb0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -830,44 +830,6 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) rcu_read_unlock(); } -/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */ -static inline struct audit_context *audit_take_context(struct task_struct *tsk, - int return_valid, - long return_code) -{ - struct audit_context *context = tsk->audit_context; - - if (!context) - return NULL; - context->return_valid = return_valid; - - /* - * we need to fix up the return code in the audit logs if the actual - * return codes are later going to be fixed up by the arch specific - * signal handlers - * - * This is actually a test for: - * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) || - * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK) - * - * but is faster than a bunch of || - */ - if (unlikely(return_code <= -ERESTARTSYS) && - (return_code >= -ERESTART_RESTARTBLOCK) && - (return_code != -ENOIOCTLCMD)) - context->return_code = -EINTR; - else - context->return_code = return_code; - - if (context->in_syscall && !context->dummy) { - audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); - audit_filter_inodes(tsk, context); - } - - audit_set_context(tsk, NULL); - return context; -} - static inline void audit_proctitle_free(struct audit_context *context) { kfree(context->proctitle.value); @@ -1296,15 +1258,18 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len) return len; } -static void audit_log_proctitle(struct task_struct *tsk, - struct audit_context *context) +static void audit_log_proctitle(void) { int res; char *buf; char *msg = "(null)"; int len = strlen(msg); + struct audit_context *context = audit_context(); struct audit_buffer *ab; + if (!context || context->dummy) + return; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ @@ -1317,7 +1282,7 @@ static void audit_log_proctitle(struct task_struct *tsk, if (!buf) goto out; /* Historically called this from procfs naming */ - res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN); + res = get_cmdline(current, buf, MAX_PROCTITLE_AUDIT_LEN); if (res == 0) { kfree(buf); goto out; @@ -1337,15 +1302,15 @@ out: audit_log_end(ab); } -static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) +static void audit_log_exit(void) { int i, call_panic = 0; + struct audit_context *context = audit_context(); struct audit_buffer *ab; struct audit_aux_data *aux; struct audit_names *n; - /* tsk == current */ - context->personality = tsk->personality; + context->personality = current->personality; ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); if (!ab) @@ -1367,7 +1332,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->argv[3], context->name_count); - audit_log_task_info(ab, tsk); + audit_log_task_info(ab); audit_log_key(ab, context->filterkey); audit_log_end(ab); @@ -1456,7 +1421,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_name(context, n, NULL, i++, &call_panic); } - audit_log_proctitle(tsk, context); + audit_log_proctitle(); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1474,22 +1439,31 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts */ void __audit_free(struct task_struct *tsk) { - struct audit_context *context; + struct audit_context *context = tsk->audit_context; - context = audit_take_context(tsk, 0, 0); if (!context) return; - /* Check for system calls that do not go through the exit - * function (e.g., exit_group), then free context block. - * We use GFP_ATOMIC here because we might be doing this - * in the context of the idle thread */ - /* that can happen only if we are called from do_exit() */ - if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) - audit_log_exit(context, tsk); + /* We are called either by do_exit() or the fork() error handling code; + * in the former case tsk == current and in the latter tsk is a + * random task_struct that doesn't doesn't have any meaningful data we + * need to log via audit_log_exit(). + */ + if (tsk == current && !context->dummy && context->in_syscall) { + context->return_valid = 0; + context->return_code = 0; + + audit_filter_syscall(tsk, context, + &audit_filter_list[AUDIT_FILTER_EXIT]); + audit_filter_inodes(tsk, context); + if (context->current_state == AUDIT_RECORD_CONTEXT) + audit_log_exit(); + } + if (!list_empty(&context->killed_trees)) audit_kill_trees(&context->killed_trees); + audit_set_context(tsk, NULL); audit_free_context(context); } @@ -1559,17 +1533,40 @@ void __audit_syscall_exit(int success, long return_code) { struct audit_context *context; - if (success) - success = AUDITSC_SUCCESS; - else - success = AUDITSC_FAILURE; - - context = audit_take_context(current, success, return_code); + context = audit_context(); if (!context) return; - if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) - audit_log_exit(context, current); + if (!context->dummy && context->in_syscall) { + if (success) + context->return_valid = AUDITSC_SUCCESS; + else + context->return_valid = AUDITSC_FAILURE; + + /* + * we need to fix up the return code in the audit logs if the + * actual return codes are later going to be fixed up by the + * arch specific signal handlers + * + * This is actually a test for: + * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) || + * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK) + * + * but is faster than a bunch of || + */ + if (unlikely(return_code <= -ERESTARTSYS) && + (return_code >= -ERESTART_RESTARTBLOCK) && + (return_code != -ENOIOCTLCMD)) + context->return_code = -EINTR; + else + context->return_code = return_code; + + audit_filter_syscall(current, context, + &audit_filter_list[AUDIT_FILTER_EXIT]); + audit_filter_inodes(current, context); + if (context->current_state == AUDIT_RECORD_CONTEXT) + audit_log_exit(); + } context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; @@ -1591,7 +1588,6 @@ void __audit_syscall_exit(int success, long return_code) kfree(context->filterkey); context->filterkey = NULL; } - audit_set_context(current, context); } static inline void handle_one(const struct inode *inode) @@ -2025,7 +2021,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid), - tty = audit_get_tty(current); + tty = audit_get_tty(); audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); @@ -2046,7 +2042,6 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, */ int audit_set_loginuid(kuid_t loginuid) { - struct task_struct *task = current; unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; kuid_t oldloginuid; int rc; @@ -2065,8 +2060,8 @@ int audit_set_loginuid(kuid_t loginuid) sessionid = (unsigned int)atomic_inc_return(&session_id); } - task->sessionid = sessionid; - task->loginuid = loginuid; + current->sessionid = sessionid; + current->loginuid = loginuid; out: audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); return rc; diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index 99dd1d53fc35..af134588ab4e 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -336,7 +336,7 @@ void ima_audit_measurement(struct integrity_iint_cache *iint, audit_log_untrustedstring(ab, filename); audit_log_format(ab, " hash=\"%s:%s\"", algo_name, hash); - audit_log_task_info(ab, current); + audit_log_task_info(ab); audit_log_end(ab); iint->flags |= IMA_AUDITED; -- cgit From ba64e7d8525236aa56ab58ba3a3a71615c4ee289 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 24 Nov 2018 23:20:44 -0800 Subject: bpf: btf: support proper non-jit func info Commit 838e96904ff3 ("bpf: Introduce bpf_func_info") added bpf func info support. The userspace is able to get better ksym's for bpf programs with jit, and is able to print out func prototypes. For a program containing func-to-func calls, the existing implementation returns user specified number of function calls and BTF types if jit is enabled. If the jit is not enabled, it only returns the type for the main function. This is undesirable. Interpreter may still be used and we should keep feature identical regardless of whether jit is enabled or not. This patch fixed this discrepancy. Fixes: 838e96904ff3 ("bpf: Introduce bpf_func_info") Signed-off-by: Yonghong Song Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++-- include/linux/bpf_verifier.h | 1 - kernel/bpf/core.c | 3 ++- kernel/bpf/syscall.c | 33 +++++++------------------- kernel/bpf/verifier.c | 55 ++++++++++++++++++++++++++++++-------------- 5 files changed, 52 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7f0e225bf630..e82b7039fc66 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -299,7 +299,8 @@ struct bpf_prog_aux { u32 max_pkt_offset; u32 stack_depth; u32 id; - u32 func_cnt; + u32 func_cnt; /* used by non-func prog as the number of func progs */ + u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ bool offload_requested; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ @@ -317,7 +318,8 @@ struct bpf_prog_aux { #endif struct bpf_prog_offload *offload; struct btf *btf; - u32 type_id; /* type id for this prog/func */ + struct bpf_func_info *func_info; + u32 func_info_cnt; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 204382f46fd8..11f5df1092d9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -204,7 +204,6 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u16 stack_depth; /* max. stack depth used by this function */ - u32 type_id; /* btf type_id for this subprog */ }; /* single container for all structs diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 16d77012ad3e..002d67c62c8b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -411,7 +411,8 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) /* prog->aux->name will be ignored if full btf name is available */ if (prog->aux->btf) { - type = btf_type_by_id(prog->aux->btf, prog->aux->type_id); + type = btf_type_by_id(prog->aux->btf, + prog->aux->func_info[prog->aux->func_idx].type_id); func_name = btf_name_by_offset(prog->aux->btf, type->name_off); snprintf(sym, (size_t)(end - sym), "_%s", func_name); return; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 998377808102..85cbeec06e50 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1214,6 +1214,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); + kvfree(prog->aux->func_info); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -2219,46 +2220,28 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } if (prog->aux->btf) { + u32 krec_size = sizeof(struct bpf_func_info); u32 ucnt, urec_size; info.btf_id = btf_id(prog->aux->btf); ucnt = info.func_info_cnt; - info.func_info_cnt = prog->aux->func_cnt ? : 1; + info.func_info_cnt = prog->aux->func_info_cnt; urec_size = info.func_info_rec_size; - info.func_info_rec_size = sizeof(struct bpf_func_info); + info.func_info_rec_size = krec_size; if (ucnt) { /* expect passed-in urec_size is what the kernel expects */ if (urec_size != info.func_info_rec_size) return -EINVAL; if (bpf_dump_raw_ok()) { - struct bpf_func_info kern_finfo; char __user *user_finfo; - u32 i, insn_offset; user_finfo = u64_to_user_ptr(info.func_info); - if (prog->aux->func_cnt) { - ucnt = min_t(u32, info.func_info_cnt, ucnt); - insn_offset = 0; - for (i = 0; i < ucnt; i++) { - kern_finfo.insn_offset = insn_offset; - kern_finfo.type_id = prog->aux->func[i]->aux->type_id; - if (copy_to_user(user_finfo, &kern_finfo, - sizeof(kern_finfo))) - return -EFAULT; - - /* func[i]->len holds the prog len */ - insn_offset += prog->aux->func[i]->len; - user_finfo += urec_size; - } - } else { - kern_finfo.insn_offset = 0; - kern_finfo.type_id = prog->aux->type_id; - if (copy_to_user(user_finfo, &kern_finfo, - sizeof(kern_finfo))) - return -EFAULT; - } + ucnt = min_t(u32, info.func_info_cnt, ucnt); + if (copy_to_user(user_finfo, prog->aux->func_info, + krec_size * ucnt)) + return -EFAULT; } else { info.func_info_cnt = 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f102c4fd0c5a..05d95c0e4a26 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4650,7 +4650,7 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, { u32 i, nfuncs, urec_size, min_size, prev_offset; u32 krec_size = sizeof(struct bpf_func_info); - struct bpf_func_info krecord = {}; + struct bpf_func_info *krecord = NULL; const struct btf_type *type; void __user *urecord; struct btf *btf; @@ -4682,6 +4682,12 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, urecord = u64_to_user_ptr(attr->func_info); min_size = min_t(u32, krec_size, urec_size); + krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); + if (!krecord) { + ret = -ENOMEM; + goto free_btf; + } + for (i = 0; i < nfuncs; i++) { ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); if (ret) { @@ -4696,59 +4702,69 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, goto free_btf; } - if (copy_from_user(&krecord, urecord, min_size)) { + if (copy_from_user(&krecord[i], urecord, min_size)) { ret = -EFAULT; goto free_btf; } /* check insn_offset */ if (i == 0) { - if (krecord.insn_offset) { + if (krecord[i].insn_offset) { verbose(env, "nonzero insn_offset %u for the first func info record", - krecord.insn_offset); + krecord[i].insn_offset); ret = -EINVAL; goto free_btf; } - } else if (krecord.insn_offset <= prev_offset) { + } else if (krecord[i].insn_offset <= prev_offset) { verbose(env, "same or smaller insn offset (%u) than previous func info record (%u)", - krecord.insn_offset, prev_offset); + krecord[i].insn_offset, prev_offset); ret = -EINVAL; goto free_btf; } - if (env->subprog_info[i].start != krecord.insn_offset) { + if (env->subprog_info[i].start != krecord[i].insn_offset) { verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); ret = -EINVAL; goto free_btf; } /* check type_id */ - type = btf_type_by_id(btf, krecord.type_id); + type = btf_type_by_id(btf, krecord[i].type_id); if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) { verbose(env, "invalid type id %d in func info", - krecord.type_id); + krecord[i].type_id); ret = -EINVAL; goto free_btf; } - if (i == 0) - prog->aux->type_id = krecord.type_id; - env->subprog_info[i].type_id = krecord.type_id; - - prev_offset = krecord.insn_offset; + prev_offset = krecord[i].insn_offset; urecord += urec_size; } prog->aux->btf = btf; + prog->aux->func_info = krecord; + prog->aux->func_info_cnt = nfuncs; return 0; free_btf: btf_put(btf); + kvfree(krecord); return ret; } +static void adjust_btf_func(struct bpf_verifier_env *env) +{ + int i; + + if (!env->prog->aux->func_info) + return; + + for (i = 0; i < env->subprog_cnt; i++) + env->prog->aux->func_info[i].insn_offset = env->subprog_info[i].start; +} + /* check %cur's range satisfies %old's */ static bool range_within(struct bpf_reg_state *old, struct bpf_reg_state *cur) @@ -6043,15 +6059,17 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (bpf_prog_calc_tag(func[i])) goto out_free; func[i]->is_func = 1; + func[i]->aux->func_idx = i; + /* the btf and func_info will be freed only at prog->aux */ + func[i]->aux->btf = prog->aux->btf; + func[i]->aux->func_info = prog->aux->func_info; + /* Use bpf_prog_F_tag to indicate functions in stack traces. * Long term would need debug info to populate names */ func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; - /* the btf will be freed only at prog->aux */ - func[i]->aux->btf = prog->aux->btf; - func[i]->aux->type_id = env->subprog_info[i].type_id; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -6572,6 +6590,9 @@ skip_full_check: convert_pseudo_ld_imm64(env); } + if (ret == 0) + adjust_btf_func(env); + err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release -- cgit From 7440172974e85b1828bdd84ac6b23b5bcad9c5eb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 18:44:52 -0800 Subject: tracing: Replace synchronize_sched() and call_rcu_sched() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). Similarly, call_rcu_sched() can be replaced by call_rcu(). This commit therefore makes these changes. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Acked-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 2 +- kernel/trace/ftrace.c | 24 ++++++++++++------------ kernel/trace/ring_buffer.c | 12 ++++++------ kernel/trace/trace.c | 10 +++++----- kernel/trace/trace_events_filter.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- kernel/tracepoint.c | 4 ++-- 7 files changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 538ba1a58f5b..432080b59c26 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -82,7 +82,7 @@ int unregister_tracepoint_module_notifier(struct notifier_block *nb) static inline void tracepoint_synchronize_unregister(void) { synchronize_srcu(&tracepoint_srcu); - synchronize_sched(); + synchronize_rcu(); } #else static inline void tracepoint_synchronize_unregister(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f536f601bd46..5b4f73e4fd56 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -173,7 +173,7 @@ static void ftrace_sync(struct work_struct *work) { /* * This function is just a stub to implement a hard force - * of synchronize_sched(). This requires synchronizing + * of synchronize_rcu(). This requires synchronizing * tasks even in userspace and idle. * * Yes, function tracing is rude. @@ -934,7 +934,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, ftrace_profile_enabled = 0; /* * unregister_ftrace_profiler calls stop_machine - * so this acts like an synchronize_sched. + * so this acts like an synchronize_rcu. */ unregister_ftrace_profiler(); } @@ -1086,7 +1086,7 @@ struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) /* * Some of the ops may be dynamically allocated, - * they are freed after a synchronize_sched(). + * they are freed after a synchronize_rcu(). */ preempt_disable_notrace(); @@ -1286,7 +1286,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) return; - call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); + call_rcu(&hash->rcu, __free_ftrace_hash_rcu); } void ftrace_free_filter(struct ftrace_ops *ops) @@ -1501,7 +1501,7 @@ static bool hash_contains_ip(unsigned long ip, * the ip is not in the ops->notrace_hash. * * This needs to be called with preemption disabled as - * the hashes are freed with call_rcu_sched(). + * the hashes are freed with call_rcu(). */ static int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) @@ -4496,7 +4496,7 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, if (ftrace_enabled && !ftrace_hash_empty(hash)) ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS, &old_hash_ops); - synchronize_sched(); + synchronize_rcu(); hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { hlist_del(&entry->hlist); @@ -5314,7 +5314,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) mutex_unlock(&graph_lock); /* Wait till all users are no longer using the old hash */ - synchronize_sched(); + synchronize_rcu(); free_ftrace_hash(old_hash); } @@ -5707,7 +5707,7 @@ void ftrace_release_mod(struct module *mod) list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { if (mod_map->mod == mod) { list_del_rcu(&mod_map->list); - call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map); + call_rcu(&mod_map->rcu, ftrace_free_mod_map); break; } } @@ -5927,7 +5927,7 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, struct ftrace_mod_map *mod_map; const char *ret = NULL; - /* mod_map is freed via call_rcu_sched() */ + /* mod_map is freed via call_rcu() */ preempt_disable(); list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym); @@ -6262,7 +6262,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, /* * Some of the ops may be dynamically allocated, - * they must be freed after a synchronize_sched(). + * they must be freed after a synchronize_rcu(). */ preempt_disable_notrace(); @@ -6433,7 +6433,7 @@ static void clear_ftrace_pids(struct trace_array *tr) rcu_assign_pointer(tr->function_pids, NULL); /* Wait till all users are no longer using pid filtering */ - synchronize_sched(); + synchronize_rcu(); trace_free_pid_list(pid_list); } @@ -6580,7 +6580,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, rcu_assign_pointer(tr->function_pids, pid_list); if (filtered_pids) { - synchronize_sched(); + synchronize_rcu(); trace_free_pid_list(filtered_pids); } else if (pid_list) { /* Register a probe to set whether to ignore the tracing of a task */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 65bd4616220d..4f3247a53259 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1834,7 +1834,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, * There could have been a race between checking * record_disable and incrementing it. */ - synchronize_sched(); + synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; rb_check_pages(cpu_buffer); @@ -3151,7 +3151,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * - * The caller should call synchronize_sched() after this. + * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable(struct ring_buffer *buffer) { @@ -3253,7 +3253,7 @@ bool ring_buffer_record_is_set_on(struct ring_buffer *buffer) * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * - * The caller should call synchronize_sched() after this. + * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { @@ -4191,7 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); void ring_buffer_read_prepare_sync(void) { - synchronize_sched(); + synchronize_rcu(); } EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); @@ -4363,7 +4363,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -4496,7 +4496,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, goto out; /* - * We can't do a synchronize_sched here because this + * We can't do a synchronize_rcu here because this * function can be called in atomic context. * Normally this will be called from the same CPU as cpu. * If not it's up to the caller to protect this. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ff1c4b20cd0a..51612b4a603f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1681,7 +1681,7 @@ void tracing_reset(struct trace_buffer *buf, int cpu) ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); @@ -1698,7 +1698,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf) ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); buf->time_start = buffer_ftrace_now(buf, buf->cpu); @@ -2250,7 +2250,7 @@ void trace_buffered_event_disable(void) preempt_enable(); /* Wait for all current users to finish */ - synchronize_sched(); + synchronize_rcu(); for_each_tracing_cpu(cpu) { free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); @@ -5398,7 +5398,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); - /* Current trace needs to be nop_trace before synchronize_sched */ + /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; #ifdef CONFIG_TRACER_MAX_TRACE @@ -5412,7 +5412,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) * The update_max_tr is called from interrupts disabled * so a synchronized_sched() is sufficient. */ - synchronize_sched(); + synchronize_rcu(); free_snapshot(tr); } #endif diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 84a65173b1e9..35f3aa55be85 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1614,7 +1614,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, /* * The calls can still be using the old filters. - * Do a synchronize_sched() and to ensure all calls are + * Do a synchronize_rcu() and to ensure all calls are * done with them before we free them. */ tracepoint_synchronize_unregister(); @@ -1845,7 +1845,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, if (filter) { /* * No event actually uses the system filter - * we can free it without synchronize_sched(). + * we can free it without synchronize_rcu(). */ __free_filter(system->filter); system->filter = filter; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index fec67188c4d2..adc153ab51c0 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -333,7 +333,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) * event_call related objects, which will be accessed in * the kprobe_trace_func/kretprobe_trace_func. */ - synchronize_sched(); + synchronize_rcu(); kfree(link); /* Ignored if link == NULL */ } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index a3be42304485..46f2ab1e08a9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -92,7 +92,7 @@ static __init int release_early_probes(void) while (early_probes) { tmp = early_probes; early_probes = tmp->next; - call_rcu_sched(tmp, rcu_free_old_probes); + call_rcu(tmp, rcu_free_old_probes); } return 0; @@ -123,7 +123,7 @@ static inline void release_probes(struct tracepoint_func *old) * cover both cases. So let us chain the SRCU and sched RCU * callbacks to wait for both grace periods. */ - call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes); + call_rcu(&tp_probes->rcu, rcu_free_old_probes); } } -- cgit From ae8b7ce7647bc5c56a9a9fc32b03e1cd0ae49629 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:04:39 -0800 Subject: kprobes: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: "Naveen N. Rao" Cc: Anil S Keshavamurthy Cc: "David S. Miller" Acked-by: Masami Hiramatsu --- kernel/kprobes.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 90e98e233647..08e31d863191 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -229,7 +229,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c) struct kprobe_insn_page *kip, *next; /* Ensure no-one is interrupted on the garbages */ - synchronize_sched(); + synchronize_rcu(); list_for_each_entry_safe(kip, next, &c->pages, list) { int i; @@ -1382,7 +1382,7 @@ out: if (ret) { ap->flags |= KPROBE_FLAG_DISABLED; list_del_rcu(&p->list); - synchronize_sched(); + synchronize_rcu(); } } } @@ -1597,7 +1597,7 @@ int register_kprobe(struct kprobe *p) ret = arm_kprobe(p); if (ret) { hlist_del_rcu(&p->hlist); - synchronize_sched(); + synchronize_rcu(); goto out; } } @@ -1776,7 +1776,7 @@ void unregister_kprobes(struct kprobe **kps, int num) kps[i]->addr = NULL; mutex_unlock(&kprobe_mutex); - synchronize_sched(); + synchronize_rcu(); for (i = 0; i < num; i++) if (kps[i]->addr) __unregister_kprobe_bottom(kps[i]); @@ -1966,7 +1966,7 @@ void unregister_kretprobes(struct kretprobe **rps, int num) rps[i]->kp.addr = NULL; mutex_unlock(&kprobe_mutex); - synchronize_sched(); + synchronize_rcu(); for (i = 0; i < num; i++) { if (rps[i]->kp.addr) { __unregister_kprobe_bottom(&rps[i]->kp); -- cgit From 51959d85f32dde9041655936eef206cc3323dc12 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:06:51 -0800 Subject: lockdep: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1efada2dd9dd..ef27f98714c0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4195,7 +4195,7 @@ void lockdep_free_key_range(void *start, unsigned long size) * * sync_sched() is sufficient because the read-side is IRQ disable. */ - synchronize_sched(); + synchronize_rcu(); /* * XXX at this point we could return the resources to the pool; -- cgit From c9a863bbb1620dca3af57934cdbb002e852449fc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:09:14 -0800 Subject: sched/membarrier: synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched/membarrier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 388a7a6c1aa2..3cd8a3a795d2 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -210,7 +210,7 @@ static int membarrier_register_global_expedited(void) * future scheduler executions will observe the new * thread flag state for this mm. */ - synchronize_sched(); + synchronize_rcu(); } atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state); @@ -246,7 +246,7 @@ static int membarrier_register_private_expedited(int flags) * Ensure all future scheduler executions will observe the * new thread flag state for this process. */ - synchronize_sched(); + synchronize_rcu(); } atomic_or(state, &mm->membarrier_state); -- cgit From cb2f55369d3a9e6cf5c34d2da39eb242279a582d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:17:01 -0800 Subject: modules: Replace synchronize_sched() and call_rcu_sched() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). Similarly, call_rcu_sched() can be replaced by call_rcu(). This commit therefore makes these changes. Signed-off-by: Paul E. McKenney Acked-by: Jessica Yu --- kernel/module.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 49a405891587..99b46c32d579 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2159,7 +2159,7 @@ static void free_module(struct module *mod) /* Remove this module from bug list, this uses list_del_rcu */ module_bug_cleanup(mod); /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ - synchronize_sched(); + synchronize_rcu(); mutex_unlock(&module_mutex); /* This may be empty, but that's OK */ @@ -3507,15 +3507,15 @@ static noinline int do_init_module(struct module *mod) /* * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we - * call synchronize_sched(), but we don't want to slow down the success + * call synchronize_rcu(), but we don't want to slow down the success * path, so use actual RCU here. * Note that module_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. Any * code such as mark_rodata_ro() which depends on those mappings to * be cleaned up needs to sync with the queued work - ie - * rcu_barrier_sched() + * rcu_barrier() */ - call_rcu_sched(&freeinit->rcu, do_free_init); + call_rcu(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); wake_up_all(&module_wq); @@ -3526,7 +3526,7 @@ fail_free_freeinit: fail: /* Try to protect us from buggy refcounters. */ mod->state = MODULE_STATE_GOING; - synchronize_sched(); + synchronize_rcu(); module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); @@ -3819,7 +3819,7 @@ static int load_module(struct load_info *info, const char __user *uargs, ddebug_cleanup: ftrace_release_mod(mod); dynamic_debug_remove(mod, info->debug); - synchronize_sched(); + synchronize_rcu(); kfree(mod->args); free_arch_cleanup: module_arch_cleanup(mod); @@ -3834,7 +3834,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod_tree_remove(mod); wake_up_all(&module_wq); /* Wait for RCU-sched synchronizing before releasing mod->list. */ - synchronize_sched(); + synchronize_rcu(); mutex_unlock(&module_mutex); free_module: /* Free lock-classes; relies on the preceding sync_rcu() */ -- cgit From 25b0077511fe7cf1b876174f8481fb1742f4fb4d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:18:45 -0800 Subject: workqueue: Replace call_rcu_sched() with call_rcu() Now that call_rcu()'s callback is not invoked until after all preempt-disable regions of code have completed (in addition to explicitly marked RCU read-side critical sections), call_rcu() can be used in place of call_rcu_sched(). This commit therefore makes that change. Signed-off-by: Paul E. McKenney Cc: Lai Jiangshan Acked-by: Tejun Heo --- kernel/workqueue.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0280deac392e..392be4b252f6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3396,7 +3396,7 @@ static void put_unbound_pool(struct worker_pool *pool) del_timer_sync(&pool->mayday_timer); /* sched-RCU protected to allow dereferences from get_work_pool() */ - call_rcu_sched(&pool->rcu, rcu_free_pool); + call_rcu(&pool->rcu, rcu_free_pool); } /** @@ -3503,14 +3503,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work) put_unbound_pool(pool); mutex_unlock(&wq_pool_mutex); - call_rcu_sched(&pwq->rcu, rcu_free_pwq); + call_rcu(&pwq->rcu, rcu_free_pwq); /* * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Schedule RCU free. */ if (is_last) - call_rcu_sched(&wq->rcu, rcu_free_wq); + call_rcu(&wq->rcu, rcu_free_wq); } /** @@ -4195,7 +4195,7 @@ void destroy_workqueue(struct workqueue_struct *wq) * The base ref is never dropped on per-cpu pwqs. Directly * schedule RCU free. */ - call_rcu_sched(&wq->rcu, rcu_free_wq); + call_rcu(&wq->rcu, rcu_free_wq); } else { /* * We're the sole accessor of @wq at this point. Directly -- cgit From 0809d95451f7d867d37cf2b526b8da923fd72891 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:20:05 -0800 Subject: events: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 84530ab358c3..c4b90cf7734a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9918,7 +9918,7 @@ static void account_event(struct perf_event *event) * call the perf scheduling hooks before proceeding to * install events that need them. */ - synchronize_sched(); + synchronize_rcu(); } /* * Now that we have waited for the sync_sched(), allow further -- cgit From eb4c2382272ae7ae5d81fdfa5b7a6c86146eaaa4 Mon Sep 17 00:00:00 2001 From: Dennis Krein Date: Fri, 26 Oct 2018 07:38:24 -0700 Subject: srcu: Lock srcu_data structure in srcu_gp_start() The srcu_gp_start() function is called with the srcu_struct structure's ->lock held, but not with the srcu_data structure's ->lock. This is problematic because this function accesses and updates the srcu_data structure's ->srcu_cblist, which is protected by that lock. Failing to hold this lock can result in corruption of the SRCU callback lists, which in turn can result in arbitrarily bad results. This commit therefore makes srcu_gp_start() acquire the srcu_data structure's ->lock across the calls to rcu_segcblist_advance() and rcu_segcblist_accelerate(), thus preventing this corruption. Reported-by: Bart Van Assche Reported-by: Christoph Hellwig Reported-by: Sebastian Kuzminsky Signed-off-by: Dennis Krein Signed-off-by: Paul E. McKenney Tested-by: Dennis Krein Cc: # 4.16.x --- kernel/rcu/srcutree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 60f3236beaf7..697a2d7e8e8a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -451,10 +451,12 @@ static void srcu_gp_start(struct srcu_struct *sp) lockdep_assert_held(&ACCESS_PRIVATE(sp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); + spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&sp->srcu_gp_seq); state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); -- cgit From aacb5d91ab1bfbb0e8123da59a2e333d52ba7f60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 28 Oct 2018 10:32:51 -0700 Subject: srcu: Use "ssp" instead of "sp" for srcu_struct pointer In RCU, the distinction between "rsp", "rnp", and "rdp" has served well for a great many years, but in SRCU, "sp" vs. "sdp" has proven confusing. This commit therefore renames SRCU's "sp" pointers to "ssp", so that there is "ssp" for srcu_struct pointer, "snp" for srcu_node pointer, and "sdp" for srcu_data pointer. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 78 ++++---- include/linux/srcutiny.h | 24 +-- include/linux/srcutree.h | 8 +- kernel/rcu/srcutiny.c | 120 ++++++------ kernel/rcu/srcutree.c | 488 +++++++++++++++++++++++------------------------ 5 files changed, 359 insertions(+), 359 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ebd5f1511690..c614375cd264 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -38,20 +38,20 @@ struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *sp, const char *name, +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); -#define init_srcu_struct(sp) \ +#define init_srcu_struct(ssp) \ ({ \ static struct lock_class_key __srcu_key; \ \ - __init_srcu_struct((sp), #sp, &__srcu_key); \ + __init_srcu_struct((ssp), #ssp, &__srcu_key); \ }) #define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name }, #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -int init_srcu_struct(struct srcu_struct *sp); +int init_srcu_struct(struct srcu_struct *ssp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -67,28 +67,28 @@ int init_srcu_struct(struct srcu_struct *sp); struct srcu_struct { }; #endif -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, +void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void (*func)(struct rcu_head *head)); -void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced); -int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); -void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); -void synchronize_srcu(struct srcu_struct *sp); +void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced); +int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); +void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); +void synchronize_srcu(struct srcu_struct *ssp); /** * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. + * @ssp: structure to clean up. * * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -static inline void cleanup_srcu_struct(struct srcu_struct *sp) +static inline void cleanup_srcu_struct(struct srcu_struct *ssp) { - _cleanup_srcu_struct(sp, false); + _cleanup_srcu_struct(ssp, false); } /** * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure - * @sp: structure to clean up. + * @ssp: structure to clean up. * * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. Also, @@ -103,16 +103,16 @@ static inline void cleanup_srcu_struct(struct srcu_struct *sp) * (with high probability, anyway), and will also cause the srcu_struct * to be leaked. */ -static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp) +static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *ssp) { - _cleanup_srcu_struct(sp, true); + _cleanup_srcu_struct(ssp, true); } #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * srcu_read_lock_held - might we be in SRCU read-side critical section? - * @sp: The srcu_struct structure to check + * @ssp: The srcu_struct structure to check * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, @@ -126,16 +126,16 @@ static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp) * relies on normal RCU, it can be called from the CPU which * is in the idle loop from an RCU point of view or offline. */ -static inline int srcu_read_lock_held(const struct srcu_struct *sp) +static inline int srcu_read_lock_held(const struct srcu_struct *ssp) { if (!debug_lockdep_rcu_enabled()) return 1; - return lock_is_held(&sp->dep_map); + return lock_is_held(&ssp->dep_map); } #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -static inline int srcu_read_lock_held(const struct srcu_struct *sp) +static inline int srcu_read_lock_held(const struct srcu_struct *ssp) { return 1; } @@ -145,7 +145,7 @@ static inline int srcu_read_lock_held(const struct srcu_struct *sp) /** * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing - * @sp: pointer to the srcu_struct, which is used to check that we + * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. * @c: condition to check for update-side use * @@ -154,32 +154,32 @@ static inline int srcu_read_lock_held(const struct srcu_struct *sp) * to 1. The @c argument will normally be a logical expression containing * lockdep_is_held() calls. */ -#define srcu_dereference_check(p, sp, c) \ - __rcu_dereference_check((p), (c) || srcu_read_lock_held(sp), __rcu) +#define srcu_dereference_check(p, ssp, c) \ + __rcu_dereference_check((p), (c) || srcu_read_lock_held(ssp), __rcu) /** * srcu_dereference - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing - * @sp: pointer to the srcu_struct, which is used to check that we + * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. * * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU * is enabled, invoking this outside of an RCU read-side critical * section will result in an RCU-lockdep splat. */ -#define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0) +#define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0) /** * srcu_dereference_notrace - no tracing and no lockdep calls from here * @p: the pointer to fetch and protect for later dereferencing - * @sp: pointer to the srcu_struct, which is used to check that we + * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. */ -#define srcu_dereference_notrace(p, sp) srcu_dereference_check((p), (sp), 1) +#define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1) /** * srcu_read_lock - register a new reader for an SRCU-protected structure. - * @sp: srcu_struct in which to register the new reader. + * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section. Note that SRCU read-side * critical sections may be nested. However, it is illegal to @@ -194,44 +194,44 @@ static inline int srcu_read_lock_held(const struct srcu_struct *sp) * srcu_read_unlock() in an irq handler if the matching srcu_read_lock() * was invoked in process context. */ -static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) +static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; - retval = __srcu_read_lock(sp); - rcu_lock_acquire(&(sp)->dep_map); + retval = __srcu_read_lock(ssp); + rcu_lock_acquire(&(ssp)->dep_map); return retval; } /* Used by tracing, cannot be traced and cannot invoke lockdep. */ static inline notrace int -srcu_read_lock_notrace(struct srcu_struct *sp) __acquires(sp) +srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) { int retval; - retval = __srcu_read_lock(sp); + retval = __srcu_read_lock(ssp); return retval; } /** * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. - * @sp: srcu_struct in which to unregister the old reader. + * @ssp: srcu_struct in which to unregister the old reader. * @idx: return value from corresponding srcu_read_lock(). * * Exit an SRCU read-side critical section. */ -static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) - __releases(sp) +static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) + __releases(ssp) { - rcu_lock_release(&(sp)->dep_map); - __srcu_read_unlock(sp, idx); + rcu_lock_release(&(ssp)->dep_map); + __srcu_read_unlock(ssp, idx); } /* Used by tracing, cannot be traced and cannot call lockdep. */ static inline notrace void -srcu_read_unlock_notrace(struct srcu_struct *sp, int idx) __releases(sp) +srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) { - __srcu_read_unlock(sp, idx); + __srcu_read_unlock(ssp, idx); } /** diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index f41d2fb09f87..b19216aaaef2 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -60,7 +60,7 @@ void srcu_drive_gp(struct work_struct *wp); #define DEFINE_STATIC_SRCU(name) \ static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name) -void synchronize_srcu(struct srcu_struct *sp); +void synchronize_srcu(struct srcu_struct *ssp); /* * Counts the new reader in the appropriate per-CPU element of the @@ -68,36 +68,36 @@ void synchronize_srcu(struct srcu_struct *sp); * __srcu_read_unlock() must be in the same handler instance. Returns an * index that must be passed to the matching srcu_read_unlock(). */ -static inline int __srcu_read_lock(struct srcu_struct *sp) +static inline int __srcu_read_lock(struct srcu_struct *ssp) { int idx; - idx = READ_ONCE(sp->srcu_idx); - WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); + idx = READ_ONCE(ssp->srcu_idx); + WRITE_ONCE(ssp->srcu_lock_nesting[idx], ssp->srcu_lock_nesting[idx] + 1); return idx; } -static inline void synchronize_srcu_expedited(struct srcu_struct *sp) +static inline void synchronize_srcu_expedited(struct srcu_struct *ssp) { - synchronize_srcu(sp); + synchronize_srcu(ssp); } -static inline void srcu_barrier(struct srcu_struct *sp) +static inline void srcu_barrier(struct srcu_struct *ssp) { - synchronize_srcu(sp); + synchronize_srcu(ssp); } /* Defined here to avoid size increase for non-torture kernels. */ -static inline void srcu_torture_stats_print(struct srcu_struct *sp, +static inline void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) { int idx; - idx = READ_ONCE(sp->srcu_idx) & 0x1; + idx = READ_ONCE(ssp->srcu_idx) & 0x1; pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", tt, tf, idx, - READ_ONCE(sp->srcu_lock_nesting[!idx]), - READ_ONCE(sp->srcu_lock_nesting[idx])); + READ_ONCE(ssp->srcu_lock_nesting[!idx]), + READ_ONCE(ssp->srcu_lock_nesting[idx])); } #endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 0ae91b3a7406..6f292bd3e7db 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -51,7 +51,7 @@ struct srcu_data { unsigned long grpmask; /* Mask for leaf srcu_node */ /* ->srcu_data_have_cbs[]. */ int cpu; - struct srcu_struct *sp; + struct srcu_struct *ssp; }; /* @@ -138,8 +138,8 @@ struct srcu_struct { #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) -void synchronize_srcu_expedited(struct srcu_struct *sp); -void srcu_barrier(struct srcu_struct *sp); -void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf); +void synchronize_srcu_expedited(struct srcu_struct *ssp); +void srcu_barrier(struct srcu_struct *ssp); +void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); #endif diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index b46e6683f8c9..32dfd6522548 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -37,30 +37,30 @@ int rcu_scheduler_active __read_mostly; static LIST_HEAD(srcu_boot_list); static bool srcu_init_done; -static int init_srcu_struct_fields(struct srcu_struct *sp) +static int init_srcu_struct_fields(struct srcu_struct *ssp) { - sp->srcu_lock_nesting[0] = 0; - sp->srcu_lock_nesting[1] = 0; - init_swait_queue_head(&sp->srcu_wq); - sp->srcu_cb_head = NULL; - sp->srcu_cb_tail = &sp->srcu_cb_head; - sp->srcu_gp_running = false; - sp->srcu_gp_waiting = false; - sp->srcu_idx = 0; - INIT_WORK(&sp->srcu_work, srcu_drive_gp); - INIT_LIST_HEAD(&sp->srcu_work.entry); + ssp->srcu_lock_nesting[0] = 0; + ssp->srcu_lock_nesting[1] = 0; + init_swait_queue_head(&ssp->srcu_wq); + ssp->srcu_cb_head = NULL; + ssp->srcu_cb_tail = &ssp->srcu_cb_head; + ssp->srcu_gp_running = false; + ssp->srcu_gp_waiting = false; + ssp->srcu_idx = 0; + INIT_WORK(&ssp->srcu_work, srcu_drive_gp); + INIT_LIST_HEAD(&ssp->srcu_work.entry); return 0; } #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *sp, const char *name, +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) { /* Don't re-initialize a lock while it is held. */ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - return init_srcu_struct_fields(sp); + debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); + lockdep_init_map(&ssp->dep_map, name, key, 0); + return init_srcu_struct_fields(ssp); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -68,15 +68,15 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); /* * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. + * @ssp: structure to initialize. * * Must invoke this on a given srcu_struct before passing that srcu_struct * to any other function. Each srcu_struct represents a separate domain * of SRCU protection. */ -int init_srcu_struct(struct srcu_struct *sp) +int init_srcu_struct(struct srcu_struct *ssp) { - return init_srcu_struct_fields(sp); + return init_srcu_struct_fields(ssp); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -84,22 +84,22 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); /* * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. + * @ssp: structure to clean up. * * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) +void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) { - WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); + WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); if (quiesced) - WARN_ON(work_pending(&sp->srcu_work)); + WARN_ON(work_pending(&ssp->srcu_work)); else - flush_work(&sp->srcu_work); - WARN_ON(sp->srcu_gp_running); - WARN_ON(sp->srcu_gp_waiting); - WARN_ON(sp->srcu_cb_head); - WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); + flush_work(&ssp->srcu_work); + WARN_ON(ssp->srcu_gp_running); + WARN_ON(ssp->srcu_gp_waiting); + WARN_ON(ssp->srcu_cb_head); + WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); } EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); @@ -107,13 +107,13 @@ EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); * Removes the count for the old reader from the appropriate element of * the srcu_struct. */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) +void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { - int newval = sp->srcu_lock_nesting[idx] - 1; + int newval = ssp->srcu_lock_nesting[idx] - 1; - WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); - if (!newval && READ_ONCE(sp->srcu_gp_waiting)) - swake_up_one(&sp->srcu_wq); + WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); + if (!newval && READ_ONCE(ssp->srcu_gp_waiting)) + swake_up_one(&ssp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -127,24 +127,24 @@ void srcu_drive_gp(struct work_struct *wp) int idx; struct rcu_head *lh; struct rcu_head *rhp; - struct srcu_struct *sp; + struct srcu_struct *ssp; - sp = container_of(wp, struct srcu_struct, srcu_work); - if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head)) + ssp = container_of(wp, struct srcu_struct, srcu_work); + if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head)) return; /* Already running or nothing to do. */ /* Remove recently arrived callbacks and wait for readers. */ - WRITE_ONCE(sp->srcu_gp_running, true); + WRITE_ONCE(ssp->srcu_gp_running, true); local_irq_disable(); - lh = sp->srcu_cb_head; - sp->srcu_cb_head = NULL; - sp->srcu_cb_tail = &sp->srcu_cb_head; + lh = ssp->srcu_cb_head; + ssp->srcu_cb_head = NULL; + ssp->srcu_cb_tail = &ssp->srcu_cb_head; local_irq_enable(); - idx = sp->srcu_idx; - WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); - WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ - swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); - WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + idx = ssp->srcu_idx; + WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx); + WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ + swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); + WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ /* Invoke the callbacks we removed above. */ while (lh) { @@ -161,9 +161,9 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. */ - WRITE_ONCE(sp->srcu_gp_running, false); - if (READ_ONCE(sp->srcu_cb_head)) - schedule_work(&sp->srcu_work); + WRITE_ONCE(ssp->srcu_gp_running, false); + if (READ_ONCE(ssp->srcu_cb_head)) + schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp); * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, +void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; @@ -179,14 +179,14 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; local_irq_save(flags); - *sp->srcu_cb_tail = rhp; - sp->srcu_cb_tail = &rhp->next; + *ssp->srcu_cb_tail = rhp; + ssp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); - if (!READ_ONCE(sp->srcu_gp_running)) { + if (!READ_ONCE(ssp->srcu_gp_running)) { if (likely(srcu_init_done)) - schedule_work(&sp->srcu_work); - else if (list_empty(&sp->srcu_work.entry)) - list_add(&sp->srcu_work.entry, &srcu_boot_list); + schedule_work(&ssp->srcu_work); + else if (list_empty(&ssp->srcu_work.entry)) + list_add(&ssp->srcu_work.entry, &srcu_boot_list); } } EXPORT_SYMBOL_GPL(call_srcu); @@ -194,13 +194,13 @@ EXPORT_SYMBOL_GPL(call_srcu); /* * synchronize_srcu - wait for prior SRCU read-side critical-section completion */ -void synchronize_srcu(struct srcu_struct *sp) +void synchronize_srcu(struct srcu_struct *ssp) { struct rcu_synchronize rs; init_rcu_head_on_stack(&rs.head); init_completion(&rs.completion); - call_srcu(sp, &rs.head, wakeme_after_rcu); + call_srcu(ssp, &rs.head, wakeme_after_rcu); wait_for_completion(&rs.completion); destroy_rcu_head_on_stack(&rs.head); } @@ -219,13 +219,13 @@ void __init rcu_scheduler_starting(void) */ void __init srcu_init(void) { - struct srcu_struct *sp; + struct srcu_struct *ssp; srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { - sp = list_first_entry(&srcu_boot_list, + ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, srcu_work.entry); - list_del_init(&sp->srcu_work.entry); - schedule_work(&sp->srcu_work); + list_del_init(&ssp->srcu_work.entry); + schedule_work(&ssp->srcu_work); } } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 697a2d7e8e8a..3600d88d8956 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -56,7 +56,7 @@ static LIST_HEAD(srcu_boot_list); static bool __read_mostly srcu_init_done; static void srcu_invoke_callbacks(struct work_struct *work); -static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); +static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ @@ -92,7 +92,7 @@ do { \ * srcu_read_unlock() running against them. So if the is_static parameter * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. */ -static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) +static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static) { int cpu; int i; @@ -103,13 +103,13 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) struct srcu_node *snp_first; /* Work out the overall tree geometry. */ - sp->level[0] = &sp->node[0]; + ssp->level[0] = &ssp->node[0]; for (i = 1; i < rcu_num_lvls; i++) - sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1]; + ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1]; rcu_init_levelspread(levelspread, num_rcu_lvl); /* Each pass through this loop initializes one srcu_node structure. */ - srcu_for_each_node_breadth_first(sp, snp) { + srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); @@ -120,17 +120,17 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp->srcu_gp_seq_needed_exp = 0; snp->grplo = -1; snp->grphi = -1; - if (snp == &sp->node[0]) { + if (snp == &ssp->node[0]) { /* Root node, special case. */ snp->srcu_parent = NULL; continue; } /* Non-root node. */ - if (snp == sp->level[level + 1]) + if (snp == ssp->level[level + 1]) level++; - snp->srcu_parent = sp->level[level - 1] + - (snp - sp->level[level]) / + snp->srcu_parent = ssp->level[level - 1] + + (snp - ssp->level[level]) / levelspread[level - 1]; } @@ -141,14 +141,14 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != ARRAY_SIZE(sdp->srcu_unlock_count)); level = rcu_num_lvls - 1; - snp_first = sp->level[level]; + snp_first = ssp->level[level]; for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(sp->sda, cpu); + sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq; + sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; sdp->mynode = &snp_first[cpu / levelspread[level]]; for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { if (snp->grplo < 0) @@ -157,7 +157,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) } sdp->cpu = cpu; INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); - sdp->sp = sp; + sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); if (is_static) continue; @@ -176,35 +176,35 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) * parameter is passed through to init_srcu_struct_nodes(), and * also tells us that ->sda has already been wired up to srcu_data. */ -static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) +static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { - mutex_init(&sp->srcu_cb_mutex); - mutex_init(&sp->srcu_gp_mutex); - sp->srcu_idx = 0; - sp->srcu_gp_seq = 0; - sp->srcu_barrier_seq = 0; - mutex_init(&sp->srcu_barrier_mutex); - atomic_set(&sp->srcu_barrier_cpu_cnt, 0); - INIT_DELAYED_WORK(&sp->work, process_srcu); + mutex_init(&ssp->srcu_cb_mutex); + mutex_init(&ssp->srcu_gp_mutex); + ssp->srcu_idx = 0; + ssp->srcu_gp_seq = 0; + ssp->srcu_barrier_seq = 0; + mutex_init(&ssp->srcu_barrier_mutex); + atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); + INIT_DELAYED_WORK(&ssp->work, process_srcu); if (!is_static) - sp->sda = alloc_percpu(struct srcu_data); - init_srcu_struct_nodes(sp, is_static); - sp->srcu_gp_seq_needed_exp = 0; - sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ - return sp->sda ? 0 : -ENOMEM; + ssp->sda = alloc_percpu(struct srcu_data); + init_srcu_struct_nodes(ssp, is_static); + ssp->srcu_gp_seq_needed_exp = 0; + ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ + return ssp->sda ? 0 : -ENOMEM; } #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *sp, const char *name, +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) { /* Don't re-initialize a lock while it is held. */ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - spin_lock_init(&ACCESS_PRIVATE(sp, lock)); - return init_srcu_struct_fields(sp, false); + debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); + lockdep_init_map(&ssp->dep_map, name, key, 0); + spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); + return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -212,16 +212,16 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); /** * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. + * @ssp: structure to initialize. * * Must invoke this on a given srcu_struct before passing that srcu_struct * to any other function. Each srcu_struct represents a separate domain * of SRCU protection. */ -int init_srcu_struct(struct srcu_struct *sp) +int init_srcu_struct(struct srcu_struct *ssp) { - spin_lock_init(&ACCESS_PRIVATE(sp, lock)); - return init_srcu_struct_fields(sp, false); + spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); + return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -231,37 +231,37 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be * done with compile-time initialization, so this check is added - * to each update-side SRCU primitive. Use sp->lock, which -is- + * to each update-side SRCU primitive. Use ssp->lock, which -is- * compile-time initialized, to resolve races involving multiple * CPUs trying to garner first-use privileges. */ -static void check_init_srcu_struct(struct srcu_struct *sp) +static void check_init_srcu_struct(struct srcu_struct *ssp) { unsigned long flags; /* The smp_load_acquire() pairs with the smp_store_release(). */ - if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave_rcu_node(sp, flags); - if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { - spin_unlock_irqrestore_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(ssp, flags); + if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) { + spin_unlock_irqrestore_rcu_node(ssp, flags); return; } - init_srcu_struct_fields(sp, true); - spin_unlock_irqrestore_rcu_node(sp, flags); + init_srcu_struct_fields(ssp, true); + spin_unlock_irqrestore_rcu_node(ssp, flags); } /* * Returns approximate total of the readers' ->srcu_lock_count[] values * for the rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) { int cpu; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); sum += READ_ONCE(cpuc->srcu_lock_count[idx]); } @@ -272,13 +272,13 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) * Returns approximate total of the readers' ->srcu_unlock_count[] values * for the rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) { int cpu; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); } @@ -289,11 +289,11 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) * Return true if the number of pre-existing readers is determined to * be zero. */ -static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) +static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) { unsigned long unlocks; - unlocks = srcu_readers_unlock_idx(sp, idx); + unlocks = srcu_readers_unlock_idx(ssp, idx); /* * Make sure that a lock is always counted if the corresponding @@ -329,25 +329,25 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient, * especially on 64-bit systems. */ - return srcu_readers_lock_idx(sp, idx) == unlocks; + return srcu_readers_lock_idx(ssp, idx) == unlocks; } /** * srcu_readers_active - returns true if there are readers. and false * otherwise - * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * @ssp: which srcu_struct to count active readers (holding srcu_read_lock). * * Note that this is not an atomic primitive, and can therefore suffer * severe errors when invoked on an active srcu_struct. That said, it * can be useful as an error check at cleanup time. */ -static bool srcu_readers_active(struct srcu_struct *sp) +static bool srcu_readers_active(struct srcu_struct *ssp) { int cpu; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); sum += READ_ONCE(cpuc->srcu_lock_count[0]); sum += READ_ONCE(cpuc->srcu_lock_count[1]); @@ -363,44 +363,44 @@ static bool srcu_readers_active(struct srcu_struct *sp) * Return grace-period delay, zero if there are expedited grace * periods pending, SRCU_INTERVAL otherwise. */ -static unsigned long srcu_get_delay(struct srcu_struct *sp) +static unsigned long srcu_get_delay(struct srcu_struct *ssp) { - if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq), - READ_ONCE(sp->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), + READ_ONCE(ssp->srcu_gp_seq_needed_exp))) return 0; return SRCU_INTERVAL; } /* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ -void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) +void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) { int cpu; - if (WARN_ON(!srcu_get_delay(sp))) + if (WARN_ON(!srcu_get_delay(ssp))) return; /* Just leak it! */ - if (WARN_ON(srcu_readers_active(sp))) + if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ if (quiesced) { - if (WARN_ON(delayed_work_pending(&sp->work))) + if (WARN_ON(delayed_work_pending(&ssp->work))) return; /* Just leak it! */ } else { - flush_delayed_work(&sp->work); + flush_delayed_work(&ssp->work); } for_each_possible_cpu(cpu) if (quiesced) { - if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work))) + if (WARN_ON(delayed_work_pending(&per_cpu_ptr(ssp->sda, cpu)->work))) return; /* Just leak it! */ } else { - flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + flush_delayed_work(&per_cpu_ptr(ssp->sda, cpu)->work); } - if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || - WARN_ON(srcu_readers_active(sp))) { + if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(srcu_readers_active(ssp))) { pr_info("%s: Active srcu_struct %p state: %d\n", - __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } - free_percpu(sp->sda); - sp->sda = NULL; + free_percpu(ssp->sda); + ssp->sda = NULL; } EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); @@ -409,12 +409,12 @@ EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); * srcu_struct. * Returns an index that must be passed to the matching srcu_read_unlock(). */ -int __srcu_read_lock(struct srcu_struct *sp) +int __srcu_read_lock(struct srcu_struct *ssp) { int idx; - idx = READ_ONCE(sp->srcu_idx) & 0x1; - this_cpu_inc(sp->sda->srcu_lock_count[idx]); + idx = READ_ONCE(ssp->srcu_idx) & 0x1; + this_cpu_inc(ssp->sda->srcu_lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -425,10 +425,10 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); * element of the srcu_struct. Note that this may well be a different * CPU than that which was incremented by the corresponding srcu_read_lock(). */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) +void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(sp->sda->srcu_unlock_count[idx]); + this_cpu_inc(ssp->sda->srcu_unlock_count[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -444,22 +444,22 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Start an SRCU grace period. */ -static void srcu_gp_start(struct srcu_struct *sp) +static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp = this_cpu_ptr(sp->sda); + struct srcu_data *sdp = this_cpu_ptr(ssp->sda); int state; - lockdep_assert_held(&ACCESS_PRIVATE(sp, lock)); - WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); + WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_gp_seq)); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_snap(&ssp->srcu_gp_seq)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ - rcu_seq_start(&sp->srcu_gp_seq); - state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + rcu_seq_start(&ssp->srcu_gp_seq); + state = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } @@ -513,7 +513,7 @@ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) * just-completed grace period, the one corresponding to idx. If possible, * schedule this invocation on the corresponding CPUs. */ -static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, +static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp, unsigned long mask, unsigned long delay) { int cpu; @@ -521,7 +521,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { if (!(mask & (1 << (cpu - snp->grplo)))) continue; - srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay); + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); } } @@ -534,7 +534,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, * are initiating callback invocation. This allows the ->srcu_have_cbs[] * array to have a finite number of elements. */ -static void srcu_gp_end(struct srcu_struct *sp) +static void srcu_gp_end(struct srcu_struct *ssp) { unsigned long cbdelay; bool cbs; @@ -548,28 +548,28 @@ static void srcu_gp_end(struct srcu_struct *sp) struct srcu_node *snp; /* Prevent more than one additional grace period. */ - mutex_lock(&sp->srcu_cb_mutex); + mutex_lock(&ssp->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq_rcu_node(sp); - idx = rcu_seq_state(sp->srcu_gp_seq); + spin_lock_irq_rcu_node(ssp); + idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = srcu_get_delay(sp); - sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - rcu_seq_end(&sp->srcu_gp_seq); - gpseq = rcu_seq_current(&sp->srcu_gp_seq); - if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) - sp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irq_rcu_node(sp); - mutex_unlock(&sp->srcu_gp_mutex); + cbdelay = srcu_get_delay(ssp); + ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + rcu_seq_end(&ssp->srcu_gp_seq); + gpseq = rcu_seq_current(&ssp->srcu_gp_seq); + if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) + ssp->srcu_gp_seq_needed_exp = gpseq; + spin_unlock_irq_rcu_node(ssp); + mutex_unlock(&ssp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - srcu_for_each_node_breadth_first(sp, snp) { + srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - last_lvl = snp >= sp->level[rcu_num_lvls - 1]; + last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; if (last_lvl) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; @@ -580,12 +580,12 @@ static void srcu_gp_end(struct srcu_struct *sp) snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq_rcu_node(snp); if (cbs) - srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); + srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); /* Occasionally prevent srcu_data counter wrap. */ if (!(gpseq & counter_wrap_check) && last_lvl) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - sdp = per_cpu_ptr(sp->sda, cpu); + sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) @@ -598,18 +598,18 @@ static void srcu_gp_end(struct srcu_struct *sp) } /* Callback initiation done, allow grace periods after next. */ - mutex_unlock(&sp->srcu_cb_mutex); + mutex_unlock(&ssp->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq_rcu_node(sp); - gpseq = rcu_seq_current(&sp->srcu_gp_seq); + spin_lock_irq_rcu_node(ssp); + gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (!rcu_seq_state(gpseq) && - ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { - srcu_gp_start(sp); - spin_unlock_irq_rcu_node(sp); - srcu_reschedule(sp, 0); + ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) { + srcu_gp_start(ssp); + spin_unlock_irq_rcu_node(ssp); + srcu_reschedule(ssp, 0); } else { - spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(ssp); } } @@ -620,13 +620,13 @@ static void srcu_gp_end(struct srcu_struct *sp) * but without expediting. To start a completely new grace period, * whether expedited or not, use srcu_funnel_gp_start() instead. */ -static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, +static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp, unsigned long s) { unsigned long flags; for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&sp->srcu_gp_seq, s) || + if (rcu_seq_done(&ssp->srcu_gp_seq, s) || ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) return; spin_lock_irqsave_rcu_node(snp, flags); @@ -637,10 +637,10 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); } - spin_lock_irqsave_rcu_node(sp, flags); - if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) - sp->srcu_gp_seq_needed_exp = s; - spin_unlock_irqrestore_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(ssp, flags); + if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) + ssp->srcu_gp_seq_needed_exp = s; + spin_unlock_irqrestore_rcu_node(ssp, flags); } /* @@ -653,7 +653,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, * Note that this function also does the work of srcu_funnel_exp_start(), * in some cases by directly invoking it. */ -static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, +static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, unsigned long s, bool do_norm) { unsigned long flags; @@ -663,7 +663,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, /* Each pass through the loop does one level of the srcu_node tree. */ for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) + if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { @@ -678,7 +678,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, return; } if (!do_norm) - srcu_funnel_exp_start(sp, snp, s); + srcu_funnel_exp_start(ssp, snp, s); return; } snp->srcu_have_cbs[idx] = s; @@ -690,29 +690,29 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_rcu_node(sp, flags); - if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { + spin_lock_irqsave_rcu_node(ssp, flags); + if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load * acquire setting up for initialization. */ - smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ + smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ } - if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) - sp->srcu_gp_seq_needed_exp = s; + if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) + ssp->srcu_gp_seq_needed_exp = s; /* If grace period not already done and none in progress, start it. */ - if (!rcu_seq_done(&sp->srcu_gp_seq, s) && - rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { - WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); - srcu_gp_start(sp); + if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && + rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { + WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); + srcu_gp_start(ssp); if (likely(srcu_init_done)) - queue_delayed_work(rcu_gp_wq, &sp->work, - srcu_get_delay(sp)); - else if (list_empty(&sp->work.work.entry)) - list_add(&sp->work.work.entry, &srcu_boot_list); + queue_delayed_work(rcu_gp_wq, &ssp->work, + srcu_get_delay(ssp)); + else if (list_empty(&ssp->work.work.entry)) + list_add(&ssp->work.work.entry, &srcu_boot_list); } - spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(ssp, flags); } /* @@ -720,12 +720,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, * loop an additional time if there is an expedited grace period pending. * The caller must ensure that ->srcu_idx is not changed while checking. */ -static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) +static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { for (;;) { - if (srcu_readers_active_idx_check(sp, idx)) + if (srcu_readers_active_idx_check(ssp, idx)) return true; - if (--trycount + !srcu_get_delay(sp) <= 0) + if (--trycount + !srcu_get_delay(ssp) <= 0) return false; udelay(SRCU_RETRY_CHECK_DELAY); } @@ -736,7 +736,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ -static void srcu_flip(struct srcu_struct *sp) +static void srcu_flip(struct srcu_struct *ssp) { /* * Ensure that if this updater saw a given reader's increment @@ -748,7 +748,7 @@ static void srcu_flip(struct srcu_struct *sp) */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -781,7 +781,7 @@ static void srcu_flip(struct srcu_struct *sp) * negligible when amoritized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ -static bool srcu_might_be_idle(struct srcu_struct *sp) +static bool srcu_might_be_idle(struct srcu_struct *ssp) { unsigned long curseq; unsigned long flags; @@ -790,7 +790,7 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) /* If the local srcu_data structure has callbacks, not idle. */ local_irq_save(flags); - sdp = this_cpu_ptr(sp->sda); + sdp = this_cpu_ptr(ssp->sda); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { local_irq_restore(flags); return false; /* Callbacks already present, so not idle. */ @@ -806,17 +806,17 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) /* First, see if enough time has passed since the last GP. */ t = ktime_get_mono_fast_ns(); if (exp_holdoff == 0 || - time_in_range_open(t, sp->srcu_last_gp_end, - sp->srcu_last_gp_end + exp_holdoff)) + time_in_range_open(t, ssp->srcu_last_gp_end, + ssp->srcu_last_gp_end + exp_holdoff)) return false; /* Too soon after last GP. */ /* Next, check for probable idleness. */ - curseq = rcu_seq_current(&sp->srcu_gp_seq); + curseq = rcu_seq_current(&ssp->srcu_gp_seq); smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ - if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed))) + if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed))) return false; /* Grace period in progress, so not idle. */ smp_mb(); /* Order ->srcu_gp_seq with prior access. */ - if (curseq != rcu_seq_current(&sp->srcu_gp_seq)) + if (curseq != rcu_seq_current(&ssp->srcu_gp_seq)) return false; /* GP # changed, so not idle. */ return true; /* With reasonable probability, idle! */ } @@ -856,7 +856,7 @@ static void srcu_leak_callback(struct rcu_head *rhp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, +void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func, bool do_norm) { unsigned long flags; @@ -866,7 +866,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, unsigned long s; struct srcu_data *sdp; - check_init_srcu_struct(sp); + check_init_srcu_struct(ssp); if (debug_rcu_head_queue(rhp)) { /* Probable double call_srcu(), so leak the callback. */ WRITE_ONCE(rhp->func, srcu_leak_callback); @@ -874,14 +874,14 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, return; } rhp->func = func; - idx = srcu_read_lock(sp); + idx = srcu_read_lock(ssp); local_irq_save(flags); - sdp = this_cpu_ptr(sp->sda); + sdp = this_cpu_ptr(ssp->sda); spin_lock_rcu_node(sdp); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); - s = rcu_seq_snap(&sp->srcu_gp_seq); + rcu_seq_current(&ssp->srcu_gp_seq)); + s = rcu_seq_snap(&ssp->srcu_gp_seq); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; @@ -893,15 +893,15 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, } spin_unlock_irqrestore_rcu_node(sdp, flags); if (needgp) - srcu_funnel_gp_start(sp, sdp, s, do_norm); + srcu_funnel_gp_start(ssp, sdp, s, do_norm); else if (needexp) - srcu_funnel_exp_start(sp, sdp->mynode, s); - srcu_read_unlock(sp, idx); + srcu_funnel_exp_start(ssp, sdp->mynode, s); + srcu_read_unlock(ssp, idx); } /** * call_srcu() - Queue a callback for invocation after an SRCU grace period - * @sp: srcu_struct in queue the callback + * @ssp: srcu_struct in queue the callback * @rhp: structure to be used for queueing the SRCU callback. * @func: function to be invoked after the SRCU grace period * @@ -916,21 +916,21 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, * The callback will be invoked from process context, but must nevertheless * be fast and must not block. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, +void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) { - __call_srcu(sp, rhp, func, true); + __call_srcu(ssp, rhp, func, true); } EXPORT_SYMBOL_GPL(call_srcu); /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) +static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) { struct rcu_synchronize rcu; - RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + RCU_LOCKDEP_WARN(lock_is_held(&ssp->dep_map) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), @@ -939,10 +939,10 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; might_sleep(); - check_init_srcu_struct(sp); + check_init_srcu_struct(ssp); init_completion(&rcu.completion); init_rcu_head_on_stack(&rcu.head); - __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); + __call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm); wait_for_completion(&rcu.completion); destroy_rcu_head_on_stack(&rcu.head); @@ -958,7 +958,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) /** * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. + * @ssp: srcu_struct with which to synchronize. * * Wait for an SRCU grace period to elapse, but be more aggressive about * spinning rather than blocking when waiting. @@ -966,15 +966,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) * Note that synchronize_srcu_expedited() has the same deadlock and * memory-ordering properties as does synchronize_srcu(). */ -void synchronize_srcu_expedited(struct srcu_struct *sp) +void synchronize_srcu_expedited(struct srcu_struct *ssp) { - __synchronize_srcu(sp, rcu_gp_is_normal()); + __synchronize_srcu(ssp, rcu_gp_is_normal()); } EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); /** * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. + * @ssp: srcu_struct with which to synchronize. * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of @@ -1016,12 +1016,12 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * SRCU must also provide it. Note that detecting idleness is heuristic * and subject to both false positives and negatives. */ -void synchronize_srcu(struct srcu_struct *sp) +void synchronize_srcu(struct srcu_struct *ssp) { - if (srcu_might_be_idle(sp) || rcu_gp_is_expedited()) - synchronize_srcu_expedited(sp); + if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited()) + synchronize_srcu_expedited(ssp); else - __synchronize_srcu(sp, true); + __synchronize_srcu(ssp, true); } EXPORT_SYMBOL_GPL(synchronize_srcu); @@ -1031,36 +1031,36 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); static void srcu_barrier_cb(struct rcu_head *rhp) { struct srcu_data *sdp; - struct srcu_struct *sp; + struct srcu_struct *ssp; sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); - sp = sdp->sp; - if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) - complete(&sp->srcu_barrier_completion); + ssp = sdp->ssp; + if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_barrier_completion); } /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. - * @sp: srcu_struct on which to wait for in-flight callbacks. + * @ssp: srcu_struct on which to wait for in-flight callbacks. */ -void srcu_barrier(struct srcu_struct *sp) +void srcu_barrier(struct srcu_struct *ssp) { int cpu; struct srcu_data *sdp; - unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq); + unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); - check_init_srcu_struct(sp); - mutex_lock(&sp->srcu_barrier_mutex); - if (rcu_seq_done(&sp->srcu_barrier_seq, s)) { + check_init_srcu_struct(ssp); + mutex_lock(&ssp->srcu_barrier_mutex); + if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) { smp_mb(); /* Force ordering following return. */ - mutex_unlock(&sp->srcu_barrier_mutex); + mutex_unlock(&ssp->srcu_barrier_mutex); return; /* Someone else did our work for us. */ } - rcu_seq_start(&sp->srcu_barrier_seq); - init_completion(&sp->srcu_barrier_completion); + rcu_seq_start(&ssp->srcu_barrier_seq); + init_completion(&ssp->srcu_barrier_completion); /* Initial count prevents reaching zero until all CBs are posted. */ - atomic_set(&sp->srcu_barrier_cpu_cnt, 1); + atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); /* * Each pass through this loop enqueues a callback, but only @@ -1071,39 +1071,39 @@ void srcu_barrier(struct srcu_struct *sp) * grace period as the last callback already in the queue. */ for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(sp->sda, cpu); + sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_irq_rcu_node(sdp); - atomic_inc(&sp->srcu_barrier_cpu_cnt); + atomic_inc(&ssp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, &sdp->srcu_barrier_head, 0)) { debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&sp->srcu_barrier_cpu_cnt); + atomic_dec(&ssp->srcu_barrier_cpu_cnt); } spin_unlock_irq_rcu_node(sdp); } /* Remove the initial count, at which point reaching zero can happen. */ - if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) - complete(&sp->srcu_barrier_completion); - wait_for_completion(&sp->srcu_barrier_completion); + if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_barrier_completion); + wait_for_completion(&ssp->srcu_barrier_completion); - rcu_seq_end(&sp->srcu_barrier_seq); - mutex_unlock(&sp->srcu_barrier_mutex); + rcu_seq_end(&ssp->srcu_barrier_seq); + mutex_unlock(&ssp->srcu_barrier_mutex); } EXPORT_SYMBOL_GPL(srcu_barrier); /** * srcu_batches_completed - return batches completed. - * @sp: srcu_struct on which to report batch completion. + * @ssp: srcu_struct on which to report batch completion. * * Report the number of batches, correlated with, but not necessarily * precisely the same as, the number of grace periods that have elapsed. */ -unsigned long srcu_batches_completed(struct srcu_struct *sp) +unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return sp->srcu_idx; + return ssp->srcu_idx; } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1112,11 +1112,11 @@ EXPORT_SYMBOL_GPL(srcu_batches_completed); * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has * completed in that state. */ -static void srcu_advance_state(struct srcu_struct *sp) +static void srcu_advance_state(struct srcu_struct *ssp) { int idx; - mutex_lock(&sp->srcu_gp_mutex); + mutex_lock(&ssp->srcu_gp_mutex); /* * Because readers might be delayed for an extended period after @@ -1128,47 +1128,47 @@ static void srcu_advance_state(struct srcu_struct *sp) * The load-acquire ensures that we see the accesses performed * by the prior grace period. */ - idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ + idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq_rcu_node(sp); - if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { - WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); - spin_unlock_irq_rcu_node(sp); - mutex_unlock(&sp->srcu_gp_mutex); + spin_lock_irq_rcu_node(ssp); + if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq)); + spin_unlock_irq_rcu_node(ssp); + mutex_unlock(&ssp->srcu_gp_mutex); return; } - idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) - srcu_gp_start(sp); - spin_unlock_irq_rcu_node(sp); + srcu_gp_start(ssp); + spin_unlock_irq_rcu_node(ssp); if (idx != SRCU_STATE_IDLE) { - mutex_unlock(&sp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_gp_mutex); return; /* Someone else started the grace period. */ } } - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (sp->srcu_idx & 1); - if (!try_check_zero(sp, idx, 1)) { - mutex_unlock(&sp->srcu_gp_mutex); + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + idx = 1 ^ (ssp->srcu_idx & 1); + if (!try_check_zero(ssp, idx, 1)) { + mutex_unlock(&ssp->srcu_gp_mutex); return; /* readers present, retry later. */ } - srcu_flip(sp); - rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); + srcu_flip(ssp); + rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); } - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { /* * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (sp->srcu_idx & 1); - if (!try_check_zero(sp, idx, 2)) { - mutex_unlock(&sp->srcu_gp_mutex); + idx = 1 ^ (ssp->srcu_idx & 1); + if (!try_check_zero(ssp, idx, 2)) { + mutex_unlock(&ssp->srcu_gp_mutex); return; /* readers present, retry later. */ } - srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */ + srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1184,14 +1184,14 @@ static void srcu_invoke_callbacks(struct work_struct *work) struct rcu_cblist ready_cbs; struct rcu_head *rhp; struct srcu_data *sdp; - struct srcu_struct *sp; + struct srcu_struct *ssp; sdp = container_of(work, struct srcu_data, work.work); - sp = sdp->sp; + ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { spin_unlock_irq_rcu_node(sdp); @@ -1217,7 +1217,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) spin_lock_irq_rcu_node(sdp); rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_snap(&ssp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp); @@ -1229,24 +1229,24 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Finished one round of SRCU grace period. Start another if there are * more SRCU callbacks queued, otherwise put SRCU into not-running state. */ -static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) +static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) { bool pushgp = true; - spin_lock_irq_rcu_node(sp); - if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { - if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { + spin_lock_irq_rcu_node(ssp); + if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ pushgp = false; } - } else if (!rcu_seq_state(sp->srcu_gp_seq)) { + } else if (!rcu_seq_state(ssp->srcu_gp_seq)) { /* Outstanding request and no GP. Start one. */ - srcu_gp_start(sp); + srcu_gp_start(ssp); } - spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(ssp); if (pushgp) - queue_delayed_work(rcu_gp_wq, &sp->work, delay); + queue_delayed_work(rcu_gp_wq, &ssp->work, delay); } /* @@ -1254,41 +1254,41 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) */ static void process_srcu(struct work_struct *work) { - struct srcu_struct *sp; + struct srcu_struct *ssp; - sp = container_of(work, struct srcu_struct, work.work); + ssp = container_of(work, struct srcu_struct, work.work); - srcu_advance_state(sp); - srcu_reschedule(sp, srcu_get_delay(sp)); + srcu_advance_state(ssp); + srcu_reschedule(ssp, srcu_get_delay(ssp)); } void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, + struct srcu_struct *ssp, int *flags, unsigned long *gp_seq) { if (test_type != SRCU_FLAVOR) return; *flags = 0; - *gp_seq = rcu_seq_current(&sp->srcu_gp_seq); + *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); -void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) +void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) { int cpu; int idx; unsigned long s0 = 0, s1 = 0; - idx = sp->srcu_idx & 0x1; + idx = ssp->srcu_idx & 0x1; pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", - tt, tf, rcu_seq_current(&sp->srcu_gp_seq), idx); + tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; struct srcu_data *sdp; - sdp = per_cpu_ptr(sp->sda, cpu); + sdp = per_cpu_ptr(ssp->sda, cpu); u0 = sdp->srcu_unlock_count[!idx]; u1 = sdp->srcu_unlock_count[idx]; @@ -1323,14 +1323,14 @@ early_initcall(srcu_bootup_announce); void __init srcu_init(void) { - struct srcu_struct *sp; + struct srcu_struct *ssp; srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { - sp = list_first_entry(&srcu_boot_list, struct srcu_struct, + ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, work.work.entry); - check_init_srcu_struct(sp); - list_del_init(&sp->work.work.entry); - queue_work(rcu_gp_wq, &sp->work.work); + check_init_srcu_struct(ssp); + list_del_init(&ssp->work.work.entry); + queue_work(rcu_gp_wq, &ssp->work.work); } } -- cgit From 9adcfaffc34d53e498637237fb3701560359d50b Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 24 Nov 2018 13:10:25 +0900 Subject: printk: Make printk_emit() local function. printk_emit() is called from only devkmsg_write() in the same file. Save object size by making it a local function. Link: http://lkml.kernel.org/r/5cc99d2c-c408-34f7-d1fc-e1cd2a9e31da@i-love.sakura.ne.jp Cc: Steven Rostedt Signed-off-by: Tetsuo Handa Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- include/linux/printk.h | 5 ----- kernel/printk/printk.c | 30 ++++++++++++++---------------- 2 files changed, 14 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index cf3eccfe1543..55aa96975fa2 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -166,11 +166,6 @@ int vprintk_emit(int facility, int level, asmlinkage __printf(1, 0) int vprintk(const char *fmt, va_list args); -asmlinkage __printf(5, 6) __cold -int printk_emit(int facility, int level, - const char *dict, size_t dictlen, - const char *fmt, ...); - asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b77150ad1965..a1d88212a5d2 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -753,6 +753,19 @@ struct devkmsg_user { char buf[CONSOLE_EXT_LOG_MAX]; }; +static __printf(3, 4) __cold +int devkmsg_emit(int facility, int level, const char *fmt, ...) +{ + va_list args; + int r; + + va_start(args, fmt); + r = vprintk_emit(facility, level, NULL, 0, fmt, args); + va_end(args); + + return r; +} + static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; @@ -811,7 +824,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) } } - printk_emit(facility, level, NULL, 0, "%s", line); + devkmsg_emit(facility, level, "%s", line); kfree(buf); return ret; } @@ -1936,21 +1949,6 @@ asmlinkage int vprintk(const char *fmt, va_list args) } EXPORT_SYMBOL(vprintk); -asmlinkage int printk_emit(int facility, int level, - const char *dict, size_t dictlen, - const char *fmt, ...) -{ - va_list args; - int r; - - va_start(args, fmt); - r = vprintk_emit(facility, level, dict, dictlen, fmt, args); - va_end(args); - - return r; -} -EXPORT_SYMBOL(printk_emit); - int vprintk_default(const char *fmt, va_list args) { int r; -- cgit From 2d25bc55235314d869459c574be14e8faa73aca3 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Mon, 19 Nov 2018 17:43:58 +0100 Subject: module: make it clearer when we're handling kallsyms symbols vs exported symbols The module loader internally works with both exported symbols represented as struct kernel_symbol, as well as Elf symbols from a module's symbol table. It's hard to distinguish sometimes which type of symbol we're handling given that some helper function names are not consistent or helpful. Take get_ksymbol() for instance - are we looking for an exported symbol or a kallsyms symbol here? Or symname() and kernel_symbol_name() - which function handles an exported symbol and which one an Elf symbol? Clean up and unify the function naming scheme a bit to make it clear which kind of symbol we're handling. This change only affects static functions internal to the module loader. Reviewed-by: Miroslav Benes Signed-off-by: Jessica Yu --- kernel/module.c | 78 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 49a405891587..1b5edf78694c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -495,9 +495,9 @@ struct find_symbol_arg { const struct kernel_symbol *sym; }; -static bool check_symbol(const struct symsearch *syms, - struct module *owner, - unsigned int symnum, void *data) +static bool check_exported_symbol(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data) { struct find_symbol_arg *fsa = data; @@ -555,9 +555,9 @@ static int cmp_name(const void *va, const void *vb) return strcmp(a, kernel_symbol_name(b)); } -static bool find_symbol_in_section(const struct symsearch *syms, - struct module *owner, - void *data) +static bool find_exported_symbol_in_section(const struct symsearch *syms, + struct module *owner, + void *data) { struct find_symbol_arg *fsa = data; struct kernel_symbol *sym; @@ -565,13 +565,14 @@ static bool find_symbol_in_section(const struct symsearch *syms, sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, sizeof(struct kernel_symbol), cmp_name); - if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data)) + if (sym != NULL && check_exported_symbol(syms, owner, + sym - syms->start, data)) return true; return false; } -/* Find a symbol and return it, along with, (optional) crc and +/* Find an exported symbol and return it, along with, (optional) crc and * (optional) module which owns it. Needs preempt disabled or module_mutex. */ const struct kernel_symbol *find_symbol(const char *name, struct module **owner, @@ -585,7 +586,7 @@ const struct kernel_symbol *find_symbol(const char *name, fsa.gplok = gplok; fsa.warn = warn; - if (each_symbol_section(find_symbol_in_section, &fsa)) { + if (each_symbol_section(find_exported_symbol_in_section, &fsa)) { if (owner) *owner = fsa.owner; if (crc) @@ -2198,7 +2199,7 @@ EXPORT_SYMBOL_GPL(__symbol_get); * * You must hold the module_mutex. */ -static int verify_export_symbols(struct module *mod) +static int verify_exported_symbols(struct module *mod) { unsigned int i; struct module *owner; @@ -2519,10 +2520,10 @@ static void free_modinfo(struct module *mod) #ifdef CONFIG_KALLSYMS -/* lookup symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) +/* Lookup exported symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_exported_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) { return bsearch(name, start, stop - start, sizeof(struct kernel_symbol), cmp_name); @@ -2533,9 +2534,10 @@ static int is_exported(const char *name, unsigned long value, { const struct kernel_symbol *ks; if (!mod) - ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); + ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab); else - ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); + ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms); + return ks != NULL && kernel_symbol_value(ks) == value; } @@ -3592,7 +3594,7 @@ static int complete_formation(struct module *mod, struct load_info *info) mutex_lock(&module_mutex); /* Find duplicate symbols (must be called under lock). */ - err = verify_export_symbols(mod); + err = verify_exported_symbols(mod); if (err < 0) goto out; @@ -3911,15 +3913,19 @@ static inline int is_arm_mapping_symbol(const char *str) && (str[2] == '\0' || str[2] == '.'); } -static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum) +static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum) { return kallsyms->strtab + kallsyms->symtab[symnum].st_name; } -static const char *get_ksymbol(struct module *mod, - unsigned long addr, - unsigned long *size, - unsigned long *offset) +/* + * Given a module and address, find the corresponding symbol and return its name + * while providing its size and offset if needed. + */ +static const char *find_kallsyms_symbol(struct module *mod, + unsigned long addr, + unsigned long *size, + unsigned long *offset) { unsigned int i, best = 0; unsigned long nextval; @@ -3939,8 +3945,8 @@ static const char *get_ksymbol(struct module *mod, /* We ignore unnamed symbols: they're uninformative * and inserted at a whim. */ - if (*symname(kallsyms, i) == '\0' - || is_arm_mapping_symbol(symname(kallsyms, i))) + if (*kallsyms_symbol_name(kallsyms, i) == '\0' + || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) continue; if (kallsyms->symtab[i].st_value <= addr @@ -3958,7 +3964,8 @@ static const char *get_ksymbol(struct module *mod, *size = nextval - kallsyms->symtab[best].st_value; if (offset) *offset = addr - kallsyms->symtab[best].st_value; - return symname(kallsyms, best); + + return kallsyms_symbol_name(kallsyms, best); } void * __weak dereference_module_function_descriptor(struct module *mod, @@ -3983,7 +3990,8 @@ const char *module_address_lookup(unsigned long addr, if (mod) { if (modname) *modname = mod->name; - ret = get_ksymbol(mod, addr, size, offset); + + ret = find_kallsyms_symbol(mod, addr, size, offset); } /* Make a copy in here where it's safe */ if (ret) { @@ -4006,9 +4014,10 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) if (within_module(addr, mod)) { const char *sym; - sym = get_ksymbol(mod, addr, NULL, NULL); + sym = find_kallsyms_symbol(mod, addr, NULL, NULL); if (!sym) goto out; + strlcpy(symname, sym, KSYM_NAME_LEN); preempt_enable(); return 0; @@ -4031,7 +4040,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, if (within_module(addr, mod)) { const char *sym; - sym = get_ksymbol(mod, addr, size, offset); + sym = find_kallsyms_symbol(mod, addr, size, offset); if (!sym) goto out; if (modname) @@ -4062,7 +4071,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (symnum < kallsyms->num_symtab) { *value = kallsyms->symtab[symnum].st_value; *type = kallsyms->symtab[symnum].st_info; - strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN); + strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); preempt_enable(); @@ -4074,13 +4083,14 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return -ERANGE; } -static unsigned long mod_find_symname(struct module *mod, const char *name) +/* Given a module and name of symbol, find and return the symbol's value */ +static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) { unsigned int i; struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); for (i = 0; i < kallsyms->num_symtab; i++) - if (strcmp(name, symname(kallsyms, i)) == 0 && + if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 && kallsyms->symtab[i].st_shndx != SHN_UNDEF) return kallsyms->symtab[i].st_value; return 0; @@ -4097,12 +4107,12 @@ unsigned long module_kallsyms_lookup_name(const char *name) preempt_disable(); if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { if ((mod = find_module_all(name, colon - name, false)) != NULL) - ret = mod_find_symname(mod, colon+1); + ret = find_kallsyms_symbol_value(mod, colon+1); } else { list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((ret = mod_find_symname(mod, name)) != 0) + if ((ret = find_kallsyms_symbol_value(mod, name)) != 0) break; } } @@ -4131,7 +4141,7 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) continue; - ret = fn(data, symname(kallsyms, i), + ret = fn(data, kallsyms_symbol_name(kallsyms, i), mod, kallsyms->symtab[i].st_value); if (ret != 0) return ret; -- cgit From 96c6935212d632cb78db9149c61005b839e03b75 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 6 Nov 2018 09:38:06 -0500 Subject: PM / QoS: Change to use DEFINE_SHOW_ATTRIBUTE macro Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code. Signed-off-by: Yangtao Li Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 86d72ffb811b..b7a82502857a 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -184,7 +184,7 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) c->target_value = value; } -static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused) +static int pm_qos_debug_show(struct seq_file *s, void *unused) { struct pm_qos_object *qos = (struct pm_qos_object *)s->private; struct pm_qos_constraints *c; @@ -245,18 +245,7 @@ out: return 0; } -static int pm_qos_dbg_open(struct inode *inode, struct file *file) -{ - return single_open(file, pm_qos_dbg_show_requests, - inode->i_private); -} - -static const struct file_operations pm_qos_debug_fops = { - .open = pm_qos_dbg_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(pm_qos_debug); /** * pm_qos_update_target - manages the constraints list and calls the notifiers -- cgit From c43ac4a5301986c015137bb89568979f9b3264ca Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Nov 2018 09:36:37 -0500 Subject: tracing: Do not line wrap short line in function_graph_enter() Commit 588ca1786f2dd ("function_graph: Use new curr_ret_depth to manage depth instead of curr_ret_stack") removed a parameter from the call ftrace_push_return_trace() that made it so that the entire call was under 80 characters, but it did not remove the line break. There's no reason to break that line up, so make it a single line. Link: http://lkml.kernel.org/r/20181122100322.GN2131@hirez.programming.kicks-ass.net Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions_graph.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 086af4f5c3e8..0d235e44d08e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -188,8 +188,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, trace.func = func; trace.depth = ++current->curr_ret_depth; - if (ftrace_push_return_trace(ret, func, - frame_pointer, retp)) + if (ftrace_push_return_trace(ret, func, frame_pointer, retp)) goto out; /* Only trace if the calling function expects to */ -- cgit From d864a3ca883095aa12575b84841ebd52b3d808fa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 12 Nov 2018 15:21:22 -0500 Subject: fgraph: Create a fgraph.c file to store function graph infrastructure As the function graph infrastructure can be used by thing other than tracing, moving the code to its own file out of the trace_functions_graph.c code makes more sense. The fgraph.c file will only contain the infrastructure required to hook into functions and their return code. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Makefile | 1 + kernel/trace/fgraph.c | 232 +++++++++++++++++++++++++++++++++++ kernel/trace/trace_functions_graph.c | 220 --------------------------------- 3 files changed, 233 insertions(+), 220 deletions(-) create mode 100644 kernel/trace/fgraph.c (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f81dadbc7c4a..c7ade7965464 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += fgraph.o ifeq ($(CONFIG_BLOCK),y) obj-$(CONFIG_EVENT_TRACING) += blktrace.o endif diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c new file mode 100644 index 000000000000..5ad9c0e88b80 --- /dev/null +++ b/kernel/trace/fgraph.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Infrastructure to took into function calls and returns. + * Copyright (c) 2008-2009 Frederic Weisbecker + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt + * + * Highly modified by Steven Rostedt (VMware). + */ +#include + +#include "trace.h" + +static bool kill_ftrace_graph; + +/** + * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called + * + * ftrace_graph_stop() is called when a severe error is detected in + * the function graph tracing. This function is called by the critical + * paths of function graph to keep those paths from doing any more harm. + */ +bool ftrace_graph_is_dead(void) +{ + return kill_ftrace_graph; +} + +/** + * ftrace_graph_stop - set to permanently disable function graph tracincg + * + * In case of an error int function graph tracing, this is called + * to try to keep function graph tracing from causing any more harm. + * Usually this is pretty severe and this is called to try to at least + * get a warning out to the user. + */ +void ftrace_graph_stop(void) +{ + kill_ftrace_graph = true; +} + +/* Add a function return address to the trace stack on thread info.*/ +static int +ftrace_push_return_trace(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp) +{ + unsigned long long calltime; + int index; + + if (unlikely(ftrace_graph_is_dead())) + return -EBUSY; + + if (!current->ret_stack) + return -EBUSY; + + /* + * We must make sure the ret_stack is tested before we read + * anything else. + */ + smp_rmb(); + + /* The return trace stack is full */ + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); + return -EBUSY; + } + + /* + * The curr_ret_stack is an index to ftrace return stack of + * current task. Its value should be in [0, FTRACE_RETFUNC_ + * DEPTH) when the function graph tracer is used. To support + * filtering out specific functions, it makes the index + * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) + * so when it sees a negative index the ftrace will ignore + * the record. And the index gets recovered when returning + * from the filtered function by adding the FTRACE_NOTRACE_ + * DEPTH and then it'll continue to record functions normally. + * + * The curr_ret_stack is initialized to -1 and get increased + * in this function. So it can be less than -1 only if it was + * filtered out via ftrace_graph_notrace_addr() which can be + * set from set_graph_notrace file in tracefs by user. + */ + if (current->curr_ret_stack < -1) + return -EBUSY; + + calltime = trace_clock_local(); + + index = ++current->curr_ret_stack; + if (ftrace_graph_notrace_addr(func)) + current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; + barrier(); + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = calltime; +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST + current->ret_stack[index].fp = frame_pointer; +#endif +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + current->ret_stack[index].retp = retp; +#endif + return 0; +} + +int function_graph_enter(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp) +{ + struct ftrace_graph_ent trace; + + trace.func = func; + trace.depth = ++current->curr_ret_depth; + + if (ftrace_push_return_trace(ret, func, frame_pointer, retp)) + goto out; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) + goto out_ret; + + return 0; + out_ret: + current->curr_ret_stack--; + out: + current->curr_ret_depth--; + return -EBUSY; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void +ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, + unsigned long frame_pointer) +{ + int index; + + index = current->curr_ret_stack; + + /* + * A negative index here means that it's just returned from a + * notrace'd function. Recover index to get an original + * return address. See ftrace_push_return_trace(). + * + * TODO: Need to check whether the stack gets corrupted. + */ + if (index < 0) + index += FTRACE_NOTRACE_DEPTH; + + if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have no where to go */ + *ret = (unsigned long)panic; + return; + } + +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST + /* + * The arch may choose to record the frame pointer used + * and check it here to make sure that it is what we expect it + * to be. If gcc does not set the place holder of the return + * address in the frame pointer, and does a copy instead, then + * the function graph trace will fail. This test detects this + * case. + * + * Currently, x86_32 with optimize for size (-Os) makes the latest + * gcc do the above. + * + * Note, -mfentry does not use frame pointers, and this test + * is not needed if CC_USING_FENTRY is set. + */ + if (unlikely(current->ret_stack[index].fp != frame_pointer)) { + ftrace_graph_stop(); + WARN(1, "Bad frame pointer: expected %lx, received %lx\n" + " from func %ps return to %lx\n", + current->ret_stack[index].fp, + frame_pointer, + (void *)current->ret_stack[index].func, + current->ret_stack[index].ret); + *ret = (unsigned long)panic; + return; + } +#endif + + *ret = current->ret_stack[index].ret; + trace->func = current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(¤t->trace_overrun); + trace->depth = current->curr_ret_depth--; + /* + * We still want to trace interrupts coming in if + * max_depth is set to 1. Make sure the decrement is + * seen before ftrace_graph_return. + */ + barrier(); +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +{ + struct ftrace_graph_ret trace; + unsigned long ret; + + ftrace_pop_return_trace(&trace, &ret, frame_pointer); + trace.rettime = trace_clock_local(); + ftrace_graph_return(&trace); + /* + * The ftrace_graph_return() may still access the current + * ret_stack structure, we need to make sure the update of + * curr_ret_stack is after that. + */ + barrier(); + current->curr_ret_stack--; + /* + * The curr_ret_stack can be less than -1 only if it was + * filtered out and it's about to return from the function. + * Recover the index and continue to trace normal functions. + */ + if (current->curr_ret_stack < -1) { + current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; + return ret; + } + + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? */ + ret = (unsigned long)panic; + } + + return ret; +} diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0d235e44d08e..b846d82c2f95 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -16,33 +16,6 @@ #include "trace.h" #include "trace_output.h" -static bool kill_ftrace_graph; - -/** - * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called - * - * ftrace_graph_stop() is called when a severe error is detected in - * the function graph tracing. This function is called by the critical - * paths of function graph to keep those paths from doing any more harm. - */ -bool ftrace_graph_is_dead(void) -{ - return kill_ftrace_graph; -} - -/** - * ftrace_graph_stop - set to permanently disable function graph tracincg - * - * In case of an error int function graph tracing, this is called - * to try to keep function graph tracing from causing any more harm. - * Usually this is pretty severe and this is called to try to at least - * get a warning out to the user. - */ -void ftrace_graph_stop(void) -{ - kill_ftrace_graph = true; -} - /* When set, irq functions will be ignored */ static int ftrace_graph_skip_irqs; @@ -117,199 +90,6 @@ static void print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); -/* Add a function return address to the trace stack on thread info.*/ -static int -ftrace_push_return_trace(unsigned long ret, unsigned long func, - unsigned long frame_pointer, unsigned long *retp) -{ - unsigned long long calltime; - int index; - - if (unlikely(ftrace_graph_is_dead())) - return -EBUSY; - - if (!current->ret_stack) - return -EBUSY; - - /* - * We must make sure the ret_stack is tested before we read - * anything else. - */ - smp_rmb(); - - /* The return trace stack is full */ - if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { - atomic_inc(¤t->trace_overrun); - return -EBUSY; - } - - /* - * The curr_ret_stack is an index to ftrace return stack of - * current task. Its value should be in [0, FTRACE_RETFUNC_ - * DEPTH) when the function graph tracer is used. To support - * filtering out specific functions, it makes the index - * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) - * so when it sees a negative index the ftrace will ignore - * the record. And the index gets recovered when returning - * from the filtered function by adding the FTRACE_NOTRACE_ - * DEPTH and then it'll continue to record functions normally. - * - * The curr_ret_stack is initialized to -1 and get increased - * in this function. So it can be less than -1 only if it was - * filtered out via ftrace_graph_notrace_addr() which can be - * set from set_graph_notrace file in tracefs by user. - */ - if (current->curr_ret_stack < -1) - return -EBUSY; - - calltime = trace_clock_local(); - - index = ++current->curr_ret_stack; - if (ftrace_graph_notrace_addr(func)) - current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; - barrier(); - current->ret_stack[index].ret = ret; - current->ret_stack[index].func = func; - current->ret_stack[index].calltime = calltime; -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - current->ret_stack[index].fp = frame_pointer; -#endif -#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR - current->ret_stack[index].retp = retp; -#endif - return 0; -} - -int function_graph_enter(unsigned long ret, unsigned long func, - unsigned long frame_pointer, unsigned long *retp) -{ - struct ftrace_graph_ent trace; - - trace.func = func; - trace.depth = ++current->curr_ret_depth; - - if (ftrace_push_return_trace(ret, func, frame_pointer, retp)) - goto out; - - /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) - goto out_ret; - - return 0; - out_ret: - current->curr_ret_stack--; - out: - current->curr_ret_depth--; - return -EBUSY; -} - -/* Retrieve a function return address to the trace stack on thread info.*/ -static void -ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, - unsigned long frame_pointer) -{ - int index; - - index = current->curr_ret_stack; - - /* - * A negative index here means that it's just returned from a - * notrace'd function. Recover index to get an original - * return address. See ftrace_push_return_trace(). - * - * TODO: Need to check whether the stack gets corrupted. - */ - if (index < 0) - index += FTRACE_NOTRACE_DEPTH; - - if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { - ftrace_graph_stop(); - WARN_ON(1); - /* Might as well panic, otherwise we have no where to go */ - *ret = (unsigned long)panic; - return; - } - -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - /* - * The arch may choose to record the frame pointer used - * and check it here to make sure that it is what we expect it - * to be. If gcc does not set the place holder of the return - * address in the frame pointer, and does a copy instead, then - * the function graph trace will fail. This test detects this - * case. - * - * Currently, x86_32 with optimize for size (-Os) makes the latest - * gcc do the above. - * - * Note, -mfentry does not use frame pointers, and this test - * is not needed if CC_USING_FENTRY is set. - */ - if (unlikely(current->ret_stack[index].fp != frame_pointer)) { - ftrace_graph_stop(); - WARN(1, "Bad frame pointer: expected %lx, received %lx\n" - " from func %ps return to %lx\n", - current->ret_stack[index].fp, - frame_pointer, - (void *)current->ret_stack[index].func, - current->ret_stack[index].ret); - *ret = (unsigned long)panic; - return; - } -#endif - - *ret = current->ret_stack[index].ret; - trace->func = current->ret_stack[index].func; - trace->calltime = current->ret_stack[index].calltime; - trace->overrun = atomic_read(¤t->trace_overrun); - trace->depth = current->curr_ret_depth--; - /* - * We still want to trace interrupts coming in if - * max_depth is set to 1. Make sure the decrement is - * seen before ftrace_graph_return. - */ - barrier(); -} - -/* - * Send the trace to the ring-buffer. - * @return the original return address. - */ -unsigned long ftrace_return_to_handler(unsigned long frame_pointer) -{ - struct ftrace_graph_ret trace; - unsigned long ret; - - ftrace_pop_return_trace(&trace, &ret, frame_pointer); - trace.rettime = trace_clock_local(); - ftrace_graph_return(&trace); - /* - * The ftrace_graph_return() may still access the current - * ret_stack structure, we need to make sure the update of - * curr_ret_stack is after that. - */ - barrier(); - current->curr_ret_stack--; - /* - * The curr_ret_stack can be less than -1 only if it was - * filtered out and it's about to return from the function. - * Recover the index and continue to trace normal functions. - */ - if (current->curr_ret_stack < -1) { - current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; - return ret; - } - - if (unlikely(!ret)) { - ftrace_graph_stop(); - WARN_ON(1); - /* Might as well panic. What else to do? */ - ret = (unsigned long)panic; - } - - return ret; -} - /** * ftrace_graph_ret_addr - convert a potentially modified stack return address * to its original value -- cgit From 9cd2992f2d6c8df54c5b937d5d1f8a23b684cc1d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 14 Nov 2018 13:14:58 -0500 Subject: fgraph: Have set_graph_notrace only affect function_graph tracer In order to make the function graph infrastructure more generic, there can not be code specific for the function_graph tracer in the generic code. This includes the set_graph_notrace logic, that stops all graph calls when a function in the set_graph_notrace is hit. By using the trace_recursion mask, we can use a bit in the current task_struct to implement the notrace code, and move the logic out of fgraph.c and into trace_functions_graph.c and keeps it affecting only the tracer and not all call graph callbacks. Acked-by: Namhyung Kim Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 21 --------------------- kernel/trace/trace.h | 7 +++++++ kernel/trace/trace_functions_graph.c | 22 ++++++++++++++++++++++ 3 files changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 5ad9c0e88b80..e852b69c0e64 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -64,30 +64,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, return -EBUSY; } - /* - * The curr_ret_stack is an index to ftrace return stack of - * current task. Its value should be in [0, FTRACE_RETFUNC_ - * DEPTH) when the function graph tracer is used. To support - * filtering out specific functions, it makes the index - * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) - * so when it sees a negative index the ftrace will ignore - * the record. And the index gets recovered when returning - * from the filtered function by adding the FTRACE_NOTRACE_ - * DEPTH and then it'll continue to record functions normally. - * - * The curr_ret_stack is initialized to -1 and get increased - * in this function. So it can be less than -1 only if it was - * filtered out via ftrace_graph_notrace_addr() which can be - * set from set_graph_notrace file in tracefs by user. - */ - if (current->curr_ret_stack < -1) - return -EBUSY; - calltime = trace_clock_local(); index = ++current->curr_ret_stack; - if (ftrace_graph_notrace_addr(func)) - current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 447bd96ee658..f67060a75f38 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -534,6 +534,13 @@ enum { TRACE_GRAPH_DEPTH_START_BIT, TRACE_GRAPH_DEPTH_END_BIT, + + /* + * To implement set_graph_notrace, if this bit is set, we ignore + * function graph tracing of called functions, until the return + * function is called to clear it. + */ + TRACE_GRAPH_NOTRACE_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b846d82c2f95..ecf543df943b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -188,6 +188,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; + if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) + return 0; + + if (ftrace_graph_notrace_addr(trace->func)) { + trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT); + /* + * Need to return 1 to have the return called + * that will clear the NOTRACE bit. + */ + return 1; + } + if (!ftrace_trace_task(tr)) return 0; @@ -290,6 +302,11 @@ void trace_graph_return(struct ftrace_graph_ret *trace) ftrace_graph_addr_finish(trace); + if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) { + trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT); + return; + } + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -315,6 +332,11 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) { ftrace_graph_addr_finish(trace); + if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) { + trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT); + return; + } + if (tracing_thresh && (trace->rettime - trace->calltime < tracing_thresh)) return; -- cgit From e9ee9efc0d176512cdce9d27ff8549d7ffa2bfcd Mon Sep 17 00:00:00 2001 From: David Miller Date: Fri, 30 Nov 2018 21:08:14 -0800 Subject: bpf: Add BPF_F_ANY_ALIGNMENT. Often we want to write tests cases that check things like bad context offset accesses. And one way to do this is to use an odd offset on, for example, a 32-bit load. This unfortunately triggers the alignment checks first on platforms that do not set CONFIG_EFFICIENT_UNALIGNED_ACCESS. So the test case see the alignment failure rather than what it was testing for. It is often not completely possible to respect the original intention of the test, or even test the same exact thing, while solving the alignment issue. Another option could have been to check the alignment after the context and other validations are performed by the verifier, but that is a non-trivial change to the verifier. Signed-off-by: David S. Miller Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 14 ++++++++++++++ kernel/bpf/syscall.c | 7 ++++++- kernel/bpf/verifier.c | 2 ++ tools/include/uapi/linux/bpf.h | 14 ++++++++++++++ tools/lib/bpf/bpf.c | 8 ++++---- tools/lib/bpf/bpf.h | 2 +- tools/testing/selftests/bpf/test_align.c | 4 ++-- tools/testing/selftests/bpf/test_verifier.c | 3 ++- 8 files changed, 45 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 597afdbc1ab9..8050caea7495 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -232,6 +232,20 @@ enum bpf_attach_type { */ #define BPF_F_STRICT_ALIGNMENT (1U << 0) +/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the + * verifier will allow any alignment whatsoever. On platforms + * with strict alignment requirements for loads ands stores (such + * as sparc and mips) the verifier validates that all loads and + * stores provably follow this requirement. This flag turns that + * checking and enforcement off. + * + * It is mostly used for testing when we want to validate the + * context and memory access aspects of the verifier, but because + * of an unaligned access the alignment check would trigger before + * the one we are interested in. + */ +#define BPF_F_ANY_ALIGNMENT (1U << 1) + /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ #define BPF_PSEUDO_MAP_FD 1 diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 85cbeec06e50..f9554d9a14e1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1452,9 +1452,14 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; - if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT) + if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT)) return -EINVAL; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* copy eBPF program license from user space */ if (strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9584438fa2cc..71988337ac14 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6505,6 +6505,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; + if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) + env->strict_alignment = false; ret = replace_map_fd_with_map_ptr(env); if (ret < 0) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 597afdbc1ab9..8050caea7495 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -232,6 +232,20 @@ enum bpf_attach_type { */ #define BPF_F_STRICT_ALIGNMENT (1U << 0) +/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the + * verifier will allow any alignment whatsoever. On platforms + * with strict alignment requirements for loads ands stores (such + * as sparc and mips) the verifier validates that all loads and + * stores provably follow this requirement. This flag turns that + * checking and enforcement off. + * + * It is mostly used for testing when we want to validate the + * context and memory access aspects of the verifier, but because + * of an unaligned access the alignment check would trigger before + * the one we are interested in. + */ +#define BPF_F_ANY_ALIGNMENT (1U << 1) + /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ #define BPF_PSEUDO_MAP_FD 1 diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index ce1822194590..c19226cccf39 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -279,9 +279,9 @@ int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, } int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, int strict_alignment, - const char *license, __u32 kern_version, - char *log_buf, size_t log_buf_sz, int log_level) + size_t insns_cnt, __u32 prog_flags, const char *license, + __u32 kern_version, char *log_buf, size_t log_buf_sz, + int log_level) { union bpf_attr attr; @@ -295,7 +295,7 @@ int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, attr.log_level = log_level; log_buf[0] = 0; attr.kern_version = kern_version; - attr.prog_flags = strict_alignment ? BPF_F_STRICT_ALIGNMENT : 0; + attr.prog_flags = prog_flags; return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 09e8bbe111d4..60392b70587c 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -98,7 +98,7 @@ LIBBPF_API int bpf_load_program(enum bpf_prog_type type, char *log_buf, size_t log_buf_sz); LIBBPF_API int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, int strict_alignment, + size_t insns_cnt, __u32 prog_flags, const char *license, __u32 kern_version, char *log_buf, size_t log_buf_sz, int log_level); diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c index 5f377ec53f2f..3c789d03b629 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/test_align.c @@ -620,8 +620,8 @@ static int do_test_single(struct bpf_align_test *test) prog_len = probe_filter_length(prog); fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER, - prog, prog_len, 1, "GPL", 0, - bpf_vlog, sizeof(bpf_vlog), 2); + prog, prog_len, BPF_F_STRICT_ALIGNMENT, + "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 2); if (fd_prog < 0 && test->result != REJECT) { printf("Failed to load program.\n"); printf("%s", bpf_vlog); diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 5eace1f606fb..78e779c35869 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -14275,7 +14275,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv, prog_len = probe_filter_length(prog); fd_prog = bpf_verify_program(prog_type, prog, prog_len, - test->flags & F_LOAD_WITH_STRICT_ALIGNMENT, + test->flags & F_LOAD_WITH_STRICT_ALIGNMENT ? + BPF_F_STRICT_ALIGNMENT : 0, "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 1); expected_ret = unpriv && test->result_unpriv != UNDEF ? -- cgit From b18814e767a445534ab9ccba02e82a31208f85d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 4 Nov 2018 17:27:56 +0100 Subject: dma-direct: provide page based alloc/free helpers Some architectures support remapping highmem into DMA coherent allocations. To use the common code for them we need variants of dma_direct_{alloc,free}_pages that do not use kernel virtual addresses. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- include/linux/dma-direct.h | 3 +++ kernel/dma/direct.c | 32 ++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 9e66bfe369aa..61b78f934f64 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -67,6 +67,9 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); +struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); +void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page); dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 22a12ab5a5e9..680287779b0a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -103,14 +103,13 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask); } -void *dma_direct_alloc_pages(struct device *dev, size_t size, +struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; int page_order = get_order(size); struct page *page = NULL; u64 phys_mask; - void *ret; if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; @@ -150,11 +149,22 @@ again: } } + return page; +} + +void *dma_direct_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +{ + struct page *page; + void *ret; + + page = __dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); if (!page) return NULL; + ret = page_address(page); if (force_dma_unencrypted()) { - set_memory_decrypted((unsigned long)ret, 1 << page_order); + set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); *dma_handle = __phys_to_dma(dev, page_to_phys(page)); } else { *dma_handle = phys_to_dma(dev, page_to_phys(page)); @@ -163,20 +173,22 @@ again: return ret; } -/* - * NOTE: this function must never look at the dma_addr argument, because we want - * to be able to use it as a helper for iommu implementations as well. - */ +void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) +{ + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (!dma_release_from_contiguous(dev, page, count)) + __free_pages(page, get_order(size)); +} + void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned int page_order = get_order(size); if (force_dma_unencrypted()) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); - if (!dma_release_from_contiguous(dev, virt_to_page(cpu_addr), count)) - free_pages((unsigned long)cpu_addr, page_order); + __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); } void *dma_direct_alloc(struct device *dev, size_t size, -- cgit From 704f2c20eaa566f6906e8812b6e2115889bd753d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 22 Sep 2018 20:47:26 +0200 Subject: dma-direct: reject highmem pages from dma_alloc_from_contiguous dma_alloc_from_contiguous can return highmem pages depending on the setup, which a plain non-remapping DMA allocator can't handle. Detect this case and fail the allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 680287779b0a..c49849bcced6 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -162,6 +162,18 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; + if (PageHighMem(page)) { + /* + * Depending on the cma= arguments and per-arch setup + * dma_alloc_from_contiguous could return highmem pages. + * Without remapping there is no way to return them here, + * so log an error and fail. + */ + dev_info(dev, "Rejecting highmem page from CMA.\n"); + __dma_direct_free_pages(dev, size, page); + return NULL; + } + ret = page_address(page); if (force_dma_unencrypted()) { set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); -- cgit From f0edfea8ef93ed6cc5f747c46c85c8e53e0798a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Aug 2018 10:31:08 +0200 Subject: dma-mapping: move the remap helpers to a separate file The dma remap code only makes sense for not cache coherent architectures (or possibly the corner case of highmem CMA allocations) and currently is only used by arm, arm64, csky and xtensa. Split it out into a separate file with a separate Kconfig symbol, which gets the right copyright notice given that this code was written by Laura Abbott working for Code Aurora at that point. Signed-off-by: Christoph Hellwig Acked-by: Laura Abbott Reviewed-by: Robin Murphy --- arch/arm/Kconfig | 1 + arch/arm64/Kconfig | 1 + arch/csky/Kconfig | 1 + arch/xtensa/Kconfig | 1 + kernel/dma/Kconfig | 4 +++ kernel/dma/Makefile | 2 +- kernel/dma/mapping.c | 84 ------------------------------------------------- kernel/dma/remap.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 97 insertions(+), 85 deletions(-) create mode 100644 kernel/dma/remap.c (limited to 'kernel') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 91be74d8df65..3b2852df6eb3 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -30,6 +30,7 @@ config ARM select CPU_PM if (SUSPEND || CPU_IDLE) select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS select DMA_DIRECT_OPS if !MMU + select DMA_REMAP if MMU select EDAC_SUPPORT select EDAC_ATOMIC_SCRUB select GENERIC_ALLOCATOR diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 787d7850e064..5d065acb6d10 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -82,6 +82,7 @@ config ARM64 select CRC32 select DCACHE_WORD_ACCESS select DMA_DIRECT_OPS + select DMA_REMAP select EDAC_SUPPORT select FRAME_POINTER select GENERIC_ALLOCATOR diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index cb64f8dacd08..8a30e006a845 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -9,6 +9,7 @@ config CSKY select CLKSRC_OF select DMA_DIRECT_OPS select DMA_NONCOHERENT_OPS + select DMA_REMAP select IRQ_DOMAIN select HANDLE_DOMAIN_IRQ select DW_APB_TIMER_OF diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index d29b7365da8d..239bfb16c58b 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -11,6 +11,7 @@ config XTENSA select CLONE_BACKWARDS select COMMON_CLK select DMA_DIRECT_OPS + select DMA_REMAP if MMU select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS select GENERIC_IRQ_SHOW diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 645c7a2ecde8..c92e08173ed8 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -51,3 +51,7 @@ config SWIOTLB bool select DMA_DIRECT_OPS select NEED_DMA_MAP_STATE + +config DMA_REMAP + depends on MMU + bool diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 7d581e4eea4a..f4feeceb8020 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_DMA_DIRECT_OPS) += direct.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o - +obj-$(CONFIG_DMA_REMAP) += remap.o diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 58dec7a92b7b..dfbc3deb95cd 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -262,87 +262,3 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, #endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ } EXPORT_SYMBOL(dma_common_mmap); - -#ifdef CONFIG_MMU -static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, unsigned long vm_flags, pgprot_t prot, - const void *caller) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, vm_flags, caller); - if (!area) - return NULL; - - if (map_vm_area(area, prot, pages)) { - vunmap(area->addr); - return NULL; - } - - return area; -} - -/* - * remaps an array of PAGE_SIZE pages into another vm_area - * Cannot be used in non-sleeping contexts - */ -void *dma_common_pages_remap(struct page **pages, size_t size, - unsigned long vm_flags, pgprot_t prot, - const void *caller) -{ - struct vm_struct *area; - - area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); - if (!area) - return NULL; - - area->pages = pages; - - return area->addr; -} - -/* - * remaps an allocated contiguous region into another vm_area. - * Cannot be used in non-sleeping contexts - */ - -void *dma_common_contiguous_remap(struct page *page, size_t size, - unsigned long vm_flags, - pgprot_t prot, const void *caller) -{ - int i; - struct page **pages; - struct vm_struct *area; - - pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); - if (!pages) - return NULL; - - for (i = 0; i < (size >> PAGE_SHIFT); i++) - pages[i] = nth_page(page, i); - - area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); - - kfree(pages); - - if (!area) - return NULL; - return area->addr; -} - -/* - * unmaps a range previously mapped by dma_common_*_remap - */ -void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) -{ - struct vm_struct *area = find_vm_area(cpu_addr); - - if (!area || (area->flags & vm_flags) != vm_flags) { - WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); - return; - } - - unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); - vunmap(cpu_addr); -} -#endif diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c new file mode 100644 index 000000000000..a15c393ea4e5 --- /dev/null +++ b/kernel/dma/remap.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2014 The Linux Foundation + */ +#include +#include +#include + +static struct vm_struct *__dma_common_pages_remap(struct page **pages, + size_t size, unsigned long vm_flags, pgprot_t prot, + const void *caller) +{ + struct vm_struct *area; + + area = get_vm_area_caller(size, vm_flags, caller); + if (!area) + return NULL; + + if (map_vm_area(area, prot, pages)) { + vunmap(area->addr); + return NULL; + } + + return area; +} + +/* + * Remaps an array of PAGE_SIZE pages into another vm_area. + * Cannot be used in non-sleeping contexts + */ +void *dma_common_pages_remap(struct page **pages, size_t size, + unsigned long vm_flags, pgprot_t prot, + const void *caller) +{ + struct vm_struct *area; + + area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); + if (!area) + return NULL; + + area->pages = pages; + + return area->addr; +} + +/* + * Remaps an allocated contiguous region into another vm_area. + * Cannot be used in non-sleeping contexts + */ +void *dma_common_contiguous_remap(struct page *page, size_t size, + unsigned long vm_flags, + pgprot_t prot, const void *caller) +{ + int i; + struct page **pages; + struct vm_struct *area; + + pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); + if (!pages) + return NULL; + + for (i = 0; i < (size >> PAGE_SHIFT); i++) + pages[i] = nth_page(page, i); + + area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); + + kfree(pages); + + if (!area) + return NULL; + return area->addr; +} + +/* + * Unmaps a range previously mapped by dma_common_*_remap + */ +void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) +{ + struct vm_struct *area = find_vm_area(cpu_addr); + + if (!area || (area->flags & vm_flags) != vm_flags) { + WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); + return; + } + + unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); + vunmap(cpu_addr); +} -- cgit From 0c3b3171ceccb8830c2bb5adff1b4e9b204c1450 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 4 Nov 2018 20:29:28 +0100 Subject: dma-mapping: move the arm64 noncoherent alloc/free support to common code The arm64 codebase to implement coherent dma allocation for architectures with non-coherent DMA is a good start for a generic implementation, given that is uses the generic remap helpers, provides the atomic pool for allocations that can't sleep and still is realtively simple and well tested. Move it to kernel/dma and allow architectures to opt into it using a config symbol. Architectures just need to provide a new arch_dma_prep_coherent helper to writeback an invalidate the caches for any memory that gets remapped for uncached access. Signed-off-by: Christoph Hellwig Reviewed-by: Will Deacon Reviewed-by: Robin Murphy --- arch/arm64/Kconfig | 2 +- arch/arm64/mm/dma-mapping.c | 184 +++------------------------------------- include/linux/dma-mapping.h | 5 ++ include/linux/dma-noncoherent.h | 2 + kernel/dma/Kconfig | 5 ++ kernel/dma/remap.c | 158 +++++++++++++++++++++++++++++++++- 6 files changed, 180 insertions(+), 176 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5d065acb6d10..2e645ea693ea 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -82,7 +82,7 @@ config ARM64 select CRC32 select DCACHE_WORD_ACCESS select DMA_DIRECT_OPS - select DMA_REMAP + select DMA_DIRECT_REMAP select EDAC_SUPPORT select FRAME_POINTER select GENERIC_ALLOCATOR diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index a3ac26284845..e2e7e5d0f94e 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -33,113 +33,6 @@ #include -static struct gen_pool *atomic_pool __ro_after_init; - -#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; - -static int __init early_coherent_pool(char *p) -{ - atomic_pool_size = memparse(p, &p); - return 0; -} -early_param("coherent_pool", early_coherent_pool); - -static void *__alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) -{ - unsigned long val; - void *ptr = NULL; - - if (!atomic_pool) { - WARN(1, "coherent pool not initialised!\n"); - return NULL; - } - - val = gen_pool_alloc(atomic_pool, size); - if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); - - *ret_page = phys_to_page(phys); - ptr = (void *)val; - memset(ptr, 0, size); - } - - return ptr; -} - -static bool __in_atomic_pool(void *start, size_t size) -{ - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); -} - -static int __free_from_pool(void *start, size_t size) -{ - if (!__in_atomic_pool(start, size)) - return 0; - - gen_pool_free(atomic_pool, (unsigned long)start, size); - - return 1; -} - -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flags, unsigned long attrs) -{ - struct page *page; - void *ptr, *coherent_ptr; - pgprot_t prot = pgprot_writecombine(PAGE_KERNEL); - - size = PAGE_ALIGN(size); - - if (!gfpflags_allow_blocking(flags)) { - struct page *page = NULL; - void *addr = __alloc_from_pool(size, &page, flags); - - if (addr) - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - - return addr; - } - - ptr = dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); - if (!ptr) - goto no_mem; - - /* remove any dirty cache lines on the kernel alias */ - __dma_flush_area(ptr, size); - - /* create a coherent mapping */ - page = virt_to_page(ptr); - coherent_ptr = dma_common_contiguous_remap(page, size, VM_USERMAP, - prot, __builtin_return_address(0)); - if (!coherent_ptr) - goto no_map; - - return coherent_ptr; - -no_map: - dma_direct_free_pages(dev, size, ptr, *dma_handle, attrs); -no_mem: - return NULL; -} - -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - if (!__free_from_pool(vaddr, PAGE_ALIGN(size))) { - void *kaddr = phys_to_virt(dma_to_phys(dev, dma_handle)); - - vunmap(vaddr); - dma_direct_free_pages(dev, size, kaddr, dma_handle, attrs); - } -} - -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return __phys_to_pfn(dma_to_phys(dev, dma_addr)); -} - pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { @@ -160,6 +53,11 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, __dma_unmap_area(phys_to_virt(paddr), size, dir); } +void arch_dma_prep_coherent(struct page *page, size_t size) +{ + __dma_flush_area(page_address(page), size); +} + #ifdef CONFIG_IOMMU_DMA static int __swiotlb_get_sgtable_page(struct sg_table *sgt, struct page *page, size_t size) @@ -191,67 +89,6 @@ static int __swiotlb_mmap_pfn(struct vm_area_struct *vma, } #endif /* CONFIG_IOMMU_DMA */ -static int __init atomic_pool_init(void) -{ - pgprot_t prot = __pgprot(PROT_NORMAL_NC); - unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; - struct page *page; - void *addr; - unsigned int pool_size_order = get_order(atomic_pool_size); - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, false); - else - page = alloc_pages(GFP_DMA32, pool_size_order); - - if (page) { - int ret; - void *page_addr = page_address(page); - - memset(page_addr, 0, atomic_pool_size); - __dma_flush_area(page_addr, atomic_pool_size); - - atomic_pool = gen_pool_create(PAGE_SHIFT, -1); - if (!atomic_pool) - goto free_page; - - addr = dma_common_contiguous_remap(page, atomic_pool_size, - VM_USERMAP, prot, atomic_pool_init); - - if (!addr) - goto destroy_genpool; - - ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, - page_to_phys(page), - atomic_pool_size, -1); - if (ret) - goto remove_mapping; - - gen_pool_set_algo(atomic_pool, - gen_pool_first_fit_order_align, - NULL); - - pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", - atomic_pool_size / 1024); - return 0; - } - goto out; - -remove_mapping: - dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP); -destroy_genpool: - gen_pool_destroy(atomic_pool); - atomic_pool = NULL; -free_page: - if (!dma_release_from_contiguous(NULL, page, nr_pages)) - __free_pages(page, pool_size_order); -out: - pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", - atomic_pool_size / 1024); - return -ENOMEM; -} - /******************************************** * The following APIs are for dummy DMA ops * ********************************************/ @@ -350,8 +187,7 @@ static int __init arm64_dma_init(void) TAINT_CPU_OUT_OF_SPEC, "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)", ARCH_DMA_MINALIGN, cache_line_size()); - - return atomic_pool_init(); + return dma_atomic_pool_init(GFP_DMA32, __pgprot(PROT_NORMAL_NC)); } arch_initcall(arm64_dma_init); @@ -397,7 +233,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size, page = alloc_pages(gfp, get_order(size)); addr = page ? page_address(page) : NULL; } else { - addr = __alloc_from_pool(size, &page, gfp); + addr = dma_alloc_from_pool(size, &page, gfp); } if (!addr) return NULL; @@ -407,7 +243,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size, if (coherent) __free_pages(page, get_order(size)); else - __free_from_pool(addr, size); + dma_free_from_pool(addr, size); addr = NULL; } } else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { @@ -471,9 +307,9 @@ static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr, * coherent devices. * Hence how dodgy the below logic looks... */ - if (__in_atomic_pool(cpu_addr, size)) { + if (dma_in_atomic_pool(cpu_addr, size)) { iommu_dma_unmap_page(dev, handle, iosize, 0, 0); - __free_from_pool(cpu_addr, size); + dma_free_from_pool(cpu_addr, size); } else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { struct page *page = vmalloc_to_page(cpu_addr); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 0f81c713f6e9..1a0edcde7d14 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -455,6 +455,11 @@ void *dma_common_pages_remap(struct page **pages, size_t size, const void *caller); void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags); +int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot); +bool dma_in_atomic_pool(void *start, size_t size); +void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags); +bool dma_free_from_pool(void *start, size_t size); + /** * dma_mmap_attrs - map a coherent DMA allocation into user space * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index 9051b055beec..306557331d7d 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -69,4 +69,6 @@ static inline void arch_sync_dma_for_cpu_all(struct device *dev) } #endif /* CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL */ +void arch_dma_prep_coherent(struct page *page, size_t size); + #endif /* _LINUX_DMA_NONCOHERENT_H */ diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index c92e08173ed8..41c3b1df70eb 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -55,3 +55,8 @@ config SWIOTLB config DMA_REMAP depends on MMU bool + +config DMA_DIRECT_REMAP + bool + depends on DMA_DIRECT_OPS + select DMA_REMAP diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index a15c393ea4e5..b32bb08f96ae 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -1,8 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2012 ARM Ltd. * Copyright (c) 2014 The Linux Foundation */ -#include +#include +#include +#include +#include +#include #include #include @@ -86,3 +91,154 @@ void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); vunmap(cpu_addr); } + +#ifdef CONFIG_DMA_DIRECT_REMAP +static struct gen_pool *atomic_pool __ro_after_init; + +#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K +static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; + +static int __init early_coherent_pool(char *p) +{ + atomic_pool_size = memparse(p, &p); + return 0; +} +early_param("coherent_pool", early_coherent_pool); + +int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot) +{ + unsigned int pool_size_order = get_order(atomic_pool_size); + unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; + struct page *page; + void *addr; + int ret; + + if (dev_get_cma_area(NULL)) + page = dma_alloc_from_contiguous(NULL, nr_pages, + pool_size_order, false); + else + page = alloc_pages(gfp, pool_size_order); + if (!page) + goto out; + + memset(page_address(page), 0, atomic_pool_size); + arch_dma_prep_coherent(page, atomic_pool_size); + + atomic_pool = gen_pool_create(PAGE_SHIFT, -1); + if (!atomic_pool) + goto free_page; + + addr = dma_common_contiguous_remap(page, atomic_pool_size, VM_USERMAP, + prot, __builtin_return_address(0)); + if (!addr) + goto destroy_genpool; + + ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr, + page_to_phys(page), atomic_pool_size, -1); + if (ret) + goto remove_mapping; + gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL); + + pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", + atomic_pool_size / 1024); + return 0; + +remove_mapping: + dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP); +destroy_genpool: + gen_pool_destroy(atomic_pool); + atomic_pool = NULL; +free_page: + if (!dma_release_from_contiguous(NULL, page, nr_pages)) + __free_pages(page, pool_size_order); +out: + pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n", + atomic_pool_size / 1024); + return -ENOMEM; +} + +bool dma_in_atomic_pool(void *start, size_t size) +{ + return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); +} + +void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) +{ + unsigned long val; + void *ptr = NULL; + + if (!atomic_pool) { + WARN(1, "coherent pool not initialised!\n"); + return NULL; + } + + val = gen_pool_alloc(atomic_pool, size); + if (val) { + phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val); + + *ret_page = pfn_to_page(__phys_to_pfn(phys)); + ptr = (void *)val; + memset(ptr, 0, size); + } + + return ptr; +} + +bool dma_free_from_pool(void *start, size_t size) +{ + if (!dma_in_atomic_pool(start, size)) + return false; + gen_pool_free(atomic_pool, (unsigned long)start, size); + return true; +} + +void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t flags, unsigned long attrs) +{ + struct page *page = NULL; + void *ret, *kaddr; + + size = PAGE_ALIGN(size); + + if (!gfpflags_allow_blocking(flags)) { + ret = dma_alloc_from_pool(size, &page, flags); + if (!ret) + return NULL; + *dma_handle = phys_to_dma(dev, page_to_phys(page)); + return ret; + } + + kaddr = dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); + if (!kaddr) + return NULL; + page = virt_to_page(kaddr); + + /* remove any dirty cache lines on the kernel alias */ + arch_dma_prep_coherent(page, size); + + /* create a coherent mapping */ + ret = dma_common_contiguous_remap(page, size, VM_USERMAP, + arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), + __builtin_return_address(0)); + if (!ret) + dma_direct_free_pages(dev, size, kaddr, *dma_handle, attrs); + return ret; +} + +void arch_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, unsigned long attrs) +{ + if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { + void *kaddr = phys_to_virt(dma_to_phys(dev, dma_handle)); + + vunmap(vaddr); + dma_direct_free_pages(dev, size, kaddr, dma_handle, attrs); + } +} + +long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, + dma_addr_t dma_addr) +{ + return __phys_to_pfn(dma_to_phys(dev, dma_addr)); +} +#endif /* CONFIG_DMA_DIRECT_REMAP */ -- cgit From bfd56cd605219d90b210a5377fca31a644efe95c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 4 Nov 2018 17:38:39 +0100 Subject: dma-mapping: support highmem in the generic remap allocator By using __dma_direct_alloc_pages we can deal entirely with struct page instead of having to derive a kernel virtual address. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/remap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index b32bb08f96ae..dcc82dd668f8 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -196,7 +196,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags, unsigned long attrs) { struct page *page = NULL; - void *ret, *kaddr; + void *ret; size = PAGE_ALIGN(size); @@ -208,10 +208,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, return ret; } - kaddr = dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); - if (!kaddr) + page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); + if (!page) return NULL; - page = virt_to_page(kaddr); /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); @@ -221,7 +220,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) - dma_direct_free_pages(dev, size, kaddr, *dma_handle, attrs); + __dma_direct_free_pages(dev, size, page); return ret; } @@ -229,10 +228,11 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { - void *kaddr = phys_to_virt(dma_to_phys(dev, dma_handle)); + phys_addr_t phys = dma_to_phys(dev, dma_handle); + struct page *page = pfn_to_page(__phys_to_pfn(phys)); vunmap(vaddr); - dma_direct_free_pages(dev, size, kaddr, dma_handle, attrs); + __dma_direct_free_pages(dev, size, page); } } -- cgit From e440e26a0251b054243b3db6356f3869a9f9c2d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 4 Nov 2018 17:51:50 +0100 Subject: dma-remap: support DMA_ATTR_NO_KERNEL_MAPPING Do not waste vmalloc space on allocations that do not require a mapping into the kernel address space. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/remap.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index dcc82dd668f8..68a64e3ff6a1 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -200,7 +200,8 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, size = PAGE_ALIGN(size); - if (!gfpflags_allow_blocking(flags)) { + if (!gfpflags_allow_blocking(flags) && + !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) { ret = dma_alloc_from_pool(size, &page, flags); if (!ret) return NULL; @@ -215,6 +216,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); + if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) + return page; /* opaque cookie */ + /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, VM_USERMAP, arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), @@ -227,7 +231,10 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { - if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { + if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + /* vaddr is a struct page cookie, not a kernel address */ + __dma_direct_free_pages(dev, size, vaddr); + } else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { phys_addr_t phys = dma_to_phys(dev, dma_handle); struct page *page = pfn_to_page(__phys_to_pfn(phys)); -- cgit From 2af3024cd78f120d027cb44b454186ba9d7dab24 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Nov 2018 14:11:40 -0800 Subject: cgroups: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change, even though it is but a comment. Signed-off-by: Paul E. McKenney Cc: Jens Axboe Cc: Dennis Zhou Cc: Johannes Weiner Cc: "Dennis Zhou (Facebook)" Acked-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6aaf5dd5383b..7a8429f8e280 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5343,7 +5343,7 @@ int __init cgroup_init(void) cgroup_rstat_boot(); /* - * The latency of the synchronize_sched() is too high for cgroups, + * The latency of the synchronize_rcu() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. */ rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); -- cgit From 6932689e4145f545062ca8c86cf76f38854d63d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Nov 2018 14:16:57 -0800 Subject: livepatch: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change, even though it is but a comment. Signed-off-by: Paul E. McKenney --- kernel/livepatch/patch.c | 4 ++-- kernel/livepatch/transition.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 82d584225dc6..7702cb4064fc 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -61,7 +61,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); /* - * A variant of synchronize_sched() is used to allow patching functions + * A variant of synchronize_rcu() is used to allow patching functions * where RCU is not watching, see klp_synchronize_transition(). */ preempt_disable_notrace(); @@ -72,7 +72,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, /* * func should never be NULL because preemption should be disabled here * and unregister_ftrace_function() does the equivalent of a - * synchronize_sched() before the func_stack removal. + * synchronize_rcu() before the func_stack removal. */ if (WARN_ON_ONCE(!func)) goto unlock; diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 5bc349805e03..304d5eb8a98c 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -52,7 +52,7 @@ static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn); /* * This function is just a stub to implement a hard force - * of synchronize_sched(). This requires synchronizing + * of synchronize_rcu(). This requires synchronizing * tasks even in userspace and idle. */ static void klp_sync(struct work_struct *work) @@ -175,7 +175,7 @@ void klp_cancel_transition(void) void klp_update_patch_state(struct task_struct *task) { /* - * A variant of synchronize_sched() is used to allow patching functions + * A variant of synchronize_rcu() is used to allow patching functions * where RCU is not watching, see klp_synchronize_transition(). */ preempt_disable_notrace(); -- cgit From 4871848531af1d62f30032bfb872c43b9afe03ad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 15 Aug 2018 15:32:51 -0700 Subject: rcutorture: Add call_rcu() flooding forward-progress tests This commit adds a call_rcu() flooding loop to the forward-progress test. This emulates tight userspace loops that force call_rcu() invocations, for example, the infamous loop containing close(open()) that instigated the addition of blimit. If RCU does not make sufficient forward progress in invoking the resulting flood of callbacks, rcutorture emits a warning. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 210c77460365..8cf700ca7845 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -259,6 +259,8 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); +static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */ + /* * Allocate an element from the rcu_tortures pool. */ @@ -348,7 +350,8 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { + if (!rcu_fwd_cb_nodelay && + !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK)) @@ -1674,6 +1677,43 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp) cur_ops->call(&fcsp->rh, rcu_torture_fwd_prog_cb); } +/* State for continuous-flood RCU callbacks. */ +struct rcu_fwd_cb { + struct rcu_head rh; + struct rcu_fwd_cb *rfc_next; + int rfc_gps; +}; +static DEFINE_SPINLOCK(rcu_fwd_lock); +static struct rcu_fwd_cb *rcu_fwd_cb_head; +static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head; +static long n_launders_cb; +static unsigned long rcu_fwd_startat; +#define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */ +#define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ +#define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ +static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / HZ]; + +/* Callback function for continuous-flood RCU callbacks. */ +static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) +{ + int i; + struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh); + struct rcu_fwd_cb **rfcpp; + + rfcp->rfc_next = NULL; + rfcp->rfc_gps++; + spin_lock(&rcu_fwd_lock); + rfcpp = rcu_fwd_cb_tail; + rcu_fwd_cb_tail = &rfcp->rfc_next; + WRITE_ONCE(*rfcpp, rfcp); + WRITE_ONCE(n_launders_cb, n_launders_cb + 1); + i = ((jiffies - rcu_fwd_startat) / HZ); + if (i >= ARRAY_SIZE(n_launders_hist)) + i = ARRAY_SIZE(n_launders_hist) - 1; + n_launders_hist[i]++; + spin_unlock(&rcu_fwd_lock); +} + /* Carry out grace-period forward-progress testing. */ static int rcu_torture_fwd_prog(void *args) { @@ -1681,11 +1721,21 @@ static int rcu_torture_fwd_prog(void *args) unsigned long dur; struct fwd_cb_state fcs; unsigned long gps; + int i; int idx; + int j; + long n_launders; + long n_launders_cb_snap; + long n_launders_sa; + long n_max_cbs; + long n_max_gps; + struct rcu_fwd_cb *rfcp; + struct rcu_fwd_cb *rfcpn; int sd; int sd4; bool selfpropcb = false; unsigned long stopat; + unsigned long stoppedat; int tested = 0; int tested_tries = 0; static DEFINE_TORTURE_RANDOM(trs); @@ -1699,6 +1749,8 @@ static int rcu_torture_fwd_prog(void *args) } do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); + + /* Tight loop containing cond_resched(). */ if (selfpropcb) { WRITE_ONCE(fcs.stop, 0); cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); @@ -1708,7 +1760,8 @@ static int rcu_torture_fwd_prog(void *args) sd = cur_ops->stall_dur() + 1; sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; dur = sd4 + torture_random(&trs) % (sd - sd4); - stopat = jiffies + dur; + rcu_fwd_startat = jiffies; + stopat = rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && !torture_must_stop()) { idx = cur_ops->readlock(); udelay(10); @@ -1729,6 +1782,78 @@ static int rcu_torture_fwd_prog(void *args) cur_ops->sync(); /* Wait for running CB to complete. */ cur_ops->cb_barrier(); /* Wait for queued callbacks. */ } + + /* Loop continuously posting RCU callbacks. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, true); + cur_ops->sync(); /* Later readers see above write. */ + rcu_fwd_startat = jiffies; + stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; + n_launders = 0; + n_launders_cb = 0; + n_launders_sa = 0; + n_max_cbs = 0; + n_max_gps = 0; + for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) + n_launders_hist[i] = 0; + cver = READ_ONCE(rcu_torture_current_version); + gps = cur_ops->get_gp_seq(); + while (time_before(jiffies, stopat) && !torture_must_stop()) { + rfcp = READ_ONCE(rcu_fwd_cb_head); + rfcpn = NULL; + if (rfcp) + rfcpn = READ_ONCE(rfcp->rfc_next); + if (rfcpn) { + if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && + ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) + break; + rcu_fwd_cb_head = rfcpn; + n_launders++; + n_launders_sa++; + } else { + rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL); + if (WARN_ON_ONCE(!rfcp)) { + schedule_timeout_interruptible(1); + continue; + } + n_max_cbs++; + n_launders_sa = 0; + rfcp->rfc_gps = 0; + } + cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); + cond_resched(); + } + stoppedat = jiffies; + n_launders_cb_snap = READ_ONCE(n_launders_cb); + cver = READ_ONCE(rcu_torture_current_version) - cver; + gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ + for (;;) { + rfcp = rcu_fwd_cb_head; + if (!rfcp) + break; + rcu_fwd_cb_head = rfcp->rfc_next; + kfree(rfcp); + } + rcu_fwd_cb_tail = &rcu_fwd_cb_head; + WRITE_ONCE(rcu_fwd_cb_nodelay, false); + if (!torture_must_stop()) { + WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); + pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", + __func__, + stoppedat - rcu_fwd_startat, + jiffies - stoppedat, + n_launders + n_max_cbs - n_launders_cb_snap, + n_launders, n_launders_sa, + n_max_gps, n_max_cbs, cver, gps); + for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) + if (n_launders_hist[i] > 0) + break; + pr_alert("Callback-invocation histogram:"); + for (j = 0; j <= i; j++) + pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); + pr_cont("\n"); + } + /* Avoid slow periods, better to test when busy. */ stutter_wait("rcu_torture_fwd_prog"); } while (!torture_must_stop()); -- cgit From 28cf5952f56005325f269ccfe402a880cd741189 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Aug 2018 15:27:16 -0700 Subject: torture: Bring any extra CPUs online during kernel startup Currently, the torture scripts rely on the initrd/init script to bring any extra CPUs online, for example, in the case where the kernel and qemu have different ideas about how many CPUs are present. This works, but is an unnecessary dependency on initrd, which needs to vary depending on the distro. This commit therefore causes torture_onoff() to check for additional CPUs, attempting to bring any found online. Errors are ignored, just as they are by the initrd/init script. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 17d91f5fba2a..9410d1bf84d6 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -194,11 +194,23 @@ torture_onoff(void *arg) int cpu; int maxcpu = -1; DEFINE_TORTURE_RANDOM(rand); + int ret; VERBOSE_TOROUT_STRING("torture_onoff task started"); for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); + if (!IS_MODULE(CONFIG_TORTURE_TEST)) + for_each_possible_cpu(cpu) { + if (cpu_online(cpu)) + continue; + ret = cpu_up(cpu); + if (ret && verbose) { + pr_alert("%s" TORTURE_FLAG + "%s: Initial online %d: errno %d\n", + __func__, torture_type, cpu, ret); + } + } if (maxcpu == 0) { VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); -- cgit From fc6f9c57787e578473d47b7bbc846e317d17c1df Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Aug 2018 14:43:05 -0700 Subject: rcutorture: Remove cbflood facility Now that the forward-progress code does a full-bore continuous callback flood lasting multiple seconds, there is little point in also posting a mere 60,000 callbacks every second or so. This commit therefore removes the old cbflood testing. Over time, it may be desirable to concurrently do full-bore continuous callback floods on all CPUs simultaneously, but one dragon at a time. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 18 ------ kernel/rcu/rcutorture.c | 86 +------------------------ 2 files changed, 1 insertion(+), 103 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3823679deea5..86e825e0927a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3743,24 +3743,6 @@ in microseconds. The default of zero says no holdoff. - rcutorture.cbflood_inter_holdoff= [KNL] - Set holdoff time (jiffies) between successive - callback-flood tests. - - rcutorture.cbflood_intra_holdoff= [KNL] - Set holdoff time (jiffies) between successive - bursts of callbacks within a given callback-flood - test. - - rcutorture.cbflood_n_burst= [KNL] - Set the number of bursts making up a given - callback-flood test. Set this to zero to - disable callback-flood testing. - - rcutorture.cbflood_n_per_burst= [KNL] - Set the number of callbacks to be registered - in a given burst of a callback-flood test. - rcutorture.fqs_duration= [KNL] Set duration of force_quiescent_state bursts in microseconds. diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8cf700ca7845..17f480129a78 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -80,13 +80,6 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett 0 && - cbflood_inter_holdoff > 0 && - cbflood_intra_holdoff > 0 && - cur_ops->call && - cur_ops->cb_barrier) { - rhp = vmalloc(array3_size(cbflood_n_burst, - cbflood_n_per_burst, - sizeof(*rhp))); - err = !rhp; - } - if (err) { - VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); - goto wait_for_stop; - } - VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); - do { - schedule_timeout_interruptible(cbflood_inter_holdoff); - atomic_long_inc(&n_cbfloods); - WARN_ON(signal_pending(current)); - for (i = 0; i < cbflood_n_burst; i++) { - for (j = 0; j < cbflood_n_per_burst; j++) { - cur_ops->call(&rhp[i * cbflood_n_per_burst + j], - rcu_torture_cbflood_cb); - } - schedule_timeout_interruptible(cbflood_intra_holdoff); - WARN_ON(signal_pending(current)); - } - cur_ops->cb_barrier(); - stutter_wait("rcu_torture_cbflood"); - } while (!torture_must_stop()); - vfree(rhp); -wait_for_stop: - torture_kthread_stopping("rcu_torture_cbflood"); - return 0; -} - /* * RCU torture force-quiescent-state kthread. Repeatedly induces * bursts of calls to force_quiescent_state(), increasing the probability @@ -1460,11 +1397,10 @@ rcu_torture_stats_print(void) n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); - pr_cont("barrier: %ld/%ld:%ld ", + pr_cont("barrier: %ld/%ld:%ld\n", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); - pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || @@ -2093,8 +2029,6 @@ rcu_torture_cleanup(void) cur_ops->name, gp_seq, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); - for (i = 0; i < ncbflooders; i++) - torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); if (rcu_torture_can_boost()) cpuhp_remove_state(rcutor_hp); @@ -2377,24 +2311,6 @@ rcu_torture_init(void) goto unwind; if (object_debug) rcu_test_debug_objects(); - if (cbflood_n_burst > 0) { - /* Create the cbflood threads */ - ncbflooders = (num_online_cpus() + 3) / 4; - cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task), - GFP_KERNEL); - if (!cbflood_task) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < ncbflooders; i++) { - firsterr = torture_create_kthread(rcu_torture_cbflood, - NULL, - cbflood_task[i]); - if (firsterr) - goto unwind; - } - } torture_init_end(); return 0; -- cgit From 6b3de7a172bc59010a9d8e425877d98c1f24555e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Aug 2018 14:38:43 -0700 Subject: rcutorture: Break up too-long rcu_torture_fwd_prog() function This commit splits rcu_torture_fwd_prog_nr() and rcu_torture_fwd_prog_cr() functions out of rcu_torture_fwd_prog() in order to reduce indentation pain and because rcu_torture_fwd_prog() was getting a bit too long. In addition, this will enable easier conditional execution of the rcu_torture_fwd_prog_cr() function, which can give false-positive failures in some NO_HZ_FULL configurations due to overloading the housekeeping CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 254 +++++++++++++++++++++++++----------------------- 1 file changed, 135 insertions(+), 119 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 17f480129a78..bcc33bb8d9a6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1650,15 +1650,70 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) spin_unlock(&rcu_fwd_lock); } -/* Carry out grace-period forward-progress testing. */ -static int rcu_torture_fwd_prog(void *args) +/* Carry out need_resched()/cond_resched() forward-progress testing. */ +static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) { unsigned long cver; unsigned long dur; struct fwd_cb_state fcs; unsigned long gps; - int i; int idx; + int sd; + int sd4; + bool selfpropcb = false; + unsigned long stopat; + static DEFINE_TORTURE_RANDOM(trs); + + if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) { + init_rcu_head_on_stack(&fcs.rh); + selfpropcb = true; + } + + /* Tight loop containing cond_resched(). */ + if (selfpropcb) { + WRITE_ONCE(fcs.stop, 0); + cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); + } + cver = READ_ONCE(rcu_torture_current_version); + gps = cur_ops->get_gp_seq(); + sd = cur_ops->stall_dur() + 1; + sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; + dur = sd4 + torture_random(&trs) % (sd - sd4); + rcu_fwd_startat = jiffies; + stopat = rcu_fwd_startat + dur; + while (time_before(jiffies, stopat) && !torture_must_stop()) { + idx = cur_ops->readlock(); + udelay(10); + cur_ops->readunlock(idx); + if (!fwd_progress_need_resched || need_resched()) + cond_resched(); + } + (*tested_tries)++; + if (!time_before(jiffies, stopat) && !torture_must_stop()) { + (*tested)++; + cver = READ_ONCE(rcu_torture_current_version) - cver; + gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + WARN_ON(!cver && gps < 2); + pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); + } + if (selfpropcb) { + WRITE_ONCE(fcs.stop, 1); + cur_ops->sync(); /* Wait for running CB to complete. */ + cur_ops->cb_barrier(); /* Wait for queued callbacks. */ + } + + if (selfpropcb) { + WARN_ON(READ_ONCE(fcs.stop) != 2); + destroy_rcu_head_on_stack(&fcs.rh); + } +} + +/* Carry out call_rcu() forward-progress testing. */ +static void rcu_torture_fwd_prog_cr(void) +{ + unsigned long cver; + unsigned long gps; + int i; int j; long n_launders; long n_launders_cb_snap; @@ -1667,136 +1722,97 @@ static int rcu_torture_fwd_prog(void *args) long n_max_gps; struct rcu_fwd_cb *rfcp; struct rcu_fwd_cb *rfcpn; - int sd; - int sd4; - bool selfpropcb = false; unsigned long stopat; unsigned long stoppedat; + + /* Loop continuously posting RCU callbacks. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, true); + cur_ops->sync(); /* Later readers see above write. */ + rcu_fwd_startat = jiffies; + stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; + n_launders = 0; + n_launders_cb = 0; + n_launders_sa = 0; + n_max_cbs = 0; + n_max_gps = 0; + for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) + n_launders_hist[i] = 0; + cver = READ_ONCE(rcu_torture_current_version); + gps = cur_ops->get_gp_seq(); + while (time_before(jiffies, stopat) && !torture_must_stop()) { + rfcp = READ_ONCE(rcu_fwd_cb_head); + rfcpn = NULL; + if (rfcp) + rfcpn = READ_ONCE(rfcp->rfc_next); + if (rfcpn) { + if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && + ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) + break; + rcu_fwd_cb_head = rfcpn; + n_launders++; + n_launders_sa++; + } else { + rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL); + if (WARN_ON_ONCE(!rfcp)) { + schedule_timeout_interruptible(1); + continue; + } + n_max_cbs++; + n_launders_sa = 0; + rfcp->rfc_gps = 0; + } + cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); + cond_resched(); + } + stoppedat = jiffies; + n_launders_cb_snap = READ_ONCE(n_launders_cb); + cver = READ_ONCE(rcu_torture_current_version) - cver; + gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ + for (;;) { + rfcp = rcu_fwd_cb_head; + if (!rfcp) + break; + rcu_fwd_cb_head = rfcp->rfc_next; + kfree(rfcp); + } + rcu_fwd_cb_tail = &rcu_fwd_cb_head; + WRITE_ONCE(rcu_fwd_cb_nodelay, false); + if (!torture_must_stop()) { + WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); + pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", + __func__, + stoppedat - rcu_fwd_startat, jiffies - stoppedat, + n_launders + n_max_cbs - n_launders_cb_snap, + n_launders, n_launders_sa, + n_max_gps, n_max_cbs, cver, gps); + for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) + if (n_launders_hist[i] > 0) + break; + pr_alert("Callback-invocation histogram:"); + for (j = 0; j <= i; j++) + pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); + pr_cont("\n"); + } +} + +/* Carry out grace-period forward-progress testing. */ +static int rcu_torture_fwd_prog(void *args) +{ int tested = 0; int tested_tries = 0; - static DEFINE_TORTURE_RANDOM(trs); VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started"); if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); - if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) { - init_rcu_head_on_stack(&fcs.rh); - selfpropcb = true; - } do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); - - /* Tight loop containing cond_resched(). */ - if (selfpropcb) { - WRITE_ONCE(fcs.stop, 0); - cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); - } - cver = READ_ONCE(rcu_torture_current_version); - gps = cur_ops->get_gp_seq(); - sd = cur_ops->stall_dur() + 1; - sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; - dur = sd4 + torture_random(&trs) % (sd - sd4); - rcu_fwd_startat = jiffies; - stopat = rcu_fwd_startat + dur; - while (time_before(jiffies, stopat) && !torture_must_stop()) { - idx = cur_ops->readlock(); - udelay(10); - cur_ops->readunlock(idx); - if (!fwd_progress_need_resched || need_resched()) - cond_resched(); - } - tested_tries++; - if (!time_before(jiffies, stopat) && !torture_must_stop()) { - tested++; - cver = READ_ONCE(rcu_torture_current_version) - cver; - gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); - WARN_ON(!cver && gps < 2); - pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); - } - if (selfpropcb) { - WRITE_ONCE(fcs.stop, 1); - cur_ops->sync(); /* Wait for running CB to complete. */ - cur_ops->cb_barrier(); /* Wait for queued callbacks. */ - } - - /* Loop continuously posting RCU callbacks. */ - WRITE_ONCE(rcu_fwd_cb_nodelay, true); - cur_ops->sync(); /* Later readers see above write. */ - rcu_fwd_startat = jiffies; - stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; - n_launders = 0; - n_launders_cb = 0; - n_launders_sa = 0; - n_max_cbs = 0; - n_max_gps = 0; - for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) - n_launders_hist[i] = 0; - cver = READ_ONCE(rcu_torture_current_version); - gps = cur_ops->get_gp_seq(); - while (time_before(jiffies, stopat) && !torture_must_stop()) { - rfcp = READ_ONCE(rcu_fwd_cb_head); - rfcpn = NULL; - if (rfcp) - rfcpn = READ_ONCE(rfcp->rfc_next); - if (rfcpn) { - if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && - ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) - break; - rcu_fwd_cb_head = rfcpn; - n_launders++; - n_launders_sa++; - } else { - rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL); - if (WARN_ON_ONCE(!rfcp)) { - schedule_timeout_interruptible(1); - continue; - } - n_max_cbs++; - n_launders_sa = 0; - rfcp->rfc_gps = 0; - } - cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); - cond_resched(); - } - stoppedat = jiffies; - n_launders_cb_snap = READ_ONCE(n_launders_cb); - cver = READ_ONCE(rcu_torture_current_version) - cver; - gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); - cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ - for (;;) { - rfcp = rcu_fwd_cb_head; - if (!rfcp) - break; - rcu_fwd_cb_head = rfcp->rfc_next; - kfree(rfcp); - } - rcu_fwd_cb_tail = &rcu_fwd_cb_head; - WRITE_ONCE(rcu_fwd_cb_nodelay, false); - if (!torture_must_stop()) { - WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); - pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", - __func__, - stoppedat - rcu_fwd_startat, - jiffies - stoppedat, - n_launders + n_max_cbs - n_launders_cb_snap, - n_launders, n_launders_sa, - n_max_gps, n_max_cbs, cver, gps); - for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) - if (n_launders_hist[i] > 0) - break; - pr_alert("Callback-invocation histogram:"); - for (j = 0; j <= i; j++) - pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); - pr_cont("\n"); - } + rcu_torture_fwd_prog_nr(&tested, &tested_tries); + rcu_torture_fwd_prog_cr(); /* Avoid slow periods, better to test when busy. */ stutter_wait("rcu_torture_fwd_prog"); } while (!torture_must_stop()); - if (selfpropcb) { - WARN_ON(READ_ONCE(fcs.stop) != 2); - destroy_rcu_head_on_stack(&fcs.rh); - } /* Short runs might not contain a valid forward-progress attempt. */ WARN_ON(!tested && tested_tries >= 5); pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); -- cgit From 5ab7ab8362fa8a4f7995d65ea05edf71530e8004 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Sep 2018 18:08:09 -0700 Subject: rcutorture: Affinity forward-progress test to avoid housekeeping CPUs This commit affinities the forward-progress tests to avoid hogging a housekeeping CPU on the theory that the offloaded callbacks will be running on those housekeeping CPUs. Signed-off-by: Paul E. McKenney [ paulmck: Fix NULL-pointer issue located by kbuild test robot. ] Tested-by: Rong Chen --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 1 + kernel/rcu/tree_plugin.h | 11 +++++++++++ 3 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 2866166863f0..0f0f5ae8c3d4 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -539,8 +539,10 @@ extern struct workqueue_struct *rcu_par_gp_wq; #ifdef CONFIG_RCU_NOCB_CPU bool rcu_is_nocb_cpu(int cpu); +void rcu_bind_current_to_nocb(void); #else static inline bool rcu_is_nocb_cpu(int cpu) { return false; } +static inline void rcu_bind_current_to_nocb(void) { } #endif #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bcc33bb8d9a6..36a3bc42782d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1803,6 +1803,7 @@ static int rcu_torture_fwd_prog(void *args) int tested_tries = 0; VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started"); + rcu_bind_current_to_nocb(); if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); do { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 605ff3b06098..25fe26c4e9c1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2603,6 +2603,17 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) return true; } +/* + * Bind the current task to the offloaded CPUs. If there are no offloaded + * CPUs, leave the task unbound. Splat if the bind attempt fails. + */ +void rcu_bind_current_to_nocb(void) +{ + if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) + WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); +} +EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); + #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static bool rcu_nocb_cpu_needs_barrier(int cpu) -- cgit From 2a7d968816a94a4c52f0082c085c6714a5b3f1ec Mon Sep 17 00:00:00 2001 From: Pierce Griffiths Date: Fri, 21 Sep 2018 20:21:31 -0500 Subject: torture: Remove unnecessary "ret" variables Remove return variables (declared as "ret") in cases where, depending on whether a condition evaluates as true, the result of a function call can be immediately returned instead of storing the result in the return variable. When the condition evaluates as false, the constant initially stored in the return variable at declaration is returned instead. Signed-off-by: Pierce Griffiths Signed-off-by: Paul E. McKenney --- kernel/torture.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 9410d1bf84d6..bbf6d473e50c 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -245,16 +245,15 @@ stop: */ int torture_onoff_init(long ooholdoff, long oointerval) { - int ret = 0; - #ifdef CONFIG_HOTPLUG_CPU onoff_holdoff = ooholdoff; onoff_interval = oointerval; if (onoff_interval <= 0) return 0; - ret = torture_create_kthread(torture_onoff, NULL, onoff_task); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - return ret; + return torture_create_kthread(torture_onoff, NULL, onoff_task); +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + return 0; +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ } EXPORT_SYMBOL_GPL(torture_onoff_init); @@ -525,15 +524,13 @@ static int torture_shutdown(void *arg) */ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) { - int ret = 0; - torture_shutdown_hook = cleanup; if (ssecs > 0) { shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0)); - ret = torture_create_kthread(torture_shutdown, NULL, + return torture_create_kthread(torture_shutdown, NULL, shutdown_task); } - return ret; + return 0; } EXPORT_SYMBOL_GPL(torture_shutdown_init); @@ -632,13 +629,10 @@ static int torture_stutter(void *arg) /* * Initialize and kick off the torture_stutter kthread. */ -int torture_stutter_init(int s) +int torture_stutter_init(const int s) { - int ret; - stutter = s; - ret = torture_create_kthread(torture_stutter, NULL, stutter_task); - return ret; + return torture_create_kthread(torture_stutter, NULL, stutter_task); } EXPORT_SYMBOL_GPL(torture_stutter_init); -- cgit From 61670adcb4a9f66ff3fa8a9e846a623d9a9e1553 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Oct 2018 13:27:41 -0700 Subject: rcutorture: Prepare for asynchronous access to rcu_fwd_startat Because rcutorture's forward-progress checking will trigger from an OOM notifier, this notifier will introduce asynchronous concurrent access to the rcu_fwd_startat variable. This commit therefore prepares for this by converting updates to WRITE_ONCE(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 36a3bc42782d..c4fd61dccedb 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1679,7 +1679,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) sd = cur_ops->stall_dur() + 1; sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; dur = sd4 + torture_random(&trs) % (sd - sd4); - rcu_fwd_startat = jiffies; + WRITE_ONCE(rcu_fwd_startat, jiffies); stopat = rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && !torture_must_stop()) { idx = cur_ops->readlock(); @@ -1728,7 +1728,7 @@ static void rcu_torture_fwd_prog_cr(void) /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); cur_ops->sync(); /* Later readers see above write. */ - rcu_fwd_startat = jiffies; + WRITE_ONCE(rcu_fwd_startat, jiffies); stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; n_launders = 0; n_launders_cb = 0; -- cgit From e0aff97355575ac6a28a48a4217533a3953095c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Oct 2018 17:40:54 -0700 Subject: rcutorture: Dump grace-period diagnostics upon forward-progress OOM This commit adds an OOM notifier during rcutorture forward-progress testing. If this notifier is invoked, it dumps out some grace-period state to help debug the forward-progress problem. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 31 ++++++++++++++++++++++++++++--- kernel/rcu/tree.c | 20 ++++++++++++++++++++ 3 files changed, 50 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0f0f5ae8c3d4..a393e24a9195 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -526,12 +526,14 @@ srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } +static inline void rcu_fwd_progress_check(unsigned long j) { } #else /* #ifdef CONFIG_TINY_RCU */ unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); +void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; extern struct workqueue_struct *rcu_par_gp_wq; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c4fd61dccedb..f28b88ecb47a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "rcu.h" @@ -1624,6 +1625,7 @@ static struct rcu_fwd_cb *rcu_fwd_cb_head; static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head; static long n_launders_cb; static unsigned long rcu_fwd_startat; +static bool rcu_fwd_emergency_stop; #define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */ #define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ @@ -1681,7 +1683,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) dur = sd4 + torture_random(&trs) % (sd - sd4); WRITE_ONCE(rcu_fwd_startat, jiffies); stopat = rcu_fwd_startat + dur; - while (time_before(jiffies, stopat) && !torture_must_stop()) { + while (time_before(jiffies, stopat) && + !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { idx = cur_ops->readlock(); udelay(10); cur_ops->readunlock(idx); @@ -1689,7 +1692,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) cond_resched(); } (*tested_tries)++; - if (!time_before(jiffies, stopat) && !torture_must_stop()) { + if (!time_before(jiffies, stopat) && + !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { (*tested)++; cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); @@ -1739,7 +1743,8 @@ static void rcu_torture_fwd_prog_cr(void) n_launders_hist[i] = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); - while (time_before(jiffies, stopat) && !torture_must_stop()) { + while (time_before(jiffies, stopat) && + !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { rfcp = READ_ONCE(rcu_fwd_cb_head); rfcpn = NULL; if (rfcp) @@ -1796,6 +1801,23 @@ static void rcu_torture_fwd_prog_cr(void) } } + +/* + * OOM notifier, but this only prints diagnostic information for the + * current forward-progress test. + */ +static int rcutorture_oom_notify(struct notifier_block *self, + unsigned long notused, void *nfreed) +{ + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); + WRITE_ONCE(rcu_fwd_emergency_stop, true); + return NOTIFY_OK; +} + +static struct notifier_block rcutorture_oom_nb = { + .notifier_call = rcutorture_oom_notify +}; + /* Carry out grace-period forward-progress testing. */ static int rcu_torture_fwd_prog(void *args) { @@ -1808,8 +1830,11 @@ static int rcu_torture_fwd_prog(void *args) set_user_nice(current, MAX_NICE); do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); + WRITE_ONCE(rcu_fwd_emergency_stop, false); + register_oom_notifier(&rcutorture_oom_nb); rcu_torture_fwd_prog_nr(&tested, &tested_tries); rcu_torture_fwd_prog_cr(); + unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ stutter_wait("rcu_torture_fwd_prog"); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6ec3abbe90e2..853b79a6ff10 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2657,6 +2657,26 @@ rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +/* + * Do a forward-progress check for rcutorture. This is normally invoked + * due to an OOM event. The argument "j" gives the time period during + * which rcutorture would like progress to have been made. + */ +void rcu_fwd_progress_check(unsigned long j) +{ + struct rcu_data *rdp; + + if (rcu_gp_in_progress()) { + show_rcu_gp_kthreads(); + } else { + preempt_disable(); + rdp = this_cpu_ptr(&rcu_data); + rcu_check_gp_start_stall(rdp->mynode, rdp, j); + preempt_enable(); + } +} +EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); + /* * This does the RCU core processing work for the specified rcu_data * structures. This may be called only from the CPU to whom the rdp -- cgit From 903ee83d91776bc72d856147743687d4b6c99286 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Oct 2018 16:05:46 -0700 Subject: rcu: Account for nocb-CPU callback counts in RCU CPU stall warnings The RCU CPU stall warnings print an estimate of the total number of RCU callbacks queued in the system, but this estimate leaves out the callbacks queued for nocbs CPUs. This commit therefore introduces rcu_get_n_cbs_cpu(), which gives an accurate callback estimate for both nocbs and normal CPUs, and uses this new function as needed. This commit also introduces a rcu_get_n_cbs_nocb_cpu() helper function that returns the number of callbacks for nocbs CPUs or zero otherwise, and also uses this function in place of direct access to ->nocb_q_count while in the area (fewer characters, you see). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 19 +++++++++++++++---- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 24 +++++++++++++++++++----- 3 files changed, 35 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 853b79a6ff10..0933d8650890 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -207,6 +207,19 @@ static int rcu_gp_in_progress(void) return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq)); } +/* + * Return the number of callbacks queued on the specified CPU. + * Handles both the nocbs and normal cases. + */ +static long rcu_get_n_cbs_cpu(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */ + return rcu_segcblist_n_cbs(&rdp->cblist); + return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */ +} + void rcu_softirq_qs(void) { rcu_qs(); @@ -1265,8 +1278,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data, - cpu)->cblist); + totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rcu_state.gp_start), (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); @@ -1326,8 +1338,7 @@ static void print_cpu_stall(void) raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data, - cpu)->cblist); + totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", jiffies - rcu_state.gp_start, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c3e2807a834a..a8f82b7dc5e2 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -455,6 +455,7 @@ static void __init rcu_spawn_nocb_kthreads(void); static void __init rcu_organize_nocb_kthreads(void); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static bool init_nocb_callback_list(struct rcu_data *rdp); +static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp); static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 25fe26c4e9c1..1b3dd2fc0cd6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2011,7 +2011,7 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu) * (if a callback is in fact needed). This is associated with an * atomic_inc() in the caller. */ - ret = atomic_long_read(&rdp->nocb_q_count); + ret = rcu_get_n_cbs_nocb_cpu(rdp); #ifdef CONFIG_PROVE_RCU rhp = READ_ONCE(rdp->nocb_head); @@ -2066,7 +2066,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, TPS("WakeNotPoll")); return; } - len = atomic_long_read(&rdp->nocb_q_count); + len = rcu_get_n_cbs_nocb_cpu(rdp); if (old_rhpp == &rdp->nocb_head) { if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ @@ -2115,11 +2115,11 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, trace_rcu_kfree_callback(rcu_state.name, rhp, (unsigned long)rhp->func, -atomic_long_read(&rdp->nocb_q_count_lazy), - -atomic_long_read(&rdp->nocb_q_count)); + -rcu_get_n_cbs_nocb_cpu(rdp)); else trace_rcu_callback(rcu_state.name, rhp, -atomic_long_read(&rdp->nocb_q_count_lazy), - -atomic_long_read(&rdp->nocb_q_count)); + -rcu_get_n_cbs_nocb_cpu(rdp)); /* * If called from an extended quiescent state with interrupts @@ -2343,7 +2343,7 @@ static int rcu_nocb_kthread(void *arg) /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rcu_state.name, atomic_long_read(&rdp->nocb_q_count_lazy), - atomic_long_read(&rdp->nocb_q_count), -1); + rcu_get_n_cbs_nocb_cpu(rdp), -1); c = cl = 0; while (list) { next = list->next; @@ -2614,6 +2614,15 @@ void rcu_bind_current_to_nocb(void) } EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); +/* + * Return the number of RCU callbacks still queued from the specified + * CPU, which must be a nocbs CPU. + */ +static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) +{ + return atomic_long_read(&rdp->nocb_q_count); +} + #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static bool rcu_nocb_cpu_needs_barrier(int cpu) @@ -2674,6 +2683,11 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) return false; } +static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) +{ + return 0; +} + #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* -- cgit From bfcfcffc5f23e76a1b88f7412ee7efaec5107b28 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Oct 2018 17:09:56 -0700 Subject: rcu: Print per-CPU callback counts for forward-progress failures This commit prints out the non-zero per-CPU callback counts when a forware-progress error (OOM event) occurs. Signed-off-by: Paul E. McKenney [ paulmck: Fix a pair of uninitialized locals spotted by kbuild test robot. ] --- kernel/rcu/tree.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0933d8650890..90a67c75a447 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2675,6 +2675,10 @@ rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, */ void rcu_fwd_progress_check(unsigned long j) { + unsigned long cbs; + int cpu; + unsigned long max_cbs = 0; + int max_cpu = -1; struct rcu_data *rdp; if (rcu_gp_in_progress()) { @@ -2685,6 +2689,20 @@ void rcu_fwd_progress_check(unsigned long j) rcu_check_gp_start_stall(rdp->mynode, rdp, j); preempt_enable(); } + for_each_possible_cpu(cpu) { + cbs = rcu_get_n_cbs_cpu(cpu); + if (!cbs) + continue; + if (max_cpu < 0) + pr_info("%s: callbacks", __func__); + pr_cont(" %d: %lu", cpu, cbs); + if (cbs <= max_cbs) + continue; + max_cbs = cbs; + max_cpu = cpu; + } + if (max_cpu >= 0) + pr_cont("\n"); } EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); -- cgit From 8dd3b54689d9a2103c0817f2b7adc51760a45551 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Oct 2018 11:02:05 -0700 Subject: rcutorture: Print GP age upon forward-progress failure Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 90a67c75a447..f91631541965 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2682,6 +2682,8 @@ void rcu_fwd_progress_check(unsigned long j) struct rcu_data *rdp; if (rcu_gp_in_progress()) { + pr_info("%s: GP age %lu jiffies\n", + __func__, jiffies - rcu_state.gp_start); show_rcu_gp_kthreads(); } else { preempt_disable(); -- cgit From 1a682754c7ed9df213069d5a0d3981f8360a32c2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Oct 2018 12:33:41 -0700 Subject: rcutorture: Print histogram of CB invocation at OOM time One reason why a forward-progress test might fail would be if something prevented or delayed callback invocation. This commit therefore adds a callback-invocation histogram printout when OOM is reported to rcutorture. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f28b88ecb47a..329f4fb13125 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1631,6 +1631,20 @@ static bool rcu_fwd_emergency_stop; #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / HZ]; +static void rcu_torture_fwd_cb_hist(void) +{ + int i; + int j; + + for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) + if (n_launders_hist[i] > 0) + break; + pr_alert("%s: Callback-invocation histogram:", __func__); + for (j = 0; j <= i; j++) + pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); + pr_cont("\n"); +} + /* Callback function for continuous-flood RCU callbacks. */ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) { @@ -1718,7 +1732,6 @@ static void rcu_torture_fwd_prog_cr(void) unsigned long cver; unsigned long gps; int i; - int j; long n_launders; long n_launders_cb_snap; long n_launders_sa; @@ -1791,13 +1804,7 @@ static void rcu_torture_fwd_prog_cr(void) n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); - for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) - if (n_launders_hist[i] > 0) - break; - pr_alert("Callback-invocation histogram:"); - for (j = 0; j <= i; j++) - pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); - pr_cont("\n"); + rcu_torture_fwd_cb_hist(); } } @@ -1809,6 +1816,7 @@ static void rcu_torture_fwd_prog_cr(void) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + rcu_torture_fwd_cb_hist(); rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); WRITE_ONCE(rcu_fwd_emergency_stop, true); return NOTIFY_OK; -- cgit From c51d7b5e6c94aa6b554c27bd2b0eb64ebef02334 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Oct 2018 17:25:33 -0700 Subject: rcutorture: Print time since GP end upon forward-progress failure If rcutorture's forward-progress tests fail while a grace period is not in progress, it is useful to print the time since the last grace period ended as a way to detect failure to launch a new grace period. This commit therefore makes this change. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- kernel/rcu/tree.h | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f91631541965..9180158756d2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2000,7 +2000,8 @@ static void rcu_gp_cleanup(void) WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); - gp_duration = jiffies - rcu_state.gp_start; + rcu_state.gp_end = jiffies; + gp_duration = rcu_state.gp_end - rcu_state.gp_start; if (gp_duration > rcu_state.gp_max) rcu_state.gp_max = gp_duration; @@ -2686,6 +2687,8 @@ void rcu_fwd_progress_check(unsigned long j) __func__, jiffies - rcu_state.gp_start); show_rcu_gp_kthreads(); } else { + pr_info("%s: Last GP end %lu jiffies ago\n", + __func__, jiffies - rcu_state.gp_end); preempt_disable(); rdp = this_cpu_ptr(&rcu_data); rcu_check_gp_start_stall(rdp->mynode, rdp, j); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a8f82b7dc5e2..d90b02b53c0e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -328,6 +328,8 @@ struct rcu_state { /* force_quiescent_state(). */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ + unsigned long gp_end; /* Time last GP ended, again */ + /* in jiffies. */ unsigned long gp_activity; /* Time of last GP kthread */ /* activity in jiffies. */ unsigned long gp_req_activity; /* Time of last GP request */ -- cgit From 73d665b1410afae405309ad4475a98924776ab13 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Oct 2018 10:54:22 -0700 Subject: rcutorture: Print forward-progress test age upon failure This commit prints the age of the forward-progress test in jiffies, in order to allow better interpretation of the callback-invocation histograms. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 329f4fb13125..080b5ac6340c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1639,7 +1639,8 @@ static void rcu_torture_fwd_cb_hist(void) for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) if (n_launders_hist[i] > 0) break; - pr_alert("%s: Callback-invocation histogram:", __func__); + pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", + __func__, jiffies - rcu_fwd_startat); for (j = 0; j <= i; j++) pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); pr_cont("\n"); -- cgit From 2667ccce9328e4e25ed77a83291c066d5e11e65a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Oct 2018 09:09:49 -0700 Subject: rcutorture: Recover from OOM during forward-progress tests This commit causes the OOM handler to do rcu_barrier() calls and to free up forward-progress callbacks in order to recover from OOM events. The current test is terminated, but subsequent forward-progress tests can proceed. This allows a long test to result in multiple forward-progress failures, greatly reducing the required testing time. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 60 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 080b5ac6340c..afa98162575d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1649,13 +1649,14 @@ static void rcu_torture_fwd_cb_hist(void) /* Callback function for continuous-flood RCU callbacks. */ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) { + unsigned long flags; int i; struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh); struct rcu_fwd_cb **rfcpp; rfcp->rfc_next = NULL; rfcp->rfc_gps++; - spin_lock(&rcu_fwd_lock); + spin_lock_irqsave(&rcu_fwd_lock, flags); rfcpp = rcu_fwd_cb_tail; rcu_fwd_cb_tail = &rfcp->rfc_next; WRITE_ONCE(*rfcpp, rfcp); @@ -1664,7 +1665,33 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) if (i >= ARRAY_SIZE(n_launders_hist)) i = ARRAY_SIZE(n_launders_hist) - 1; n_launders_hist[i]++; - spin_unlock(&rcu_fwd_lock); + spin_unlock_irqrestore(&rcu_fwd_lock, flags); +} + +/* + * Free all callbacks on the rcu_fwd_cb_head list, either because the + * test is over or because we hit an OOM event. + */ +static unsigned long rcu_torture_fwd_prog_cbfree(void) +{ + unsigned long flags; + unsigned long freed = 0; + struct rcu_fwd_cb *rfcp; + + for (;;) { + spin_lock_irqsave(&rcu_fwd_lock, flags); + rfcp = rcu_fwd_cb_head; + if (!rfcp) + break; + rcu_fwd_cb_head = rfcp->rfc_next; + if (!rcu_fwd_cb_head) + rcu_fwd_cb_tail = &rcu_fwd_cb_head; + spin_unlock_irqrestore(&rcu_fwd_lock, flags); + kfree(rfcp); + freed++; + } + spin_unlock_irqrestore(&rcu_fwd_lock, flags); + return freed; } /* Carry out need_resched()/cond_resched() forward-progress testing. */ @@ -1743,6 +1770,9 @@ static void rcu_torture_fwd_prog_cr(void) unsigned long stopat; unsigned long stoppedat; + if (READ_ONCE(rcu_fwd_emergency_stop)) + return; /* Get out of the way quickly, no GP wait! */ + /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); cur_ops->sync(); /* Later readers see above write. */ @@ -1788,16 +1818,10 @@ static void rcu_torture_fwd_prog_cr(void) cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ - for (;;) { - rfcp = rcu_fwd_cb_head; - if (!rfcp) - break; - rcu_fwd_cb_head = rfcp->rfc_next; - kfree(rfcp); - } - rcu_fwd_cb_tail = &rcu_fwd_cb_head; + (void)rcu_torture_fwd_prog_cbfree(); + WRITE_ONCE(rcu_fwd_cb_nodelay, false); - if (!torture_must_stop()) { + if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", __func__, @@ -1817,9 +1841,23 @@ static void rcu_torture_fwd_prog_cr(void) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + WARN(1, "%s invoked upon OOM during forward-progress testing.\n", + __func__); rcu_torture_fwd_cb_hist(); rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); WRITE_ONCE(rcu_fwd_emergency_stop, true); + smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ + pr_info("%s: Freed %lu RCU callbacks.\n", + __func__, rcu_torture_fwd_prog_cbfree()); + rcu_barrier(); + pr_info("%s: Freed %lu RCU callbacks.\n", + __func__, rcu_torture_fwd_prog_cbfree()); + rcu_barrier(); + pr_info("%s: Freed %lu RCU callbacks.\n", + __func__, rcu_torture_fwd_prog_cbfree()); + smp_mb(); /* Frees before return to avoid redoing OOM. */ + (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ + pr_info("%s returning after OOM processing.\n", __func__); return NOTIFY_OK; } -- cgit From 2e57bf97a6856f2dc10fb4377c452cb08f844047 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Oct 2018 16:43:09 -0700 Subject: rcutorture: Use 100ms buckets for forward-progress callback histograms This commit narrows the scope of each bucket of the forward-progress callback-invocation histograms from one second to 100 milliseconds, which aids debugging of forward-progress problems by making shorter-duration callback-invocation stalls visible. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index afa98162575d..a4c4a24bdcaa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1629,7 +1629,8 @@ static bool rcu_fwd_emergency_stop; #define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */ #define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ -static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / HZ]; +#define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */ +static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)]; static void rcu_torture_fwd_cb_hist(void) { @@ -1642,7 +1643,8 @@ static void rcu_torture_fwd_cb_hist(void) pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", __func__, jiffies - rcu_fwd_startat); for (j = 0; j <= i; j++) - pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); + pr_cont(" %ds/%d: %ld", + j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j]); pr_cont("\n"); } @@ -1661,7 +1663,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) rcu_fwd_cb_tail = &rfcp->rfc_next; WRITE_ONCE(*rfcpp, rfcp); WRITE_ONCE(n_launders_cb, n_launders_cb + 1); - i = ((jiffies - rcu_fwd_startat) / HZ); + i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); if (i >= ARRAY_SIZE(n_launders_hist)) i = ARRAY_SIZE(n_launders_hist) - 1; n_launders_hist[i]++; -- cgit From 5ac7cdc29897e5fc3f5e214f3f8c8b03ef8d7029 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Oct 2018 05:46:58 -0700 Subject: rcutorture: Don't do busted forward-progress testing The "busted" rcutorture type is an intentionally broken implementation of RCU. Doing forward-progress testing on this implementation is not particularly meaningful on the one hand and can result in fatal abuse of the memory allocator on the other. This commit therefore disables forward-progress testing of the "busted" rcutorture type. Reported-by: kernel test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a4c4a24bdcaa..f6e85faa4ff4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1900,7 +1900,8 @@ static int __init rcu_torture_fwd_prog_init(void) { if (!fwd_progress) return 0; /* Not requested, so don't do it. */ - if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0) { + if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || + cur_ops == &rcu_busted_ops) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); return 0; } -- cgit From 5482e9a93c83839f94e75db712e6837e6a39962c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 1 Dec 2018 17:08:44 -0800 Subject: bpf: Fix memleak in aux->func_info and aux->btf The aux->func_info and aux->btf are leaked in the error out cases during bpf_prog_load(). This patch fixes it. Fixes: ba64e7d85252 ("bpf: btf: support proper non-jit func info") Cc: Yonghong Song Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f9554d9a14e1..4445d0d084d8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1560,6 +1560,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return err; free_used_maps: + kvfree(prog->aux->func_info); + btf_put(prog->aux->btf); bpf_prog_kallsyms_del_subprogs(prog); free_used_maps(prog->aux); free_prog: -- cgit From fca0c116504e1c5c72963bf28b1ede0932228fef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Dec 2018 10:52:21 +0100 Subject: perf: Fix typos in comments Fix two typos in kernel/events/*. No change in functionality intended. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- kernel/events/hw_breakpoint.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 84530ab358c3..a2a6e0b4a881 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5541,7 +5541,7 @@ out_put: static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, - .close = perf_mmap_close, /* non mergable */ + .close = perf_mmap_close, /* non mergeable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, }; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index d6b56180827c..5befb338a18d 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -238,7 +238,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) } /* - * Contraints to check before allowing this new breakpoint counter: + * Constraints to check before allowing this new breakpoint counter: * * == Non-pinned counter == (Considered as pinned for now) * -- cgit From dfcb245e28481256a10a9133441baf2a93d26642 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Dec 2018 10:05:56 +0100 Subject: sched: Fix various typos in comments Go over the scheduler source code and fix common typos in comments - and a typo in an actual variable name. No change in functionality intended. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- include/linux/sched/isolation.h | 4 ++-- include/linux/sched/mm.h | 2 +- include/linux/sched/stat.h | 2 +- kernel/sched/core.c | 2 +- kernel/sched/cputime.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 8 ++++---- kernel/sched/isolation.c | 14 +++++++------- kernel/sched/sched.h | 4 ++-- 10 files changed, 22 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 291a9bd5b97f..b8c7ba0e3796 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -176,7 +176,7 @@ struct task_group; * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * * However, with slightly different timing the wakeup TASK_RUNNING store can - * also collide with the TASK_UNINTERRUPTIBLE store. Loosing that store is not + * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not * a problem either because that will result in one extra go around the loop * and our @cond test will save the day. * @@ -515,7 +515,7 @@ struct sched_dl_entity { /* * Actual scheduling parameters. Initialized with the values above, - * they are continously updated during task execution. Note that + * they are continuously updated during task execution. Note that * the remaining runtime could be < 0 in case we are in overrun. */ s64 runtime; /* Remaining runtime for this instance */ diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 4a6582c27dea..b0fb1446fe04 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -16,7 +16,7 @@ enum hk_flags { }; #ifdef CONFIG_CPU_ISOLATION -DECLARE_STATIC_KEY_FALSE(housekeeping_overriden); +DECLARE_STATIC_KEY_FALSE(housekeeping_overridden); extern int housekeeping_any_cpu(enum hk_flags flags); extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags); extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags); @@ -43,7 +43,7 @@ static inline void housekeeping_init(void) { } static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) { #ifdef CONFIG_CPU_ISOLATION - if (static_branch_unlikely(&housekeeping_overriden)) + if (static_branch_unlikely(&housekeeping_overridden)) return housekeeping_test_cpu(cpu, flags); #endif return true; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index aebb370a0006..3bfa6a0cbba4 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -153,7 +153,7 @@ static inline gfp_t current_gfp_context(gfp_t flags) { /* * NOIO implies both NOIO and NOFS and it is a weaker context - * so always make sure it makes precendence + * so always make sure it makes precedence */ if (unlikely(current->flags & PF_MEMALLOC_NOIO)) flags &= ~(__GFP_IO | __GFP_FS); diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index f30954cc059d..568286411b43 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -8,7 +8,7 @@ * Various counters maintained by the scheduler and fork(), * exposed via /proc, sys.c or used by drivers via these APIs. * - * ( Note that all these values are aquired without locking, + * ( Note that all these values are acquired without locking, * so they can only be relied on in narrow circumstances. ) */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8050f266751a..e4ca15d75541 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2857,7 +2857,7 @@ unsigned long nr_running(void) * preemption, thus the result might have a time-of-check-to-time-of-use * race. The caller is responsible to use it correctly, for example: * - * - from a non-preemptable section (of course) + * - from a non-preemptible section (of course) * * - from a thread that is bound to a single CPU * diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 0796f938c4f0..ba4a143bdcf3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -525,7 +525,7 @@ void account_idle_ticks(unsigned long ticks) /* * Perform (stime * rtime) / total, but avoid multiplication overflow by - * loosing precision when the numbers are big. + * losing precision when the numbers are big. */ static u64 scale_stime(u64 stime, u64 rtime, u64 total) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 470ba6b464fe..b32bc1f7cd14 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -727,7 +727,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * refill the runtime and set the deadline a period in the future, * because keeping the current (absolute) deadline of the task would * result in breaking guarantees promised to other tasks (refer to - * Documentation/scheduler/sched-deadline.txt for more informations). + * Documentation/scheduler/sched-deadline.txt for more information). * * This function returns true if: * diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e30dea59d215..fdc8356ea742 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -703,9 +703,9 @@ void init_entity_runnable_average(struct sched_entity *se) memset(sa, 0, sizeof(*sa)); /* - * Tasks are intialized with full load to be seen as heavy tasks until + * Tasks are initialized with full load to be seen as heavy tasks until * they get a chance to stabilize to their real load level. - * Group entities are intialized with zero load to reflect the fact that + * Group entities are initialized with zero load to reflect the fact that * nothing has been attached to the task group yet. */ if (entity_is_task(se)) @@ -3976,8 +3976,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Substract its load from the cfs_rq->runnable_avg. - * - Substract its previous weight from cfs_rq->load.weight. + * - Subtract its load from the cfs_rq->runnable_avg. + * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index e6802181900f..81faddba9e20 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -8,14 +8,14 @@ */ #include "sched.h" -DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); -EXPORT_SYMBOL_GPL(housekeeping_overriden); +DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); +EXPORT_SYMBOL_GPL(housekeeping_overridden); static cpumask_var_t housekeeping_mask; static unsigned int housekeeping_flags; int housekeeping_any_cpu(enum hk_flags flags) { - if (static_branch_unlikely(&housekeeping_overriden)) + if (static_branch_unlikely(&housekeeping_overridden)) if (housekeeping_flags & flags) return cpumask_any_and(housekeeping_mask, cpu_online_mask); return smp_processor_id(); @@ -24,7 +24,7 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu); const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { - if (static_branch_unlikely(&housekeeping_overriden)) + if (static_branch_unlikely(&housekeeping_overridden)) if (housekeeping_flags & flags) return housekeeping_mask; return cpu_possible_mask; @@ -33,7 +33,7 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask); void housekeeping_affine(struct task_struct *t, enum hk_flags flags) { - if (static_branch_unlikely(&housekeeping_overriden)) + if (static_branch_unlikely(&housekeeping_overridden)) if (housekeeping_flags & flags) set_cpus_allowed_ptr(t, housekeeping_mask); } @@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(housekeeping_affine); bool housekeeping_test_cpu(int cpu, enum hk_flags flags) { - if (static_branch_unlikely(&housekeeping_overriden)) + if (static_branch_unlikely(&housekeeping_overridden)) if (housekeeping_flags & flags) return cpumask_test_cpu(cpu, housekeeping_mask); return true; @@ -53,7 +53,7 @@ void __init housekeeping_init(void) if (!housekeeping_flags) return; - static_branch_enable(&housekeeping_overriden); + static_branch_enable(&housekeeping_overridden); if (housekeeping_flags & HK_FLAG_TICK) sched_tick_offload_init(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 71cd8b710599..9bde60a11805 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -637,7 +637,7 @@ struct dl_rq { /* * Deadline values of the currently executing and the * earliest ready task on this rq. Caching these facilitates - * the decision wether or not a ready but not running task + * the decision whether or not a ready but not running task * should migrate somewhere else. */ struct { @@ -1434,7 +1434,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) #ifdef CONFIG_SMP /* * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of + * successfully executed on another CPU. We must ensure that updates of * per-task data have been completed by this moment. */ smp_wmb(); -- cgit From 1e7eacaf1db2f0f5f62fceda4e6c5a8869f00c13 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 1 Dec 2018 03:12:56 +0000 Subject: cpuset: Remove set but not used variable 'cs' Fixes gcc '-Wunused-but-set-variable' warning: kernel/cgroup/cpuset.c: In function 'cpuset_cancel_attach': kernel/cgroup/cpuset.c:2167:17: warning: variable 'cs' set but not used [-Wunused-but-set-variable] It never used since introduction in commit 1f7dd3e5a6e4 ("cgroup: fix handling of multi-destination migration from subtree_control enabling") Signed-off-by: YueHaibing Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1151e93d71b6..f0decd8165e7 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2109,10 +2109,8 @@ out_unlock: static void cpuset_cancel_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; - struct cpuset *cs; cgroup_taskset_first(tset, &css); - cs = css_cs(css); mutex_lock(&cpuset_mutex); css_cs(css)->attach_in_progress--; -- cgit From 9a547c7e575fc2501c12081558fda3027d0f2a5e Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 30 Nov 2018 16:13:16 -0500 Subject: audit: shorten PATH cap values when zero Since the vast majority of files (99.993% on a typical system) have no fcaps, display "0" instead of the full zero-padded 16 hex digits in the two PATH record cap_f* fields to save netlink bandwidth and disk space. Simply changing the format to %x won't work since the value is two (or possibly more in the future) 32-bit hexadecimal values concatenated and bits in higher order values will be misrepresented. Passes audit-testsuite and userspace tools already work fine. Please see the github issue tracker for more details https://github.com/linux-audit/audit-kernel/issues/101 Signed-off-by: Richard Guy Briggs Acked-by: Steve Grubb Signed-off-by: Paul Moore --- kernel/audit.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 779671883349..a0a4544e69ca 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2059,11 +2059,13 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) { int i; - audit_log_format(ab, " %s=", prefix); - CAP_FOR_EACH_U32(i) { - audit_log_format(ab, "%08x", - cap->cap[CAP_LAST_U32 - i]); + if (cap_isclear(*cap)) { + audit_log_format(ab, " %s=0", prefix); + return; } + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) + audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]); } static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) -- cgit From ce10a5b3954f2514af726beb78ed8d7350c5e41c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 28 Nov 2018 15:43:09 -0800 Subject: timekeeping: Use proper seqcount initializer tk_core.seq is initialized open coded, but that misses to initialize the lockdep map when lockdep is enabled. Lockdep splats involving tk_core seq consequently lack a name and are hard to read. Use the proper initializer which takes care of the lockdep map initialization. [ tglx: Massaged changelog ] Signed-off-by: Bart Van Assche Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: tj@kernel.org Cc: johannes.berg@intel.com Link: https://lkml.kernel.org/r/20181128234325.110011-12-bvanassche@acm.org --- kernel/time/timekeeping.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cd02bd38cf2d..c801e25875a3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -45,7 +45,9 @@ enum timekeeping_adv_mode { static struct { seqcount_t seq; struct timekeeper timekeeper; -} tk_core ____cacheline_aligned; +} tk_core ____cacheline_aligned = { + .seq = SEQCNT_ZERO(tk_core.seq), +}; static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; -- cgit From a1da439cc0d929891f0f7372c9e03530c809068c Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 5 Dec 2018 11:14:01 +0100 Subject: dma-mapping: fix lack of DMA address assignment in generic remap allocator Commit bfd56cd60521 ("dma-mapping: support highmem in the generic remap allocator") replaced dma_direct_alloc_pages() with __dma_direct_alloc_pages(), which doesn't set dma_handle and zero allocated memory. Fix it by doing this directly in the caller function. Fixes: bfd56cd60521 ("dma-mapping: support highmem in the generic remap allocator") Signed-off-by: Marek Szyprowski Tested-by: Thierry Reding Signed-off-by: Christoph Hellwig --- kernel/dma/remap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 68a64e3ff6a1..8a44317cfc1a 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -223,8 +223,14 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, ret = dma_common_contiguous_remap(page, size, VM_USERMAP, arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); - if (!ret) + if (!ret) { __dma_direct_free_pages(dev, size, page); + return ret; + } + + *dma_handle = phys_to_dma(dev, page_to_phys(page)); + memset(ret, 0, size); + return ret; } -- cgit From dc002bb62f10c5905420f8b8a7d5ec0da567fc82 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 23 Nov 2018 23:18:03 +0100 Subject: bpf: add __weak hook for allocating executable memory By default, BPF uses module_alloc() to allocate executable memory, but this is not necessary on all arches and potentially undesirable on some of them. So break out the module_alloc() and module_memfree() calls into __weak functions to allow them to be overridden in arch code. Signed-off-by: Ard Biesheuvel Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f93ed667546f..86817ab204e8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -623,6 +623,16 @@ static void bpf_jit_uncharge_modmem(u32 pages) atomic_long_sub(pages, &bpf_jit_current); } +void *__weak bpf_jit_alloc_exec(unsigned long size) +{ + return module_alloc(size); +} + +void __weak bpf_jit_free_exec(void *addr) +{ + module_memfree(addr); +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -640,7 +650,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, if (bpf_jit_charge_modmem(pages)) return NULL; - hdr = module_alloc(size); + hdr = bpf_jit_alloc_exec(size); if (!hdr) { bpf_jit_uncharge_modmem(pages); return NULL; @@ -664,7 +674,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) { u32 pages = hdr->pages; - module_memfree(hdr); + bpf_jit_free_exec(hdr); bpf_jit_uncharge_modmem(pages); } -- cgit From 7337224fc150b3b762190425399ac0e8dee380d1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 5 Dec 2018 17:35:43 -0800 Subject: bpf: Improve the info.func_info and info.func_info_rec_size behavior 1) When bpf_dump_raw_ok() == false and the kernel can provide >=1 func_info to the userspace, the current behavior is setting the info.func_info_cnt to 0 instead of setting info.func_info to 0. It is different from the behavior in jited_func_lens/nr_jited_func_lens, jited_ksyms/nr_jited_ksyms...etc. This patch fixes it. (i.e. set func_info to 0 instead of func_info_cnt to 0 when bpf_dump_raw_ok() == false). 2) When the userspace passed in info.func_info_cnt == 0, the kernel will set the expected func_info size back to the info.func_info_rec_size. It is a way for the userspace to learn the kernel expected func_info_rec_size introduced in commit 838e96904ff3 ("bpf: Introduce bpf_func_info"). An exception is the kernel expected size is not set when func_info is not available for a bpf_prog. This makes the returned info.func_info_rec_size has different values depending on the returned value of info.func_info_cnt. This patch sets the kernel expected size to info.func_info_rec_size independent of the info.func_info_cnt. 3) The current logic only rejects invalid func_info_rec_size if func_info_cnt is non zero. This patch also rejects invalid nonzero info.func_info_rec_size and not equal to the kernel expected size. 4) Set info.btf_id as long as prog->aux->btf != NULL. That will setup the later copy_to_user() codes look the same as others which then easier to understand and maintain. prog->aux->btf is not NULL only if prog->aux->func_info_cnt > 0. Breaking up info.btf_id from prog->aux->func_info_cnt is needed for the later line info patch anyway. A similar change is made to bpf_get_prog_name(). Fixes: 838e96904ff3 ("bpf: Introduce bpf_func_info") Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 46 ++++++++++++++++++++-------------------------- 2 files changed, 21 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86817ab204e8..628b3970a49b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -410,7 +410,7 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); /* prog->aux->name will be ignored if full btf name is available */ - if (prog->aux->btf) { + if (prog->aux->func_info_cnt) { type = btf_type_by_id(prog->aux->btf, prog->aux->func_info[prog->aux->func_idx].type_id); func_name = btf_name_by_offset(prog->aux->btf, type->name_off); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4445d0d084d8..aa05aa38f4a8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2083,6 +2083,12 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + if ((info.func_info_cnt || info.func_info_rec_size) && + info.func_info_rec_size != sizeof(struct bpf_func_info)) + return -EINVAL; + + info.func_info_rec_size = sizeof(struct bpf_func_info); + if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; @@ -2226,35 +2232,23 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } } - if (prog->aux->btf) { - u32 krec_size = sizeof(struct bpf_func_info); - u32 ucnt, urec_size; - + if (prog->aux->btf) info.btf_id = btf_id(prog->aux->btf); - ucnt = info.func_info_cnt; - info.func_info_cnt = prog->aux->func_info_cnt; - urec_size = info.func_info_rec_size; - info.func_info_rec_size = krec_size; - if (ucnt) { - /* expect passed-in urec_size is what the kernel expects */ - if (urec_size != info.func_info_rec_size) - return -EINVAL; - - if (bpf_dump_raw_ok()) { - char __user *user_finfo; - - user_finfo = u64_to_user_ptr(info.func_info); - ucnt = min_t(u32, info.func_info_cnt, ucnt); - if (copy_to_user(user_finfo, prog->aux->func_info, - krec_size * ucnt)) - return -EFAULT; - } else { - info.func_info_cnt = 0; - } + ulen = info.func_info_cnt; + info.func_info_cnt = prog->aux->func_info_cnt; + if (info.func_info_cnt && ulen) { + if (bpf_dump_raw_ok()) { + char __user *user_finfo; + + user_finfo = u64_to_user_ptr(info.func_info); + ulen = min_t(u32, info.func_info_cnt, ulen); + if (copy_to_user(user_finfo, prog->aux->func_info, + info.func_info_rec_size * ulen)) + return -EFAULT; + } else { + info.func_info = 0; } - } else { - info.func_info_cnt = 0; } done: -- cgit From d30d42e08c76cb9323ec6121190eb026b07f773b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 5 Dec 2018 17:35:44 -0800 Subject: bpf: Change insn_offset to insn_off in bpf_func_info The later patch will introduce "struct bpf_line_info" which has member "line_off" and "file_off" referring back to the string section in btf. The line_"off" and file_"off" are more consistent to the naming convention in btf.h that means "offset" (e.g. name_off in "struct btf_type"). The to-be-added "struct bpf_line_info" also has another member, "insn_off" which is the same as the "insn_offset" in "struct bpf_func_info". Hence, this patch renames "insn_offset" to "insn_off" for "struct bpf_func_info". Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- kernel/bpf/verifier.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8e1eeee2c5f..a84fd232d934 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2991,7 +2991,7 @@ struct bpf_flow_keys { }; struct bpf_func_info { - __u32 insn_offset; + __u32 insn_off; __u32 type_id; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 71988337ac14..7658c61c1a88 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4707,24 +4707,24 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, goto free_btf; } - /* check insn_offset */ + /* check insn_off */ if (i == 0) { - if (krecord[i].insn_offset) { + if (krecord[i].insn_off) { verbose(env, - "nonzero insn_offset %u for the first func info record", - krecord[i].insn_offset); + "nonzero insn_off %u for the first func info record", + krecord[i].insn_off); ret = -EINVAL; goto free_btf; } - } else if (krecord[i].insn_offset <= prev_offset) { + } else if (krecord[i].insn_off <= prev_offset) { verbose(env, "same or smaller insn offset (%u) than previous func info record (%u)", - krecord[i].insn_offset, prev_offset); + krecord[i].insn_off, prev_offset); ret = -EINVAL; goto free_btf; } - if (env->subprog_info[i].start != krecord[i].insn_offset) { + if (env->subprog_info[i].start != krecord[i].insn_off) { verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); ret = -EINVAL; goto free_btf; @@ -4739,7 +4739,7 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, goto free_btf; } - prev_offset = krecord[i].insn_offset; + prev_offset = krecord[i].insn_off; urecord += urec_size; } @@ -4762,7 +4762,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env) return; for (i = 0; i < env->subprog_cnt; i++) - env->prog->aux->func_info[i].insn_offset = env->subprog_info[i].start; + env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start; } /* check %cur's range satisfies %old's */ -- cgit From 92a98a2b9f64a8b3c200a7709ceae04d09c39451 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Thu, 15 Nov 2018 14:52:41 +0900 Subject: kexec_file: make kexec_image_post_load_cleanup_default() global Change this function from static to global so that arm64 can implement its own arch_kimage_file_post_load_cleanup() later using kexec_image_post_load_cleanup_default(). Signed-off-by: AKASHI Takahiro Acked-by: Dave Young Cc: Vivek Goyal Cc: Baoquan He Signed-off-by: Will Deacon --- include/linux/kexec.h | 1 + kernel/kexec_file.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 9e4e638fb505..49ab758f4d91 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -143,6 +143,7 @@ extern const struct kexec_file_ops * const kexec_file_loaders[]; int kexec_image_probe_default(struct kimage *image, void *buf, unsigned long buf_len); +int kexec_image_post_load_cleanup_default(struct kimage *image); /** * struct kexec_buf - parameters for finding a place for a buffer in memory diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 35cf0ad29718..9ce6672f4fa3 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -76,7 +76,7 @@ void * __weak arch_kexec_kernel_image_load(struct kimage *image) return kexec_image_load_default(image); } -static int kexec_image_post_load_cleanup_default(struct kimage *image) +int kexec_image_post_load_cleanup_default(struct kimage *image) { if (!image->fops || !image->fops->cleanup) return 0; -- cgit From b6664ba42f1424d2768b605dd60cecc4428d9364 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Thu, 15 Nov 2018 14:52:42 +0900 Subject: s390, kexec_file: drop arch_kexec_mem_walk() Since s390 already knows where to locate buffers, calling arch_kexec_mem_walk() has no sense. So we can just drop it as kbuf->mem indicates this while all other architectures sets it to 0 initially. This change is a preparatory work for the next patch, where all the variant memory walks, either on system resource or memblock, will be put in one common place so that it will satisfy all the architectures' need. Signed-off-by: AKASHI Takahiro Reviewed-by: Philipp Rudo Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Dave Young Cc: Vivek Goyal Cc: Baoquan He Signed-off-by: Will Deacon --- arch/s390/kernel/machine_kexec_file.c | 10 ---------- include/linux/kexec.h | 8 ++++++++ kernel/kexec_file.c | 4 ++++ 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index f413f57f8d20..32023b4f9dc0 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -134,16 +134,6 @@ int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data, return ret; } -/* - * The kernel is loaded to a fixed location. Turn off kexec_locate_mem_hole - * and provide kbuf->mem by hand. - */ -int arch_kexec_walk_mem(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - return 1; -} - int arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, const Elf_Shdr *relsec, diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 49ab758f4d91..f378cb786f1b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -145,6 +145,14 @@ int kexec_image_probe_default(struct kimage *image, void *buf, unsigned long buf_len); int kexec_image_post_load_cleanup_default(struct kimage *image); +/* + * If kexec_buf.mem is set to this value, kexec_locate_mem_hole() + * will try to allocate free memory. Arch may overwrite it. + */ +#ifndef KEXEC_BUF_MEM_UNKNOWN +#define KEXEC_BUF_MEM_UNKNOWN 0 +#endif + /** * struct kexec_buf - parameters for finding a place for a buffer in memory * @image: kexec image in which memory to search. diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 9ce6672f4fa3..9e6529da12ed 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -532,6 +532,10 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) { int ret; + /* Arch knows where to place */ + if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) + return 0; + ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); return ret == 1 ? 0 : -EADDRNOTAVAIL; -- cgit From 735c2f90e333b3d0adee52a8e7e855a0c0eca284 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Thu, 15 Nov 2018 14:52:43 +0900 Subject: powerpc, kexec_file: factor out memblock-based arch_kexec_walk_mem() Memblock list is another source for usable system memory layout. So move powerpc's arch_kexec_walk_mem() to common code so that other memblock-based architectures, particularly arm64, can also utilise it. A moved function is now renamed to kexec_walk_memblock() and integrated into kexec_locate_mem_hole(), which will now be usable for all architectures with no need for overriding arch_kexec_walk_mem(). With this change, arch_kexec_walk_mem() need no longer be a weak function, and was now renamed to kexec_walk_resources(). Since powerpc doesn't support kdump in its kexec_file_load(), the current kexec_walk_memblock() won't work for kdump either in this form, this will be fixed in the next patch. Signed-off-by: AKASHI Takahiro Cc: "Eric W. Biederman" Acked-by: Dave Young Cc: Vivek Goyal Cc: Baoquan He Acked-by: James Morse Signed-off-by: Will Deacon --- arch/powerpc/kernel/machine_kexec_file_64.c | 54 ------------------------- include/linux/kexec.h | 2 - kernel/kexec_file.c | 61 +++++++++++++++++++++++++++-- 3 files changed, 57 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c index c77e95e9b384..0d20c7ad40fa 100644 --- a/arch/powerpc/kernel/machine_kexec_file_64.c +++ b/arch/powerpc/kernel/machine_kexec_file_64.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include @@ -46,59 +45,6 @@ int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, return kexec_image_probe_default(image, buf, buf_len); } -/** - * arch_kexec_walk_mem - call func(data) for each unreserved memory block - * @kbuf: Context info for the search. Also passed to @func. - * @func: Function to call for each memory block. - * - * This function is used by kexec_add_buffer and kexec_locate_mem_hole - * to find unreserved memory to load kexec segments into. - * - * Return: The memory walk will stop when func returns a non-zero value - * and that value will be returned. If all free regions are visited without - * func returning non-zero, then zero will be returned. - */ -int arch_kexec_walk_mem(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - int ret = 0; - u64 i; - phys_addr_t mstart, mend; - struct resource res = { }; - - if (kbuf->top_down) { - for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0, - &mstart, &mend, NULL) { - /* - * In memblock, end points to the first byte after the - * range while in kexec, end points to the last byte - * in the range. - */ - res.start = mstart; - res.end = mend - 1; - ret = func(&res, kbuf); - if (ret) - break; - } - } else { - for_each_free_mem_range(i, NUMA_NO_NODE, 0, &mstart, &mend, - NULL) { - /* - * In memblock, end points to the first byte after the - * range while in kexec, end points to the last byte - * in the range. - */ - res.start = mstart; - res.end = mend - 1; - ret = func(&res, kbuf); - if (ret) - break; - } - } - - return ret; -} - /** * setup_purgatory - initialize the purgatory's global variables * @image: kexec image. diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f378cb786f1b..d58d1f2fab10 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -192,8 +192,6 @@ int __weak arch_kexec_apply_relocations(struct purgatory_info *pi, const Elf_Shdr *relsec, const Elf_Shdr *symtab); -int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)); extern int kexec_add_buffer(struct kexec_buf *kbuf); int kexec_locate_mem_hole(struct kexec_buf *kbuf); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 9e6529da12ed..d03195a8cb6e 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -499,8 +500,57 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + int ret = 0; + u64 i; + phys_addr_t mstart, mend; + struct resource res = { }; + + if (kbuf->top_down) { + for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0, + &mstart, &mend, NULL) { + /* + * In memblock, end points to the first byte after the + * range while in kexec, end points to the last byte + * in the range. + */ + res.start = mstart; + res.end = mend - 1; + ret = func(&res, kbuf); + if (ret) + break; + } + } else { + for_each_free_mem_range(i, NUMA_NO_NODE, 0, &mstart, &mend, + NULL) { + /* + * In memblock, end points to the first byte after the + * range while in kexec, end points to the last byte + * in the range. + */ + res.start = mstart; + res.end = mend - 1; + ret = func(&res, kbuf); + if (ret) + break; + } + } + + return ret; +} +#endif + /** - * arch_kexec_walk_mem - call func(data) on free memory regions + * kexec_walk_resources - call func(data) on free memory regions * @kbuf: Context info for the search. Also passed to @func. * @func: Function to call for each memory region. * @@ -508,8 +558,8 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) * and that value will be returned. If all free regions are visited without * func returning non-zero, then zero will be returned. */ -int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) +static int kexec_walk_resources(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) { if (kbuf->image->type == KEXEC_TYPE_CRASH) return walk_iomem_res_desc(crashk_res.desc, @@ -536,7 +586,10 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; - ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); + if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); + else + ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); return ret == 1 ? 0 : -EADDRNOTAVAIL; } -- cgit From 497e1858647a0b859076dc96300527375977d76a Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Thu, 15 Nov 2018 14:52:44 +0900 Subject: kexec_file: kexec_walk_memblock() only walks a dedicated region at kdump In kdump case, there exists only one dedicated memblock region as usable memory (crashk_res). With this patch, kexec_walk_memblock() runs a given callback function on this region. Cosmetic change: 0 to MEMBLOCK_NONE at for_each_free_mem_range*() Signed-off-by: AKASHI Takahiro Acked-by: Dave Young Cc: Vivek Goyal Cc: Baoquan He Signed-off-by: Will Deacon --- kernel/kexec_file.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index d03195a8cb6e..f1d0e00a3971 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -515,8 +515,11 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, phys_addr_t mstart, mend; struct resource res = { }; + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return func(&crashk_res, kbuf); + if (kbuf->top_down) { - for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0, + for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE, &mstart, &mend, NULL) { /* * In memblock, end points to the first byte after the @@ -530,8 +533,8 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, break; } } else { - for_each_free_mem_range(i, NUMA_NO_NODE, 0, &mstart, &mend, - NULL) { + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, + &mstart, &mend, NULL) { /* * In memblock, end points to the first byte after the * range while in kexec, end points to the last byte -- cgit From b0cbeae4944924640bf550b75487729a20204c14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Nov 2018 18:52:35 +0100 Subject: dma-direct: remove the mapping_error dma_map_ops method The dma-direct code already returns (~(dma_addr_t)0x0) on mapping failures, so we can switch over to returning DMA_MAPPING_ERROR and let the core dma-mapping code handle the rest. Signed-off-by: Christoph Hellwig Acked-by: Linus Torvalds --- arch/powerpc/kernel/dma-swiotlb.c | 1 - include/linux/dma-direct.h | 3 --- kernel/dma/direct.c | 8 +------- kernel/dma/swiotlb.c | 11 +++++------ 4 files changed, 6 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 5fc335f4d9cd..3d8df2cf8be9 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -59,7 +59,6 @@ const struct dma_map_ops powerpc_swiotlb_dma_ops = { .sync_single_for_device = swiotlb_sync_single_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device, - .mapping_error = dma_direct_mapping_error, .get_required_mask = swiotlb_powerpc_get_required, }; diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 61b78f934f64..6e5a47ae7d64 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -5,8 +5,6 @@ #include #include -#define DIRECT_MAPPING_ERROR (~(dma_addr_t)0) - #ifdef CONFIG_ARCH_HAS_PHYS_TO_DMA #include #else @@ -76,5 +74,4 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); -int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index c49849bcced6..308f88a750c8 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -289,7 +289,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, dma_addr_t dma_addr = phys_to_dma(dev, phys); if (!check_addr(dev, dma_addr, size, __func__)) - return DIRECT_MAPPING_ERROR; + return DMA_MAPPING_ERROR; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_device(dev, dma_addr, size, dir); @@ -336,11 +336,6 @@ int dma_direct_supported(struct device *dev, u64 mask) return mask >= phys_to_dma(dev, min_mask); } -int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return dma_addr == DIRECT_MAPPING_ERROR; -} - const struct dma_map_ops dma_direct_ops = { .alloc = dma_direct_alloc, .free = dma_direct_free, @@ -359,7 +354,6 @@ const struct dma_map_ops dma_direct_ops = { #endif .get_required_mask = dma_direct_get_required_mask, .dma_supported = dma_direct_supported, - .mapping_error = dma_direct_mapping_error, .cache_sync = arch_dma_cache_sync, }; EXPORT_SYMBOL(dma_direct_ops); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 045930e32c0e..ff1ce81bb623 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -631,21 +631,21 @@ static dma_addr_t swiotlb_bounce_page(struct device *dev, phys_addr_t *phys, if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { dev_warn_ratelimited(dev, "Cannot do DMA to address %pa\n", phys); - return DIRECT_MAPPING_ERROR; + return DMA_MAPPING_ERROR; } /* Oh well, have to allocate and map a bounce buffer. */ *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), *phys, size, dir, attrs); if (*phys == SWIOTLB_MAP_ERROR) - return DIRECT_MAPPING_ERROR; + return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ dma_addr = __phys_to_dma(dev, *phys); if (unlikely(!dma_capable(dev, dma_addr, size))) { swiotlb_tbl_unmap_single(dev, *phys, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return DIRECT_MAPPING_ERROR; + return DMA_MAPPING_ERROR; } return dma_addr; @@ -680,7 +680,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0 && - dev_addr != DIRECT_MAPPING_ERROR) + dev_addr != DMA_MAPPING_ERROR) arch_sync_dma_for_device(dev, phys, size, dir); return dev_addr; @@ -789,7 +789,7 @@ swiotlb_map_sg_attrs(struct device *dev, struct scatterlist *sgl, int nelems, for_each_sg(sgl, sg, nelems, i) { sg->dma_address = swiotlb_map_page(dev, sg_page(sg), sg->offset, sg->length, dir, attrs); - if (sg->dma_address == DIRECT_MAPPING_ERROR) + if (sg->dma_address == DMA_MAPPING_ERROR) goto out_error; sg_dma_len(sg) = sg->length; } @@ -869,7 +869,6 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask) } const struct dma_map_ops swiotlb_dma_ops = { - .mapping_error = dma_direct_mapping_error, .alloc = dma_direct_alloc, .free = dma_direct_free, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, -- cgit From df44b479654f62b478c18ee4d8bc4e9f897a9844 Mon Sep 17 00:00:00 2001 From: Peter Rajnoha Date: Wed, 5 Dec 2018 12:27:44 +0100 Subject: kobject: return error code if writing /sys/.../uevent fails Propagate error code back to userspace if writing the /sys/.../uevent file fails. Before, the write operation always returned with success, even if we failed to recognize the input string or if we failed to generate the uevent itself. With the error codes properly propagated back to userspace, we are able to react in userspace accordingly by not assuming and awaiting a uevent that is not delivered. Signed-off-by: Peter Rajnoha Signed-off-by: Greg Kroah-Hartman --- drivers/base/bus.c | 12 ++++++++---- drivers/base/core.c | 8 +++++++- kernel/module.c | 6 ++++-- 3 files changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 8bfd27ec73d6..b886b15cb53b 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -611,8 +611,10 @@ static void remove_probe_files(struct bus_type *bus) static ssize_t uevent_store(struct device_driver *drv, const char *buf, size_t count) { - kobject_synth_uevent(&drv->p->kobj, buf, count); - return count; + int rc; + + rc = kobject_synth_uevent(&drv->p->kobj, buf, count); + return rc ? rc : count; } static DRIVER_ATTR_WO(uevent); @@ -828,8 +830,10 @@ static void klist_devices_put(struct klist_node *n) static ssize_t bus_uevent_store(struct bus_type *bus, const char *buf, size_t count) { - kobject_synth_uevent(&bus->p->subsys.kobj, buf, count); - return count; + int rc; + + rc = kobject_synth_uevent(&bus->p->subsys.kobj, buf, count); + return rc ? rc : count; } static BUS_ATTR(uevent, S_IWUSR, NULL, bus_uevent_store); diff --git a/drivers/base/core.c b/drivers/base/core.c index e2285059161d..a4ced331bc50 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1073,8 +1073,14 @@ out: static ssize_t uevent_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - if (kobject_synth_uevent(&dev->kobj, buf, count)) + int rc; + + rc = kobject_synth_uevent(&dev->kobj, buf, count); + + if (rc) { dev_err(dev, "uevent: failed to send synthetic uevent\n"); + return rc; + } return count; } diff --git a/kernel/module.c b/kernel/module.c index 49a405891587..0812a7f80fa7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1207,8 +1207,10 @@ static ssize_t store_uevent(struct module_attribute *mattr, struct module_kobject *mk, const char *buffer, size_t count) { - kobject_synth_uevent(&mk->kobj, buffer, count); - return count; + int rc; + + rc = kobject_synth_uevent(&mk->kobj, buffer, count); + return rc ? rc : count; } struct module_attribute module_uevent = -- cgit From ded653ccbec0335a78fa7a7aff3ec9870349fafb Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Wed, 19 Sep 2018 21:41:04 -0700 Subject: signal: Add set_user_sigmask() Refactor reading sigset from userspace and updating sigmask into an api. This is useful for versions of syscalls that pass in the sigmask and expect the current->sigmask to be changed during, and restored after, the execution of the syscall. With the advent of new y2038 syscalls in the subsequent patches, we add two more new versions of the syscalls (for pselect, ppoll, and io_pgetevents) in addition to the existing native and compat versions. Adding such an api reduces the logic that would need to be replicated otherwise. Note that the calls to sigprocmask() ignored the return value from the api as the function only returns an error on an invalid first argument that is hardcoded at these call sites. The updated logic uses set_current_blocked() instead. Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- fs/aio.c | 23 +++++++---------------- fs/eventpoll.c | 22 ++++++---------------- fs/select.c | 50 ++++++++++++-------------------------------------- include/linux/compat.h | 4 ++++ include/linux/signal.h | 2 ++ kernel/signal.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 76 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/fs/aio.c b/fs/aio.c index 301e6314183b..6ddb63ee8eb6 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2104,14 +2104,10 @@ SYSCALL_DEFINE6(io_pgetevents, if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; - if (ksig.sigmask) { - if (ksig.sigsetsize != sizeof(sigset_t)) - return -EINVAL; - if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask))) - return -EFAULT; - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } + + ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize); + if (ret) + return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); if (signal_pending(current)) { @@ -2174,14 +2170,9 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; - if (ksig.sigmask) { - if (ksig.sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (get_compat_sigset(&ksigmask, ksig.sigmask)) - return -EFAULT; - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } + ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize); + if (ret) + return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); if (signal_pending(current)) { diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 42bbe6824b4b..2d86eeba837b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2223,14 +2223,9 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ - if (sigmask) { - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) - return -EFAULT; - sigsaved = current->blocked; - set_current_blocked(&ksigmask); - } + error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (error) + return error; error = do_epoll_wait(epfd, events, maxevents, timeout); @@ -2266,14 +2261,9 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (get_compat_sigset(&ksigmask, sigmask)) - return -EFAULT; - sigsaved = current->blocked; - set_current_blocked(&ksigmask); - } + err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (err) + return err; err = do_epoll_wait(epfd, events, maxevents, timeout); diff --git a/fs/select.c b/fs/select.c index 22b3bf89f051..65c78b4147a2 100644 --- a/fs/select.c +++ b/fs/select.c @@ -717,16 +717,9 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, return -EINVAL; } - if (sigmask) { - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) - return -EFAULT; - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } + ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (ret) + return ret; ret = core_sys_select(n, inp, outp, exp, to); ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); @@ -1061,16 +1054,9 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, return -EINVAL; } - if (sigmask) { - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) - return -EFAULT; - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } + ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (ret) + return ret; ret = do_sys_poll(ufds, nfds, to); @@ -1323,15 +1309,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, return -EINVAL; } - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (get_compat_sigset(&ksigmask, sigmask)) - return -EFAULT; - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } + ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (ret) + return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); @@ -1389,15 +1369,9 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, return -EINVAL; } - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (get_compat_sigset(&ksigmask, sigmask)) - return -EFAULT; - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } + ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (ret) + return ret; ret = do_sys_poll(ufds, nfds, to); diff --git a/include/linux/compat.h b/include/linux/compat.h index 88720b443cd6..17c497b82690 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -169,6 +169,10 @@ typedef struct { compat_sigset_word sig[_COMPAT_NSIG_WORDS]; } compat_sigset_t; +int set_compat_user_sigmask(const compat_sigset_t __user *usigmask, + sigset_t *set, sigset_t *oldset, + size_t sigsetsize); + struct compat_sigaction { #ifndef __ARCH_HAS_IRIX_SIGACTION compat_uptr_t sa_handler; diff --git a/include/linux/signal.h b/include/linux/signal.h index f428e86f4800..ce14b951befb 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -273,6 +273,8 @@ extern int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern int sigprocmask(int, sigset_t *, sigset_t *); +extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set, + sigset_t *oldset, size_t sigsetsize); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; diff --git a/kernel/signal.c b/kernel/signal.c index 9a32bc2088c9..811b5d440617 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2735,6 +2735,51 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) } EXPORT_SYMBOL(sigprocmask); +/* + * The api helps set app-provided sigmasks. + * + * This is useful for syscalls such as ppoll, pselect, io_pgetevents and + * epoll_pwait where a new sigmask is passed from userland for the syscalls. + */ +int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set, + sigset_t *oldset, size_t sigsetsize) +{ + if (!usigmask) + return 0; + + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + if (copy_from_user(set, usigmask, sizeof(sigset_t))) + return -EFAULT; + + *oldset = current->blocked; + set_current_blocked(set); + + return 0; +} +EXPORT_SYMBOL(set_user_sigmask); + +#ifdef CONFIG_COMPAT +int set_compat_user_sigmask(const compat_sigset_t __user *usigmask, + sigset_t *set, sigset_t *oldset, + size_t sigsetsize) +{ + if (!usigmask) + return 0; + + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (get_compat_sigset(set, usigmask)) + return -EFAULT; + + *oldset = current->blocked; + set_current_blocked(set); + + return 0; +} +EXPORT_SYMBOL(set_compat_user_sigmask); +#endif + /** * sys_rt_sigprocmask - change the list of currently blocked signals * @how: whether to add, remove, or set signals -- cgit From 854a6ed56839a40f6b5d02a2962f48841482eec4 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Wed, 19 Sep 2018 21:41:05 -0700 Subject: signal: Add restore_user_sigmask() Refactor the logic to restore the sigmask before the syscall returns into an api. This is useful for versions of syscalls that pass in the sigmask and expect the current->sigmask to be changed during the execution and restored after the execution of the syscall. With the advent of new y2038 syscalls in the subsequent patches, we add two more new versions of the syscalls (for pselect, ppoll and io_pgetevents) in addition to the existing native and compat versions. Adding such an api reduces the logic that would need to be replicated otherwise. Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- fs/aio.c | 29 +++++------------------- fs/eventpoll.c | 30 ++----------------------- fs/select.c | 60 +++++++------------------------------------------- include/linux/signal.h | 2 ++ kernel/signal.c | 33 +++++++++++++++++++++++++++ 5 files changed, 51 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/fs/aio.c b/fs/aio.c index 6ddb63ee8eb6..39a1f2df6805 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2110,18 +2110,9 @@ SYSCALL_DEFINE6(io_pgetevents, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); - if (signal_pending(current)) { - if (ksig.sigmask) { - current->saved_sigmask = sigsaved; - set_restore_sigmask(); - } - - if (!ret) - ret = -ERESTARTNOHAND; - } else { - if (ksig.sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - } + restore_user_sigmask(ksig.sigmask, &sigsaved); + if (signal_pending(current) && !ret) + ret = -ERESTARTNOHAND; return ret; } @@ -2175,17 +2166,9 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); - if (signal_pending(current)) { - if (ksig.sigmask) { - current->saved_sigmask = sigsaved; - set_restore_sigmask(); - } - if (!ret) - ret = -ERESTARTNOHAND; - } else { - if (ksig.sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - } + restore_user_sigmask(ksig.sigmask, &sigsaved); + if (signal_pending(current) && !ret) + ret = -ERESTARTNOHAND; return ret; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2d86eeba837b..8a5a1010886b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2229,20 +2229,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, error = do_epoll_wait(epfd, events, maxevents, timeout); - /* - * If we changed the signal mask, we need to restore the original one. - * In case we've got a signal while waiting, we do not restore the - * signal mask yet, and we allow do_signal() to deliver the signal on - * the way back to userspace, before the signal mask is restored. - */ - if (sigmask) { - if (error == -EINTR) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } else - set_current_blocked(&sigsaved); - } + restore_user_sigmask(sigmask, &sigsaved); return error; } @@ -2267,20 +2254,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, err = do_epoll_wait(epfd, events, maxevents, timeout); - /* - * If we changed the signal mask, we need to restore the original one. - * In case we've got a signal while waiting, we do not restore the - * signal mask yet, and we allow do_signal() to deliver the signal on - * the way back to userspace, before the signal mask is restored. - */ - if (sigmask) { - if (err == -EINTR) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } else - set_current_blocked(&sigsaved); - } + restore_user_sigmask(sigmask, &sigsaved); return err; } diff --git a/fs/select.c b/fs/select.c index 65c78b4147a2..eb9132520197 100644 --- a/fs/select.c +++ b/fs/select.c @@ -724,19 +724,7 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, ret = core_sys_select(n, inp, outp, exp, to); ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); - if (ret == -ERESTARTNOHAND) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + restore_user_sigmask(sigmask, &sigsaved); return ret; } @@ -1060,21 +1048,11 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, ret = do_sys_poll(ufds, nfds, to); + restore_user_sigmask(sigmask, &sigsaved); + /* We can restart this syscall, usually */ - if (ret == -EINTR) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } + if (ret == -EINTR) ret = -ERESTARTNOHAND; - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); @@ -1316,19 +1294,7 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, ret = compat_core_sys_select(n, inp, outp, exp, to); ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); - if (ret == -ERESTARTNOHAND) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + restore_user_sigmask(sigmask, &sigsaved); return ret; } @@ -1375,21 +1341,11 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, ret = do_sys_poll(ufds, nfds, to); + restore_user_sigmask(sigmask, &sigsaved); + /* We can restart this syscall, usually */ - if (ret == -EINTR) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } + if (ret == -EINTR) ret = -ERESTARTNOHAND; - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); diff --git a/include/linux/signal.h b/include/linux/signal.h index ce14b951befb..cc7e2c1cd444 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -275,6 +275,8 @@ extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struc extern int sigprocmask(int, sigset_t *, sigset_t *); extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set, sigset_t *oldset, size_t sigsetsize); +extern void restore_user_sigmask(const void __user *usigmask, + sigset_t *sigsaved); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; diff --git a/kernel/signal.c b/kernel/signal.c index 811b5d440617..3c8ea7a328e0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2780,6 +2780,39 @@ int set_compat_user_sigmask(const compat_sigset_t __user *usigmask, EXPORT_SYMBOL(set_compat_user_sigmask); #endif +/* + * restore_user_sigmask: + * usigmask: sigmask passed in from userland. + * sigsaved: saved sigmask when the syscall started and changed the sigmask to + * usigmask. + * + * This is useful for syscalls such as ppoll, pselect, io_pgetevents and + * epoll_pwait where a new sigmask is passed in from userland for the syscalls. + */ +void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) +{ + + if (!usigmask) + return; + /* + * When signals are pending, do not restore them here. + * Restoring sigmask here can lead to delivering signals that the above + * syscalls are intended to block because of the sigmask passed in. + */ + if (signal_pending(current)) { + current->saved_sigmask = *sigsaved; + set_restore_sigmask(); + return; + } + + /* + * This is needed because the fast syscall return path does not restore + * saved_sigmask when signals are not pending. + */ + set_current_blocked(sigsaved); +} +EXPORT_SYMBOL(restore_user_sigmask); + /** * sys_rt_sigprocmask - change the list of currently blocked signals * @how: whether to add, remove, or set signals -- cgit From 04e7712f4460585e5eed5b853fd8b82a9943958f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Apr 2018 16:31:07 +0200 Subject: y2038: futex: Move compat implementation into futex.c We are going to share the compat_sys_futex() handler between 64-bit architectures and 32-bit architectures that need to deal with both 32-bit and 64-bit time_t, and this is easier if both entry points are in the same file. In fact, most other system call handlers do the same thing these days, so let's follow the trend here and merge all of futex_compat.c into futex.c. In the process, a few minor changes have to be done to make sure everything still makes sense: handle_futex_death() and futex_cmpxchg_enabled() become local symbol, and the compat version of the fetch_robust_entry() function gets renamed to compat_fetch_robust_entry() to avoid a symbol clash. This is intended as a purely cosmetic patch, no behavior should change. Signed-off-by: Arnd Bergmann --- include/linux/futex.h | 8 -- kernel/Makefile | 3 - kernel/futex.c | 195 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/futex_compat.c | 202 -------------------------------------------------- 4 files changed, 192 insertions(+), 216 deletions(-) delete mode 100644 kernel/futex_compat.c (limited to 'kernel') diff --git a/include/linux/futex.h b/include/linux/futex.h index 821ae502d3d8..ccaef0097785 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -9,9 +9,6 @@ struct inode; struct mm_struct; struct task_struct; -extern int -handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi); - /* * Futexes are matched on equal values of this key. * The key type depends on whether it's a shared or private mapping. @@ -55,11 +52,6 @@ extern void exit_robust_list(struct task_struct *curr); long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3); -#ifdef CONFIG_HAVE_FUTEX_CMPXCHG -#define futex_cmpxchg_enabled 1 -#else -extern int futex_cmpxchg_enabled; -#endif #else static inline void exit_robust_list(struct task_struct *curr) { diff --git a/kernel/Makefile b/kernel/Makefile index 7343b3a9bff0..8e40a6742d23 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -49,9 +49,6 @@ obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_FUTEX) += futex.o -ifeq ($(CONFIG_COMPAT),y) -obj-$(CONFIG_FUTEX) += futex_compat.o -endif obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) diff --git a/kernel/futex.c b/kernel/futex.c index f423f9b6577e..5cc7c3b098e9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -44,6 +44,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include @@ -173,8 +174,10 @@ * double_lock_hb() and double_unlock_hb(), respectively. */ -#ifndef CONFIG_HAVE_FUTEX_CMPXCHG -int __read_mostly futex_cmpxchg_enabled; +#ifdef CONFIG_HAVE_FUTEX_CMPXCHG +#define futex_cmpxchg_enabled 1 +#else +static int __read_mostly futex_cmpxchg_enabled; #endif /* @@ -3360,7 +3363,7 @@ err_unlock: * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) +static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { u32 uval, uninitialized_var(nval), mval; @@ -3589,6 +3592,192 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } +#ifdef CONFIG_COMPAT +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int +compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t __user *head, unsigned int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + +static void __user *futex_uaddr(struct robust_list __user *entry, + compat_long_t futex_offset) +{ + compat_uptr_t base = ptr_to_compat(entry); + void __user *uaddr = compat_ptr(base + futex_offset); + + return uaddr; +} + +/* + * Walk curr->robust_list (very carefully, it's a userspace list!) + * and mark any locks found there dead, and notify any waiters. + * + * We silently return on any sign of list-walking problem. + */ +void compat_exit_robust_list(struct task_struct *curr) +{ + struct compat_robust_list_head __user *head = curr->compat_robust_list; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + unsigned int uninitialized_var(next_pi); + compat_uptr_t uentry, next_uentry, upending; + compat_long_t futex_offset; + int rc; + + if (!futex_cmpxchg_enabled) + return; + + /* + * Fetch the list head (which was registered earlier, via + * sys_set_robust_list()): + */ + if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) + return; + /* + * Fetch the relative futex offset: + */ + if (get_user(futex_offset, &head->futex_offset)) + return; + /* + * Fetch any possibly pending lock-add first, and handle it + * if it exists: + */ + if (compat_fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pip)) + return; + + next_entry = NULL; /* avoid warning with gcc */ + while (entry != (struct robust_list __user *) &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = compat_fetch_robust_entry(&next_uentry, &next_entry, + (compat_uptr_t __user *)&entry->next, &next_pi); + /* + * A pending lock might already be on the list, so + * dont process it twice: + */ + if (entry != pending) { + void __user *uaddr = futex_uaddr(entry, futex_offset); + + if (handle_futex_death(uaddr, curr, pi)) + return; + } + if (rc) + return; + uentry = next_uentry; + entry = next_entry; + pi = next_pi; + /* + * Avoid excessively long or circular lists: + */ + if (!--limit) + break; + + cond_resched(); + } + if (pending) { + void __user *uaddr = futex_uaddr(pending, futex_offset); + + handle_futex_death(uaddr, curr, pip); + } +} + +COMPAT_SYSCALL_DEFINE2(set_robust_list, + struct compat_robust_list_head __user *, head, + compat_size_t, len) +{ + if (!futex_cmpxchg_enabled) + return -ENOSYS; + + if (unlikely(len != sizeof(*head))) + return -EINVAL; + + current->compat_robust_list = head; + + return 0; +} + +COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + compat_uptr_t __user *, head_ptr, + compat_size_t __user *, len_ptr) +{ + struct compat_robust_list_head __user *head; + unsigned long ret; + struct task_struct *p; + + if (!futex_cmpxchg_enabled) + return -ENOSYS; + + rcu_read_lock(); + + ret = -ESRCH; + if (!pid) + p = current; + else { + p = find_task_by_vpid(pid); + if (!p) + goto err_unlock; + } + + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) + goto err_unlock; + + head = p->compat_robust_list; + rcu_read_unlock(); + + if (put_user(sizeof(*head), len_ptr)) + return -EFAULT; + return put_user(ptr_to_compat(head), head_ptr); + +err_unlock: + rcu_read_unlock(); + + return ret; +} + +COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct old_timespec32 __user *, utime, u32 __user *, uaddr2, + u32, val3) +{ + struct timespec ts; + ktime_t t, *tp = NULL; + int val2 = 0; + int cmd = op & FUTEX_CMD_MASK; + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (compat_get_timespec(&ts, utime)) + return -EFAULT; + if (!timespec_valid(&ts)) + return -EINVAL; + + t = timespec_to_ktime(ts); + if (cmd == FUTEX_WAIT) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (int) (unsigned long) utime; + + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); +} +#endif /* CONFIG_COMPAT */ + static void __init futex_detect_cmpxchg(void) { #ifndef CONFIG_HAVE_FUTEX_CMPXCHG diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c deleted file mode 100644 index 410a77a8f6e2..000000000000 --- a/kernel/futex_compat.c +++ /dev/null @@ -1,202 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/kernel/futex_compat.c - * - * Futex compatibililty routines. - * - * Copyright 2006, Red Hat, Inc., Ingo Molnar - */ - -#include -#include -#include -#include -#include -#include - -#include - - -/* - * Fetch a robust-list pointer. Bit 0 signals PI futexes: - */ -static inline int -fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, - compat_uptr_t __user *head, unsigned int *pi) -{ - if (get_user(*uentry, head)) - return -EFAULT; - - *entry = compat_ptr((*uentry) & ~1); - *pi = (unsigned int)(*uentry) & 1; - - return 0; -} - -static void __user *futex_uaddr(struct robust_list __user *entry, - compat_long_t futex_offset) -{ - compat_uptr_t base = ptr_to_compat(entry); - void __user *uaddr = compat_ptr(base + futex_offset); - - return uaddr; -} - -/* - * Walk curr->robust_list (very carefully, it's a userspace list!) - * and mark any locks found there dead, and notify any waiters. - * - * We silently return on any sign of list-walking problem. - */ -void compat_exit_robust_list(struct task_struct *curr) -{ - struct compat_robust_list_head __user *head = curr->compat_robust_list; - struct robust_list __user *entry, *next_entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int uninitialized_var(next_pi); - compat_uptr_t uentry, next_uentry, upending; - compat_long_t futex_offset; - int rc; - - if (!futex_cmpxchg_enabled) - return; - - /* - * Fetch the list head (which was registered earlier, via - * sys_set_robust_list()): - */ - if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) - return; - /* - * Fetch the relative futex offset: - */ - if (get_user(futex_offset, &head->futex_offset)) - return; - /* - * Fetch any possibly pending lock-add first, and handle it - * if it exists: - */ - if (fetch_robust_entry(&upending, &pending, - &head->list_op_pending, &pip)) - return; - - next_entry = NULL; /* avoid warning with gcc */ - while (entry != (struct robust_list __user *) &head->list) { - /* - * Fetch the next entry in the list before calling - * handle_futex_death: - */ - rc = fetch_robust_entry(&next_uentry, &next_entry, - (compat_uptr_t __user *)&entry->next, &next_pi); - /* - * A pending lock might already be on the list, so - * dont process it twice: - */ - if (entry != pending) { - void __user *uaddr = futex_uaddr(entry, futex_offset); - - if (handle_futex_death(uaddr, curr, pi)) - return; - } - if (rc) - return; - uentry = next_uentry; - entry = next_entry; - pi = next_pi; - /* - * Avoid excessively long or circular lists: - */ - if (!--limit) - break; - - cond_resched(); - } - if (pending) { - void __user *uaddr = futex_uaddr(pending, futex_offset); - - handle_futex_death(uaddr, curr, pip); - } -} - -COMPAT_SYSCALL_DEFINE2(set_robust_list, - struct compat_robust_list_head __user *, head, - compat_size_t, len) -{ - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - if (unlikely(len != sizeof(*head))) - return -EINVAL; - - current->compat_robust_list = head; - - return 0; -} - -COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - compat_uptr_t __user *, head_ptr, - compat_size_t __user *, len_ptr) -{ - struct compat_robust_list_head __user *head; - unsigned long ret; - struct task_struct *p; - - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - rcu_read_lock(); - - ret = -ESRCH; - if (!pid) - p = current; - else { - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - } - - ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) - goto err_unlock; - - head = p->compat_robust_list; - rcu_read_unlock(); - - if (put_user(sizeof(*head), len_ptr)) - return -EFAULT; - return put_user(ptr_to_compat(head), head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; -} - -COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct old_timespec32 __user *, utime, u32 __user *, uaddr2, - u32, val3) -{ - struct timespec ts; - ktime_t t, *tp = NULL; - int val2 = 0; - int cmd = op & FUTEX_CMD_MASK; - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { - if (compat_get_timespec(&ts, utime)) - return -EFAULT; - if (!timespec_valid(&ts)) - return -EINVAL; - - t = timespec_to_ktime(ts); - if (cmd == FUTEX_WAIT) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (int) (unsigned long) utime; - - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -} -- cgit From bec2f7cbb73eadf5e1cc7d54ecb0980ede244257 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Apr 2018 17:23:35 +0200 Subject: y2038: futex: Add support for __kernel_timespec This prepares sys_futex for y2038 safe calling: the native syscall is changed to receive a __kernel_timespec argument, which will be switched to 64-bit time_t in the future. All the internal time handling gets changed to timespec64, and the compat_sys_futex entry point is moved under the CONFIG_COMPAT_32BIT_TIME check to provide compatibility for existing 32-bit architectures. Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 2 +- kernel/futex.c | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a27cf407de92..247ad9eca955 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -553,7 +553,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags); /* kernel/futex.c */ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, - struct timespec __user *utime, u32 __user *uaddr2, + struct __kernel_timespec __user *utime, u32 __user *uaddr2, u32 val3); asmlinkage long sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, diff --git a/kernel/futex.c b/kernel/futex.c index 5cc7c3b098e9..b305beaab739 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3558,10 +3558,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct timespec __user *, utime, u32 __user *, uaddr2, + struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) { - struct timespec ts; + struct timespec64 ts; ktime_t t, *tp = NULL; u32 val2 = 0; int cmd = op & FUTEX_CMD_MASK; @@ -3571,12 +3571,12 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, cmd == FUTEX_WAIT_REQUEUE_PI)) { if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) return -EFAULT; - if (copy_from_user(&ts, utime, sizeof(ts)) != 0) + if (get_timespec64(&ts, utime)) return -EFAULT; - if (!timespec_valid(&ts)) + if (!timespec64_valid(&ts)) return -EINVAL; - t = timespec_to_ktime(ts); + t = timespec64_to_ktime(ts); if (cmd == FUTEX_WAIT) t = ktime_add_safe(ktime_get(), t); tp = &t; @@ -3747,12 +3747,14 @@ err_unlock: return ret; } +#endif /* CONFIG_COMPAT */ +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { - struct timespec ts; + struct timespec64 ts; ktime_t t, *tp = NULL; int val2 = 0; int cmd = op & FUTEX_CMD_MASK; @@ -3760,12 +3762,12 @@ COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI)) { - if (compat_get_timespec(&ts, utime)) + if (get_old_timespec32(&ts, utime)) return -EFAULT; - if (!timespec_valid(&ts)) + if (!timespec64_valid(&ts)) return -EINVAL; - t = timespec_to_ktime(ts); + t = timespec64_to_ktime(ts); if (cmd == FUTEX_WAIT) t = ktime_add_safe(ktime_get(), t); tp = &t; @@ -3776,7 +3778,7 @@ COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_COMPAT_32BIT_TIME */ static void __init futex_detect_cmpxchg(void) { -- cgit From 2dc6b100f928aac8d7532bf7112d3f8d3f952bad Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Wed, 5 Dec 2018 13:52:34 -0500 Subject: bpf: interpreter support BPF_ALU | BPF_ARSH This patch implements interpreting BPF_ALU | BPF_ARSH. Do arithmetic right shift on low 32-bit sub-register, and zero the high 32 bits. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 52 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 628b3970a49b..a5b223ef7131 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -933,32 +933,34 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); #define BPF_INSN_MAP(INSN_2, INSN_3) \ /* 32 bit ALU operations. */ \ /* Register based. */ \ - INSN_3(ALU, ADD, X), \ - INSN_3(ALU, SUB, X), \ - INSN_3(ALU, AND, X), \ - INSN_3(ALU, OR, X), \ - INSN_3(ALU, LSH, X), \ - INSN_3(ALU, RSH, X), \ - INSN_3(ALU, XOR, X), \ - INSN_3(ALU, MUL, X), \ - INSN_3(ALU, MOV, X), \ - INSN_3(ALU, DIV, X), \ - INSN_3(ALU, MOD, X), \ + INSN_3(ALU, ADD, X), \ + INSN_3(ALU, SUB, X), \ + INSN_3(ALU, AND, X), \ + INSN_3(ALU, OR, X), \ + INSN_3(ALU, LSH, X), \ + INSN_3(ALU, RSH, X), \ + INSN_3(ALU, XOR, X), \ + INSN_3(ALU, MUL, X), \ + INSN_3(ALU, MOV, X), \ + INSN_3(ALU, ARSH, X), \ + INSN_3(ALU, DIV, X), \ + INSN_3(ALU, MOD, X), \ INSN_2(ALU, NEG), \ INSN_3(ALU, END, TO_BE), \ INSN_3(ALU, END, TO_LE), \ /* Immediate based. */ \ - INSN_3(ALU, ADD, K), \ - INSN_3(ALU, SUB, K), \ - INSN_3(ALU, AND, K), \ - INSN_3(ALU, OR, K), \ - INSN_3(ALU, LSH, K), \ - INSN_3(ALU, RSH, K), \ - INSN_3(ALU, XOR, K), \ - INSN_3(ALU, MUL, K), \ - INSN_3(ALU, MOV, K), \ - INSN_3(ALU, DIV, K), \ - INSN_3(ALU, MOD, K), \ + INSN_3(ALU, ADD, K), \ + INSN_3(ALU, SUB, K), \ + INSN_3(ALU, AND, K), \ + INSN_3(ALU, OR, K), \ + INSN_3(ALU, LSH, K), \ + INSN_3(ALU, RSH, K), \ + INSN_3(ALU, XOR, K), \ + INSN_3(ALU, MUL, K), \ + INSN_3(ALU, MOV, K), \ + INSN_3(ALU, ARSH, K), \ + INSN_3(ALU, DIV, K), \ + INSN_3(ALU, MOD, K), \ /* 64 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU64, ADD, X), \ @@ -1137,6 +1139,12 @@ select_insn: DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; insn++; CONT; + ALU_ARSH_X: + DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); + CONT; + ALU_ARSH_K: + DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); + CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; CONT; -- cgit From c49f7dbd4f9c2c49df7fc0f5b50c1350ee7e01ee Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Wed, 5 Dec 2018 13:52:35 -0500 Subject: bpf: verifier remove the rejection on BPF_ALU | BPF_ARSH This patch remove the rejection on BPF_ALU | BPF_ARSH as we have supported them on interpreter and all JIT back-ends Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7658c61c1a88..2752d35ad073 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3649,11 +3649,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { - verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); - return -EINVAL; - } - if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; -- cgit From db6638d7d177a8bc74c9e539e2e0d7d061c767b1 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 5 Dec 2018 12:10:35 -0500 Subject: blkcg: remove bio->bi_css and instead use bio->bi_blkg Prior patches ensured that any bio that interacts with a request_queue is properly associated with a blkg. This makes bio->bi_css unnecessary as blkg maintains a reference to blkcg already. This removes the bio field bi_css and transfers corresponding uses to access via bi_blkg. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/bio.c | 59 ++++++++++------------------------------------ block/bounce.c | 2 +- drivers/block/loop.c | 5 ++-- drivers/md/raid0.c | 2 +- include/linux/bio.h | 11 ++++----- include/linux/blk-cgroup.h | 8 +++---- include/linux/blk_types.h | 7 +++--- kernel/trace/blktrace.c | 4 ++-- 8 files changed, 32 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/block/bio.c b/block/bio.c index b42477b6a225..2b6bc7b805ec 100644 --- a/block/bio.c +++ b/block/bio.c @@ -610,7 +610,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; - bio_clone_blkcg_association(bio, bio_src); + bio_clone_blkg_association(bio, bio_src); blkcg_bio_issue_init(bio); } EXPORT_SYMBOL(__bio_clone_fast); @@ -1957,34 +1957,6 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP -/** - * bio_associate_blkcg - associate a bio with the specified blkcg - * @bio: target bio - * @blkcg_css: css of the blkcg to associate - * - * Associate @bio with the blkcg specified by @blkcg_css. Block layer will - * treat @bio as if it were issued by a task which belongs to the blkcg. - * - * This function takes an extra reference of @blkcg_css which will be put - * when @bio is released. The caller must own @bio and is responsible for - * synchronizing calls to this function. If @blkcg_css is %NULL, a call to - * blkcg_get_css() finds the current css from the kthread or task. - */ -int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) -{ - if (unlikely(bio->bi_css)) - return -EBUSY; - - if (blkcg_css) - css_get(blkcg_css); - else - blkcg_css = blkcg_get_css(); - - bio->bi_css = blkcg_css; - return 0; -} -EXPORT_SYMBOL_GPL(bio_associate_blkcg); - /** * bio_disassociate_blkg - puts back the blkg reference if associated * @bio: target bio @@ -1994,6 +1966,8 @@ EXPORT_SYMBOL_GPL(bio_associate_blkcg); void bio_disassociate_blkg(struct bio *bio) { if (bio->bi_blkg) { + /* a ref is always taken on css */ + css_put(&bio_blkcg(bio)->css); blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } @@ -2047,7 +2021,6 @@ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { css_get(css); - bio->bi_css = css; __bio_associate_blkg_from_css(bio, css); } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); @@ -2066,13 +2039,10 @@ void bio_associate_blkg_from_page(struct bio *bio, struct page *page) { struct cgroup_subsys_state *css; - if (unlikely(bio->bi_css)) - return; if (!page->mem_cgroup) return; css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); - bio->bi_css = css; __bio_associate_blkg_from_css(bio, css); } #endif /* CONFIG_MEMCG */ @@ -2094,8 +2064,10 @@ void bio_associate_blkg(struct bio *bio) rcu_read_lock(); - bio_associate_blkcg(bio, NULL); - blkcg = bio_blkcg(bio); + if (bio->bi_blkg) + blkcg = bio->bi_blkg->blkcg; + else + blkcg = css_to_blkcg(blkcg_get_css()); if (!blkcg->css.parent) { __bio_associate_blkg(bio, q->root_blkg); @@ -2115,27 +2087,22 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg); */ void bio_disassociate_task(struct bio *bio) { - if (bio->bi_css) { - css_put(bio->bi_css); - bio->bi_css = NULL; - } bio_disassociate_blkg(bio); } /** - * bio_clone_blkcg_association - clone blkcg association from src to dst bio + * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ -void bio_clone_blkcg_association(struct bio *dst, struct bio *src) +void bio_clone_blkg_association(struct bio *dst, struct bio *src) { - if (src->bi_css) - WARN_ON(bio_associate_blkcg(dst, src->bi_css)); - - if (src->bi_blkg) + if (src->bi_blkg) { + css_get(&bio_blkcg(src)->css); __bio_associate_blkg(dst, src->bi_blkg); + } } -EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); +EXPORT_SYMBOL_GPL(bio_clone_blkg_association); #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/block/bounce.c b/block/bounce.c index cfb96d5170d0..ffb9e9ecfa7e 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -277,7 +277,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, } } - bio_clone_blkcg_association(bio, bio_src); + bio_clone_blkg_association(bio, bio_src); blkcg_bio_issue_init(bio); return bio; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 176ab1f28eca..0770004616de 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,6 +77,7 @@ #include #include #include +#include #include "loop.h" @@ -1820,8 +1821,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, /* always use the first bio's css */ #ifdef CONFIG_BLK_CGROUP - if (cmd->use_aio && rq->bio && rq->bio->bi_css) { - cmd->css = rq->bio->bi_css; + if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) { + cmd->css = &bio_blkcg(rq->bio)->css; css_get(cmd->css); } else #endif diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ac1cffd2a09b..f3fb5bb8c82a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) !discard_bio) continue; bio_chain(discard_bio, bio); - bio_clone_blkcg_association(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), diff --git a/include/linux/bio.h b/include/linux/bio.h index f0438061a5a3..84e1c4dc703a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -498,7 +498,7 @@ do { \ do { \ (dst)->bi_disk = (src)->bi_disk; \ (dst)->bi_partno = (src)->bi_partno; \ - bio_clone_blkcg_association(dst, src); \ + bio_clone_blkg_association(dst, src); \ } while (0) #define bio_dev(bio) \ @@ -512,24 +512,21 @@ static inline void bio_associate_blkg_from_page(struct bio *bio, #endif #ifdef CONFIG_BLK_CGROUP -int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); void bio_disassociate_blkg(struct bio *bio); void bio_associate_blkg(struct bio *bio); void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css); void bio_disassociate_task(struct bio *bio); -void bio_clone_blkcg_association(struct bio *dst, struct bio *src); +void bio_clone_blkg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ -static inline int bio_associate_blkcg(struct bio *bio, - struct cgroup_subsys_state *blkcg_css) { return 0; } static inline void bio_disassociate_blkg(struct bio *bio) { } static inline void bio_associate_blkg(struct bio *bio) { } static inline void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { } static inline void bio_disassociate_task(struct bio *bio) { } -static inline void bio_clone_blkcg_association(struct bio *dst, - struct bio *src) { } +static inline void bio_clone_blkg_association(struct bio *dst, + struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_HIGHMEM diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 8b069c3775ee..f11c37f8ce09 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -309,8 +309,8 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) */ static inline struct blkcg *__bio_blkcg(struct bio *bio) { - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; return css_to_blkcg(blkcg_css()); } @@ -324,8 +324,8 @@ static inline struct blkcg *__bio_blkcg(struct bio *bio) */ static inline struct blkcg *bio_blkcg(struct bio *bio) { - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; return NULL; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c0ba1a038ff3..46c005d601ac 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -174,10 +174,11 @@ struct bio { void *bi_private; #ifdef CONFIG_BLK_CGROUP /* - * Optional css associated with this bio. Put on bio - * release. Read comment on top of bio_associate_current(). + * Represents the association of the css and request_queue for the bio. + * If a bio goes direct to device, it will not have a blkg as it will + * not have a request_queue associated with it. The reference is put + * on release of the bio. */ - struct cgroup_subsys_state *bi_css; struct blkcg_gq *bi_blkg; struct bio_issue bi_issue; #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2868d85f1fb1..fac0ddf8a8e2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return NULL; - if (!bio->bi_css) + if (!bio->bi_blkg) return NULL; - return cgroup_get_kernfs_id(bio->bi_css->cgroup); + return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); } #else static union kernfs_node_id * -- cgit From fc5a828bfad628c1092194f2814604943561c52d Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 5 Dec 2018 12:10:36 -0500 Subject: blkcg: remove additional reference to the css The previous patch in this series removed carrying around a pointer to the css in blkg. However, the blkg association logic still relied on taking a reference on the css to ensure we wouldn't fail in getting a reference for the blkg. Here the implicit dependency on the css is removed. The association continues to rely on the tryget logic walking up the blkg tree. This streamlines the three ways that association can happen: normal, swap, and writeback. Signed-off-by: Dennis Zhou Acked-by: Tejun Heo Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- block/bio.c | 66 ++++++++++++++++++++-------------------------- include/linux/blk-cgroup.h | 41 ---------------------------- include/linux/cgroup.h | 2 ++ kernel/cgroup/cgroup.c | 48 ++++++++++++++++++++++++++------- 4 files changed, 69 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/block/bio.c b/block/bio.c index 2b6bc7b805ec..ce1e512dca5a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1966,8 +1966,6 @@ EXPORT_SYMBOL(bioset_init_from_src); void bio_disassociate_blkg(struct bio *bio) { if (bio->bi_blkg) { - /* a ref is always taken on css */ - css_put(&bio_blkcg(bio)->css); blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } @@ -1995,33 +1993,31 @@ static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) bio->bi_blkg = blkg_try_get_closest(blkg); } -static void __bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ - struct blkcg_gq *blkg; - - rcu_read_lock(); - - blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue); - __bio_associate_blkg(bio, blkg); - - rcu_read_unlock(); -} - /** * bio_associate_blkg_from_css - associate a bio with a specified css * @bio: target bio * @css: target css * * Associate @bio with the blkg found by combining the css's blkg and the - * request_queue of the @bio. This takes a reference on the css that will - * be put upon freeing of @bio. + * request_queue of the @bio. This falls back to the queue's root_blkg if + * the association fails with the css. */ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { - css_get(css); - __bio_associate_blkg_from_css(bio, css); + struct request_queue *q = bio->bi_disk->queue; + struct blkcg_gq *blkg; + + rcu_read_lock(); + + if (!css || !css->parent) + blkg = q->root_blkg; + else + blkg = blkg_lookup_create(css_to_blkcg(css), q); + + __bio_associate_blkg(bio, blkg); + + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); @@ -2032,8 +2028,8 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); * @page: the page to lookup the blkcg from * * Associate @bio with the blkg from @page's owning memcg and the respective - * request_queue. This works like every other associate function wrt - * references. + * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's + * root_blkg. */ void bio_associate_blkg_from_page(struct bio *bio, struct page *page) { @@ -2042,8 +2038,12 @@ void bio_associate_blkg_from_page(struct bio *bio, struct page *page) if (!page->mem_cgroup) return; - css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); - __bio_associate_blkg_from_css(bio, css); + rcu_read_lock(); + + css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); + bio_associate_blkg_from_css(bio, css); + + rcu_read_unlock(); } #endif /* CONFIG_MEMCG */ @@ -2058,24 +2058,16 @@ void bio_associate_blkg_from_page(struct bio *bio, struct page *page) */ void bio_associate_blkg(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; - struct blkcg *blkcg; - struct blkcg_gq *blkg; + struct cgroup_subsys_state *css; rcu_read_lock(); if (bio->bi_blkg) - blkcg = bio->bi_blkg->blkcg; + css = &bio_blkcg(bio)->css; else - blkcg = css_to_blkcg(blkcg_get_css()); + css = blkcg_css(); - if (!blkcg->css.parent) { - __bio_associate_blkg(bio, q->root_blkg); - } else { - blkg = blkg_lookup_create(blkcg, q); - - __bio_associate_blkg(bio, blkg); - } + bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } @@ -2097,10 +2089,8 @@ void bio_disassociate_task(struct bio *bio) */ void bio_clone_blkg_association(struct bio *dst, struct bio *src) { - if (src->bi_blkg) { - css_get(&bio_blkcg(src)->css); + if (src->bi_blkg) __bio_associate_blkg(dst, src->bi_blkg); - } } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); #endif /* CONFIG_BLK_CGROUP */ diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index f11c37f8ce09..284819a4d122 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -247,47 +247,6 @@ static inline struct cgroup_subsys_state *blkcg_css(void) return task_css(current, io_cgrp_id); } -/** - * blkcg_get_css - find and get a reference to the css - * - * Find the css associated with either the kthread or the current task. - * This takes a reference on the blkcg which will need to be managed by the - * caller. - */ -static inline struct cgroup_subsys_state *blkcg_get_css(void) -{ - struct cgroup_subsys_state *css; - - rcu_read_lock(); - - css = kthread_blkcg(); - if (css) { - css_get(css); - } else { - /* - * This is a bit complicated. It is possible task_css() is - * seeing an old css pointer here. This is caused by the - * current thread migrating away from this cgroup and this - * cgroup dying. css_tryget() will fail when trying to take a - * ref on a cgroup that's ref count has hit 0. - * - * Therefore, if it does fail, this means current must have - * been swapped away already and this is waiting for it to - * propagate on the polling cpu. Hence the use of cpu_relax(). - */ - while (true) { - css = task_css(current, io_cgrp_id); - if (likely(css_tryget(css))) - break; - cpu_relax(); - } - } - - rcu_read_unlock(); - - return css; -} - static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9d12757a65b0..9968332cceed 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -93,6 +93,8 @@ extern struct css_set init_css_set; bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, + struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6aaf5dd5383b..8b79318810ad 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -493,7 +493,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, } /** - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * @@ -502,8 +502,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ -static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) +static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, + struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); @@ -523,6 +523,35 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, return cgroup_css(cgrp, ss); } +/** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get the effective css of @cgrp for @ss. The effective css is + * defined as the matching css of the nearest ancestor including self which + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, + * the root css is returned, so this function always returns a valid css. + * + * The returned css is not guaranteed to be online, and therefore it is the + * callers responsiblity to tryget a reference for it. + */ +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + do { + css = cgroup_css(cgrp, ss); + + if (css) + return css; + cgrp = cgroup_parent(cgrp); + } while (cgrp); + + return init_css_set.subsys[ss->id]; +} + /** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest @@ -605,10 +634,11 @@ EXPORT_SYMBOL_GPL(of_css); * * Should be called under cgroup_[tree_]mutex. */ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ - ; \ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css_by_mask(cgrp, \ + cgroup_subsys[(ssid)]))) \ + ; \ else /** @@ -1007,7 +1037,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ - template[i] = cgroup_e_css(cgrp, ss); + template[i] = cgroup_e_css_by_mask(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want @@ -3024,7 +3054,7 @@ static int cgroup_apply_control(struct cgroup *cgrp) return ret; /* - * At this point, cgroup_e_css() results reflect the new csses + * At this point, cgroup_e_css_by_mask() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ -- cgit From 761efe8a94cfcd0a3dd90f2008411550f3520b63 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 18 Nov 2018 18:44:04 -0500 Subject: function_graph: Remove the use of FTRACE_NOTRACE_DEPTH The curr_ret_stack is no longer set to a negative value when a function is not to be traced by the function graph tracer. Remove the usage of FTRACE_NOTRACE_DEPTH, as it is no longer needed. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 1 - kernel/trace/fgraph.c | 19 ------------------- kernel/trace/trace_functions_graph.c | 11 ----------- 3 files changed, 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 10bd46434908..98625f10d982 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -790,7 +790,6 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, */ #define __notrace_funcgraph notrace -#define FTRACE_NOTRACE_DEPTH 65536 #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index e852b69c0e64..de887a983ac7 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -112,16 +112,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, index = current->curr_ret_stack; - /* - * A negative index here means that it's just returned from a - * notrace'd function. Recover index to get an original - * return address. See ftrace_push_return_trace(). - * - * TODO: Need to check whether the stack gets corrupted. - */ - if (index < 0) - index += FTRACE_NOTRACE_DEPTH; - if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { ftrace_graph_stop(); WARN_ON(1); @@ -190,15 +180,6 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) */ barrier(); current->curr_ret_stack--; - /* - * The curr_ret_stack can be less than -1 only if it was - * filtered out and it's about to return from the function. - * Recover the index and continue to trace normal functions. - */ - if (current->curr_ret_stack < -1) { - current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; - return ret; - } if (unlikely(!ret)) { ftrace_graph_stop(); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ecf543df943b..eaf9b1629956 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -115,9 +115,6 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, if (ret != (unsigned long)return_to_handler) return ret; - if (index < -1) - index += FTRACE_NOTRACE_DEPTH; - if (index < 0) return ret; @@ -675,10 +672,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data = per_cpu_ptr(data->cpu_data, cpu); - /* If a graph tracer ignored set_graph_notrace */ - if (call->depth < -1) - call->depth += FTRACE_NOTRACE_DEPTH; - /* * Comments display at + 1 to depth. Since * this is a leaf function, keep the comments @@ -721,10 +714,6 @@ print_graph_entry_nested(struct trace_iterator *iter, struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - /* If a graph tracer ignored set_graph_notrace */ - if (call->depth < -1) - call->depth += FTRACE_NOTRACE_DEPTH; - cpu_data = per_cpu_ptr(data->cpu_data, cpu); cpu_data->depth = call->depth; -- cgit From 3306fc4aff464f9c08c8899695a218f4b1125d4a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 15 Nov 2018 12:32:38 -0500 Subject: ftrace: Create new ftrace_internal.h header In order to move function graph infrastructure into its own file (fgraph.h) it needs to access various functions and variables in ftrace.c that are currently static. Create a new file called ftrace-internal.h that holds the function prototypes and the extern declarations of the variables needed by fgraph.c as well, and make them global in ftrace.c such that they can be used outside that file. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 76 ++++++++---------------------------------- kernel/trace/ftrace_internal.h | 75 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 62 deletions(-) create mode 100644 kernel/trace/ftrace_internal.h (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 77734451cb05..52c89428b0db 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -40,6 +40,7 @@ #include #include +#include "ftrace_internal.h" #include "trace_output.h" #include "trace_stat.h" @@ -77,7 +78,7 @@ #define ASSIGN_OPS_HASH(opsname, val) #endif -static struct ftrace_ops ftrace_list_end __read_mostly = { +struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, INIT_OPS_HASH(ftrace_list_end) @@ -112,11 +113,11 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops); */ static int ftrace_disabled __read_mostly; -static DEFINE_MUTEX(ftrace_lock); +DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; +struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -static struct ftrace_ops global_ops; +struct ftrace_ops global_ops; #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, @@ -127,26 +128,6 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) #endif -/* - * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw_notrace() is that elements removed from this list - * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle - * concurrent insertions into the ftrace_global_list. - * - * Silly Alpha and silly pointer-speculation compiler optimizations! - */ -#define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw_notrace(list); \ - do - -/* - * Optimized for just a single item in the list (as that is the normal case). - */ -#define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ - unlikely((op) != &ftrace_list_end)) - static inline void ftrace_ops_init(struct ftrace_ops *ops) { #ifdef CONFIG_DYNAMIC_FTRACE @@ -187,17 +168,11 @@ static void ftrace_sync_ipi(void *data) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void update_function_graph_func(void); - /* Both enabled by default (can be cleared by function_graph tracer flags */ static bool fgraph_sleep_time = true; static bool fgraph_graph_time = true; - -#else -static inline void update_function_graph_func(void) { } #endif - static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* @@ -334,7 +309,7 @@ static int remove_ftrace_ops(struct ftrace_ops __rcu **list, static void ftrace_update_trampoline(struct ftrace_ops *ops); -static int __register_ftrace_function(struct ftrace_ops *ops) +int __register_ftrace_function(struct ftrace_ops *ops) { if (ops->flags & FTRACE_OPS_FL_DELETED) return -EINVAL; @@ -375,7 +350,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) return 0; } -static int __unregister_ftrace_function(struct ftrace_ops *ops) +int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; @@ -1022,9 +997,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) #endif /* CONFIG_FUNCTION_PROFILER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int ftrace_graph_active; -#else -# define ftrace_graph_active 0 +int ftrace_graph_active; #endif #ifdef CONFIG_DYNAMIC_FTRACE @@ -1067,7 +1040,7 @@ static const struct ftrace_hash empty_hash = { }; #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) -static struct ftrace_ops global_ops = { +struct ftrace_ops global_ops = { .func = ftrace_stub, .local_hash.notrace_hash = EMPTY_HASH, .local_hash.filter_hash = EMPTY_HASH, @@ -1503,7 +1476,7 @@ static bool hash_contains_ip(unsigned long ip, * This needs to be called with preemption disabled as * the hashes are freed with call_rcu_sched(). */ -static int +int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { struct ftrace_ops_hash hash; @@ -2682,7 +2655,7 @@ static void ftrace_startup_all(int command) update_all_ops = false; } -static int ftrace_startup(struct ftrace_ops *ops, int command) +int ftrace_startup(struct ftrace_ops *ops, int command) { int ret; @@ -2724,7 +2697,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return 0; } -static int ftrace_shutdown(struct ftrace_ops *ops, int command) +int ftrace_shutdown(struct ftrace_ops *ops, int command) { int ret; @@ -6177,7 +6150,7 @@ void ftrace_init_trace_array(struct trace_array *tr) } #else -static struct ftrace_ops global_ops = { +struct ftrace_ops global_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED | @@ -6194,31 +6167,10 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } static inline void ftrace_startup_all(int command) { } -/* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(ops, command) \ - ({ \ - int ___ret = __register_ftrace_function(ops); \ - if (!___ret) \ - (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ - ___ret; \ - }) -# define ftrace_shutdown(ops, command) \ - ({ \ - int ___ret = __unregister_ftrace_function(ops); \ - if (!___ret) \ - (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \ - ___ret; \ - }) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) -static inline int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) -{ - return 1; -} - static void ftrace_update_trampoline(struct ftrace_ops *ops) { } @@ -6930,7 +6882,7 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) * function against the global ops, and not just trace any function * that any ftrace_ops registered. */ -static void update_function_graph_func(void) +void update_function_graph_func(void) { struct ftrace_ops *op; bool do_test = false; diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h new file mode 100644 index 000000000000..0515a2096f90 --- /dev/null +++ b/kernel/trace/ftrace_internal.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_KERNEL_FTRACE_INTERNAL_H +#define _LINUX_KERNEL_FTRACE_INTERNAL_H + +#ifdef CONFIG_FUNCTION_TRACER + +/* + * Traverse the ftrace_global_list, invoking all entries. The reason that we + * can use rcu_dereference_raw_notrace() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle + * concurrent insertions into the ftrace_global_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ +#define do_for_each_ftrace_op(op, list) \ + op = rcu_dereference_raw_notrace(list); \ + do + +/* + * Optimized for just a single item in the list (as that is the normal case). + */ +#define while_for_each_ftrace_op(op) \ + while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ + unlikely((op) != &ftrace_list_end)) + +extern struct ftrace_ops __rcu *ftrace_ops_list; +extern struct ftrace_ops ftrace_list_end; +extern struct mutex ftrace_lock; +extern struct ftrace_ops global_ops; + +#ifdef CONFIG_DYNAMIC_FTRACE + +int ftrace_startup(struct ftrace_ops *ops, int command); +int ftrace_shutdown(struct ftrace_ops *ops, int command); +int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs); + +#else /* !CONFIG_DYNAMIC_FTRACE */ + +int __register_ftrace_function(struct ftrace_ops *ops); +int __unregister_ftrace_function(struct ftrace_ops *ops); +/* Keep as macros so we do not need to define the commands */ +# define ftrace_startup(ops, command) \ + ({ \ + int ___ret = __register_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) +# define ftrace_shutdown(ops, command) \ + ({ \ + int ___ret = __unregister_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) +static inline int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) +{ + return 1; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern int ftrace_graph_active; +void update_function_graph_func(void); +#else /* !CONFIG_FUNCTION_GRAPH_TRACER */ +# define ftrace_graph_active 0 +static inline void update_function_graph_func(void) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#else /* !CONFIG_FUNCTION_TRACER */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#endif -- cgit From c8dd0f45874547e6e77bab03d71feb16c4cb98a8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 23 Nov 2018 13:06:07 -0500 Subject: function_graph: Do not expose the graph_time option when profiler is not configured When the function profiler is not configured, the "graph_time" option is meaningless, as the function profiler is the only thing that makes use of it. Do not expose it if the profiler is not configured. Link: http://lkml.kernel.org/r/20181123061133.GA195223@google.com Reported-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 5 +++++ kernel/trace/trace_functions_graph.c | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f67060a75f38..ab16eca76e59 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -862,7 +862,12 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) extern void ftrace_graph_sleep_time_control(bool enable); + +#ifdef CONFIG_FUNCTION_PROFILER extern void ftrace_graph_graph_time_control(bool enable); +#else +static inline void ftrace_graph_graph_time_control(bool enable) { } +#endif extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index eaf9b1629956..855c13c61e77 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -60,8 +60,12 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, + +#ifdef CONFIG_FUNCTION_PROFILER /* Include time within nested functions */ { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) }, +#endif + { } /* Empty entry */ }; -- cgit From e73e679f656e678b0e7f8961094201f3544f4541 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 15 Nov 2018 12:35:13 -0500 Subject: fgraph: Move function graph specific code into fgraph.c To make the function graph infrastructure more managable, the code needs to be in its own file (fgraph.c). Move the code that is specific for managing the function graph infrastructure out of ftrace.c and into fgraph.c Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 360 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/trace/ftrace.c | 368 +------------------------------------------------- 2 files changed, 366 insertions(+), 362 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index de887a983ac7..374f3e42e29e 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -7,11 +7,27 @@ * * Highly modified by Steven Rostedt (VMware). */ +#include #include +#include -#include "trace.h" +#include + +#include "ftrace_internal.h" + +#ifdef CONFIG_DYNAMIC_FTRACE +#define ASSIGN_OPS_HASH(opsname, val) \ + .func_hash = val, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), +#else +#define ASSIGN_OPS_HASH(opsname, val) +#endif static bool kill_ftrace_graph; +int ftrace_graph_active; + +/* Both enabled by default (can be cleared by function_graph tracer flags */ +static bool fgraph_sleep_time = true; /** * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called @@ -161,6 +177,31 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, barrier(); } +/* + * Hibernation protection. + * The state of the current task is too much unstable during + * suspend/restore to disk. We want to protect against that. + */ +static int +ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, + void *unused) +{ + switch (state) { + case PM_HIBERNATION_PREPARE: + pause_graph_tracing(); + break; + + case PM_POST_HIBERNATION: + unpause_graph_tracing(); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block ftrace_suspend_notifier = { + .notifier_call = ftrace_suspend_notifier_call, +}; + /* * Send the trace to the ring-buffer. * @return the original return address. @@ -190,3 +231,320 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } + +static struct ftrace_ops graph_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_PID | + FTRACE_OPS_FL_STUB, +#ifdef FTRACE_GRAPH_TRAMP_ADDR + .trampoline = FTRACE_GRAPH_TRAMP_ADDR, + /* trampoline_size is only needed for dynamically allocated tramps */ +#endif + ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) +}; + +void ftrace_graph_sleep_time_control(bool enable) +{ + fgraph_sleep_time = enable; +} + +int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) +{ + return 0; +} + +/* The callbacks that hook a function */ +trace_func_graph_ret_t ftrace_graph_return = + (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; +static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; + +/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ +static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +{ + int i; + int ret = 0; + int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; + struct task_struct *g, *t; + + for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { + ret_stack_list[i] = + kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack_list[i]) { + start = 0; + end = i; + ret = -ENOMEM; + goto free; + } + } + + read_lock(&tasklist_lock); + do_each_thread(g, t) { + if (start == end) { + ret = -EAGAIN; + goto unlock; + } + + if (t->ret_stack == NULL) { + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->curr_ret_stack = -1; + t->curr_ret_depth = -1; + /* Make sure the tasks see the -1 first: */ + smp_wmb(); + t->ret_stack = ret_stack_list[start++]; + } + } while_each_thread(g, t); + +unlock: + read_unlock(&tasklist_lock); +free: + for (i = start; i < end; i++) + kfree(ret_stack_list[i]); + return ret; +} + +static void +ftrace_graph_probe_sched_switch(void *ignore, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + unsigned long long timestamp; + int index; + + /* + * Does the user want to count the time a function was asleep. + * If so, do not update the time stamps. + */ + if (fgraph_sleep_time) + return; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. + */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + +static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) +{ + if (!ftrace_ops_test(&global_ops, trace->func, NULL)) + return 0; + return __ftrace_graph_entry(trace); +} + +/* + * The function graph tracer should only trace the functions defined + * by set_ftrace_filter and set_ftrace_notrace. If another function + * tracer ops is registered, the graph tracer requires testing the + * function against the global ops, and not just trace any function + * that any ftrace_ops registered. + */ +void update_function_graph_func(void) +{ + struct ftrace_ops *op; + bool do_test = false; + + /* + * The graph and global ops share the same set of functions + * to test. If any other ops is on the list, then + * the graph tracing needs to test if its the function + * it should call. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op != &global_ops && op != &graph_ops && + op != &ftrace_list_end) { + do_test = true; + /* in double loop, break out with goto */ + goto out; + } + } while_for_each_ftrace_op(op); + out: + if (do_test) + ftrace_graph_entry = ftrace_graph_entry_test; + else + ftrace_graph_entry = __ftrace_graph_entry; +} + +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visible before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + t->curr_ret_depth = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. + */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = + kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + +/* Allocate a return stack for newly created task */ +void ftrace_graph_init_task(struct task_struct *t) +{ + /* Make sure we do not use the parent ret_stack */ + t->ret_stack = NULL; + t->curr_ret_stack = -1; + t->curr_ret_depth = -1; + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + graph_init_task(t, ret_stack); + } +} + +void ftrace_graph_exit_task(struct task_struct *t) +{ + struct ftrace_ret_stack *ret_stack = t->ret_stack; + + t->ret_stack = NULL; + /* NULL must become visible to IRQs before we free it: */ + barrier(); + + kfree(ret_stack); +} + +/* Allocate a return stack for each task */ +static int start_graph_tracing(void) +{ + struct ftrace_ret_stack **ret_stack_list; + int ret, cpu; + + ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE, + sizeof(struct ftrace_ret_stack *), + GFP_KERNEL); + + if (!ret_stack_list) + return -ENOMEM; + + /* The cpu_boot init_task->ret_stack will never be freed */ + for_each_online_cpu(cpu) { + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + } + + do { + ret = alloc_retstack_tasklist(ret_stack_list); + } while (ret == -EAGAIN); + + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + + kfree(ret_stack_list); + return ret; +} + +int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc) +{ + int ret = 0; + + mutex_lock(&ftrace_lock); + + /* we currently allow only one tracer registered at a time */ + if (ftrace_graph_active) { + ret = -EBUSY; + goto out; + } + + register_pm_notifier(&ftrace_suspend_notifier); + + ftrace_graph_active++; + ret = start_graph_tracing(); + if (ret) { + ftrace_graph_active--; + goto out; + } + + ftrace_graph_return = retfunc; + + /* + * Update the indirect function to the entryfunc, and the + * function that gets called to the entry_test first. Then + * call the update fgraph entry function to determine if + * the entryfunc should be called directly or not. + */ + __ftrace_graph_entry = entryfunc; + ftrace_graph_entry = ftrace_graph_entry_test; + update_function_graph_func(); + + ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +void unregister_ftrace_graph(void) +{ + mutex_lock(&ftrace_lock); + + if (unlikely(!ftrace_graph_active)) + goto out; + + ftrace_graph_active--; + ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_entry = ftrace_graph_entry_stub; + __ftrace_graph_entry = ftrace_graph_entry_stub; + ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); + unregister_pm_notifier(&ftrace_suspend_notifier); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); + + out: + mutex_unlock(&ftrace_lock); +} diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 52c89428b0db..c53533b833cf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -167,12 +166,6 @@ static void ftrace_sync_ipi(void *data) smp_rmb(); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -/* Both enabled by default (can be cleared by function_graph tracer flags */ -static bool fgraph_sleep_time = true; -static bool fgraph_graph_time = true; -#endif - static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* @@ -790,6 +783,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, } #ifdef CONFIG_FUNCTION_GRAPH_TRACER +static bool fgraph_graph_time = true; + +void ftrace_graph_graph_time_control(bool enable) +{ + fgraph_graph_time = enable; +} + static int profile_graph_entry(struct ftrace_graph_ent *trace) { int index = current->curr_ret_stack; @@ -996,10 +996,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } #endif /* CONFIG_FUNCTION_PROFILER */ -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -int ftrace_graph_active; -#endif - #ifdef CONFIG_DYNAMIC_FTRACE static struct ftrace_ops *removed_ops; @@ -6697,353 +6693,3 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_unlock(&ftrace_lock); return ret; } - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - -static struct ftrace_ops graph_ops = { - .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | - FTRACE_OPS_FL_PID | - FTRACE_OPS_FL_STUB, -#ifdef FTRACE_GRAPH_TRAMP_ADDR - .trampoline = FTRACE_GRAPH_TRAMP_ADDR, - /* trampoline_size is only needed for dynamically allocated tramps */ -#endif - ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) -}; - -void ftrace_graph_sleep_time_control(bool enable) -{ - fgraph_sleep_time = enable; -} - -void ftrace_graph_graph_time_control(bool enable) -{ - fgraph_graph_time = enable; -} - -int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) -{ - return 0; -} - -/* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = - (trace_func_graph_ret_t)ftrace_stub; -trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; -static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; - -/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ -static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) -{ - int i; - int ret = 0; - int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; - struct task_struct *g, *t; - - for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { - ret_stack_list[i] = - kmalloc_array(FTRACE_RETFUNC_DEPTH, - sizeof(struct ftrace_ret_stack), - GFP_KERNEL); - if (!ret_stack_list[i]) { - start = 0; - end = i; - ret = -ENOMEM; - goto free; - } - } - - read_lock(&tasklist_lock); - do_each_thread(g, t) { - if (start == end) { - ret = -EAGAIN; - goto unlock; - } - - if (t->ret_stack == NULL) { - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->curr_ret_stack = -1; - t->curr_ret_depth = -1; - /* Make sure the tasks see the -1 first: */ - smp_wmb(); - t->ret_stack = ret_stack_list[start++]; - } - } while_each_thread(g, t); - -unlock: - read_unlock(&tasklist_lock); -free: - for (i = start; i < end; i++) - kfree(ret_stack_list[i]); - return ret; -} - -static void -ftrace_graph_probe_sched_switch(void *ignore, bool preempt, - struct task_struct *prev, struct task_struct *next) -{ - unsigned long long timestamp; - int index; - - /* - * Does the user want to count the time a function was asleep. - * If so, do not update the time stamps. - */ - if (fgraph_sleep_time) - return; - - timestamp = trace_clock_local(); - - prev->ftrace_timestamp = timestamp; - - /* only process tasks that we timestamped */ - if (!next->ftrace_timestamp) - return; - - /* - * Update all the counters in next to make up for the - * time next was sleeping. - */ - timestamp -= next->ftrace_timestamp; - - for (index = next->curr_ret_stack; index >= 0; index--) - next->ret_stack[index].calltime += timestamp; -} - -/* Allocate a return stack for each task */ -static int start_graph_tracing(void) -{ - struct ftrace_ret_stack **ret_stack_list; - int ret, cpu; - - ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE, - sizeof(struct ftrace_ret_stack *), - GFP_KERNEL); - - if (!ret_stack_list) - return -ENOMEM; - - /* The cpu_boot init_task->ret_stack will never be freed */ - for_each_online_cpu(cpu) { - if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_idle_task(idle_task(cpu), cpu); - } - - do { - ret = alloc_retstack_tasklist(ret_stack_list); - } while (ret == -EAGAIN); - - if (!ret) { - ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); - if (ret) - pr_info("ftrace_graph: Couldn't activate tracepoint" - " probe to kernel_sched_switch\n"); - } - - kfree(ret_stack_list); - return ret; -} - -/* - * Hibernation protection. - * The state of the current task is too much unstable during - * suspend/restore to disk. We want to protect against that. - */ -static int -ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, - void *unused) -{ - switch (state) { - case PM_HIBERNATION_PREPARE: - pause_graph_tracing(); - break; - - case PM_POST_HIBERNATION: - unpause_graph_tracing(); - break; - } - return NOTIFY_DONE; -} - -static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) -{ - if (!ftrace_ops_test(&global_ops, trace->func, NULL)) - return 0; - return __ftrace_graph_entry(trace); -} - -/* - * The function graph tracer should only trace the functions defined - * by set_ftrace_filter and set_ftrace_notrace. If another function - * tracer ops is registered, the graph tracer requires testing the - * function against the global ops, and not just trace any function - * that any ftrace_ops registered. - */ -void update_function_graph_func(void) -{ - struct ftrace_ops *op; - bool do_test = false; - - /* - * The graph and global ops share the same set of functions - * to test. If any other ops is on the list, then - * the graph tracing needs to test if its the function - * it should call. - */ - do_for_each_ftrace_op(op, ftrace_ops_list) { - if (op != &global_ops && op != &graph_ops && - op != &ftrace_list_end) { - do_test = true; - /* in double loop, break out with goto */ - goto out; - } - } while_for_each_ftrace_op(op); - out: - if (do_test) - ftrace_graph_entry = ftrace_graph_entry_test; - else - ftrace_graph_entry = __ftrace_graph_entry; -} - -static struct notifier_block ftrace_suspend_notifier = { - .notifier_call = ftrace_suspend_notifier_call, -}; - -int register_ftrace_graph(trace_func_graph_ret_t retfunc, - trace_func_graph_ent_t entryfunc) -{ - int ret = 0; - - mutex_lock(&ftrace_lock); - - /* we currently allow only one tracer registered at a time */ - if (ftrace_graph_active) { - ret = -EBUSY; - goto out; - } - - register_pm_notifier(&ftrace_suspend_notifier); - - ftrace_graph_active++; - ret = start_graph_tracing(); - if (ret) { - ftrace_graph_active--; - goto out; - } - - ftrace_graph_return = retfunc; - - /* - * Update the indirect function to the entryfunc, and the - * function that gets called to the entry_test first. Then - * call the update fgraph entry function to determine if - * the entryfunc should be called directly or not. - */ - __ftrace_graph_entry = entryfunc; - ftrace_graph_entry = ftrace_graph_entry_test; - update_function_graph_func(); - - ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); -out: - mutex_unlock(&ftrace_lock); - return ret; -} - -void unregister_ftrace_graph(void) -{ - mutex_lock(&ftrace_lock); - - if (unlikely(!ftrace_graph_active)) - goto out; - - ftrace_graph_active--; - ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; - ftrace_graph_entry = ftrace_graph_entry_stub; - __ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); - unregister_pm_notifier(&ftrace_suspend_notifier); - unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); - - out: - mutex_unlock(&ftrace_lock); -} - -static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); - -static void -graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) -{ - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->ftrace_timestamp = 0; - /* make curr_ret_stack visible before we add the ret_stack */ - smp_wmb(); - t->ret_stack = ret_stack; -} - -/* - * Allocate a return stack for the idle task. May be the first - * time through, or it may be done by CPU hotplug online. - */ -void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) -{ - t->curr_ret_stack = -1; - t->curr_ret_depth = -1; - /* - * The idle task has no parent, it either has its own - * stack or no stack at all. - */ - if (t->ret_stack) - WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); - - if (ftrace_graph_active) { - struct ftrace_ret_stack *ret_stack; - - ret_stack = per_cpu(idle_ret_stack, cpu); - if (!ret_stack) { - ret_stack = - kmalloc_array(FTRACE_RETFUNC_DEPTH, - sizeof(struct ftrace_ret_stack), - GFP_KERNEL); - if (!ret_stack) - return; - per_cpu(idle_ret_stack, cpu) = ret_stack; - } - graph_init_task(t, ret_stack); - } -} - -/* Allocate a return stack for newly created task */ -void ftrace_graph_init_task(struct task_struct *t) -{ - /* Make sure we do not use the parent ret_stack */ - t->ret_stack = NULL; - t->curr_ret_stack = -1; - t->curr_ret_depth = -1; - - if (ftrace_graph_active) { - struct ftrace_ret_stack *ret_stack; - - ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH, - sizeof(struct ftrace_ret_stack), - GFP_KERNEL); - if (!ret_stack) - return; - graph_init_task(t, ret_stack); - } -} - -void ftrace_graph_exit_task(struct task_struct *t) -{ - struct ftrace_ret_stack *ret_stack = t->ret_stack; - - t->ret_stack = NULL; - /* NULL must become visible to IRQs before we free it: */ - barrier(); - - kfree(ret_stack); -} -#endif -- cgit From 317e04ca905ac6c4b33cb879e9a107c412125f14 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Nov 2018 10:26:27 -0500 Subject: tracing: Rearrange functions in trace_sched_wakeup.c Rearrange the functions in trace_sched_wakeup.c so that there are fewer #ifdef CONFIG_FUNCTION_TRACER and #ifdef CONFIG_FUNCTION_GRAPH_TRACER, instead of having the #ifdefs spread all over. No functional change is made. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_sched_wakeup.c | 272 ++++++++++++++++++-------------------- 1 file changed, 130 insertions(+), 142 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7d04b9890755..2ce78100b4d3 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -35,26 +35,19 @@ static arch_spinlock_t wakeup_lock = static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); +static int start_func_tracer(struct trace_array *tr, int graph); +static void stop_func_tracer(struct trace_array *tr, int graph); static int save_flags; #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int wakeup_display_graph(struct trace_array *tr, int set); # define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) #else -static inline int wakeup_display_graph(struct trace_array *tr, int set) -{ - return 0; -} # define is_graph(tr) false #endif - #ifdef CONFIG_FUNCTION_TRACER -static int wakeup_graph_entry(struct ftrace_graph_ent *trace); -static void wakeup_graph_return(struct ftrace_graph_ret *trace); - static bool function_enabled; /* @@ -104,122 +97,8 @@ out_enable: return 0; } -/* - * wakeup uses its own tracer function to keep the overhead down: - */ -static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) -{ - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; - int pc; - - if (!func_prolog_preempt_disable(tr, &data, &pc)) - return; - - local_irq_save(flags); - trace_function(tr, ip, parent_ip, flags, pc); - local_irq_restore(flags); - - atomic_dec(&data->disabled); - preempt_enable_notrace(); -} - -static int register_wakeup_function(struct trace_array *tr, int graph, int set) -{ - int ret; - - /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) - return 0; - - if (graph) - ret = register_ftrace_graph(&wakeup_graph_return, - &wakeup_graph_entry); - else - ret = register_ftrace_function(tr->ops); - - if (!ret) - function_enabled = true; - - return ret; -} - -static void unregister_wakeup_function(struct trace_array *tr, int graph) -{ - if (!function_enabled) - return; - - if (graph) - unregister_ftrace_graph(); - else - unregister_ftrace_function(tr->ops); - - function_enabled = false; -} - -static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) -{ - if (!(mask & TRACE_ITER_FUNCTION)) - return 0; - - if (set) - register_wakeup_function(tr, is_graph(tr), 1); - else - unregister_wakeup_function(tr, is_graph(tr)); - return 1; -} -#else -static int register_wakeup_function(struct trace_array *tr, int graph, int set) -{ - return 0; -} -static void unregister_wakeup_function(struct trace_array *tr, int graph) { } -static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) -{ - return 0; -} -#endif /* CONFIG_FUNCTION_TRACER */ - -static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) -{ - struct tracer *tracer = tr->current_trace; - - if (wakeup_function_set(tr, mask, set)) - return 0; - #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (mask & TRACE_ITER_DISPLAY_GRAPH) - return wakeup_display_graph(tr, set); -#endif - - return trace_keep_overwrite(tracer, mask, set); -} -static int start_func_tracer(struct trace_array *tr, int graph) -{ - int ret; - - ret = register_wakeup_function(tr, graph, 0); - - if (!ret && tracing_is_enabled()) - tracer_enabled = 1; - else - tracer_enabled = 0; - - return ret; -} - -static void stop_func_tracer(struct trace_array *tr, int graph) -{ - tracer_enabled = 0; - - unregister_wakeup_function(tr, graph); -} - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER static int wakeup_display_graph(struct trace_array *tr, int set) { if (!(is_graph(tr) ^ set)) @@ -318,20 +197,94 @@ static void wakeup_print_header(struct seq_file *s) else trace_default_header(s); } +#else /* CONFIG_FUNCTION_GRAPH_TRACER */ +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +#endif /* else CONFIG_FUNCTION_GRAPH_TRACER */ +/* + * wakeup uses its own tracer function to keep the overhead down: + */ static void -__trace_function(struct trace_array *tr, - unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc) +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { - if (is_graph(tr)) - trace_graph_function(tr, ip, parent_ip, flags, pc); + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_irq_save(flags); + trace_function(tr, ip, parent_ip, flags, pc); + local_irq_restore(flags); + + atomic_dec(&data->disabled); + preempt_enable_notrace(); +} + +static int register_wakeup_function(struct trace_array *tr, int graph, int set) +{ + int ret; + + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) + ret = register_ftrace_graph(&wakeup_graph_return, + &wakeup_graph_entry); else - trace_function(tr, ip, parent_ip, flags, pc); + ret = register_ftrace_function(tr->ops); + + if (!ret) + function_enabled = true; + + return ret; } -#else -#define __trace_function trace_function +static void unregister_wakeup_function(struct trace_array *tr, int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(); + else + unregister_ftrace_function(tr->ops); + + function_enabled = false; +} + +static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) +{ + if (!(mask & TRACE_ITER_FUNCTION)) + return 0; + + if (set) + register_wakeup_function(tr, is_graph(tr), 1); + else + unregister_wakeup_function(tr, is_graph(tr)); + return 1; +} +#else /* CONFIG_FUNCTION_TRACER */ +static int register_wakeup_function(struct trace_array *tr, int graph, int set) +{ + return 0; +} +static void unregister_wakeup_function(struct trace_array *tr, int graph) { } +static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) +{ + return 0; +} +#endif /* else CONFIG_FUNCTION_TRACER */ + +#ifndef CONFIG_FUNCTION_GRAPH_TRACER static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; @@ -340,23 +293,58 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) static void wakeup_trace_open(struct trace_iterator *iter) { } static void wakeup_trace_close(struct trace_iterator *iter) { } -#ifdef CONFIG_FUNCTION_TRACER -static int wakeup_graph_entry(struct ftrace_graph_ent *trace) -{ - return -1; -} -static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } static void wakeup_print_header(struct seq_file *s) { trace_default_header(s); } -#else -static void wakeup_print_header(struct seq_file *s) +#endif /* !CONFIG_FUNCTION_GRAPH_TRACER */ + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph(tr)) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} + +static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) { - trace_latency_header(s); + struct tracer *tracer = tr->current_trace; + + if (wakeup_function_set(tr, mask, set)) + return 0; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (mask & TRACE_ITER_DISPLAY_GRAPH) + return wakeup_display_graph(tr, set); +#endif + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_func_tracer(struct trace_array *tr, int graph) +{ + int ret; + + ret = register_wakeup_function(tr, graph, 0); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(struct trace_array *tr, int graph) +{ + tracer_enabled = 0; + + unregister_wakeup_function(tr, graph); } -#endif /* CONFIG_FUNCTION_TRACER */ -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* * Should this new latency be reported/recorded? -- cgit From 688f7089d8851b1a81106f0c0b9b29181b2f2dc8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 15 Nov 2018 14:06:47 -0500 Subject: fgraph: Add new fgraph_ops structure to enable function graph hooks Currently the registering of function graph is to pass in a entry and return function. We need to have a way to associate those functions together where the entry can determine to run the return hook. Having a structure that contains both functions will facilitate the process of converting the code to be able to do such. This is similar to the way function hooks are enabled (it passes in ftrace_ops). Instead of passing in the functions to use, a single structure is passed in to the registering function. The unregister function is now passed in the fgraph_ops handle. When we allow more than one callback to the function graph hooks, this will let the system know which one to remove. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 21 +++++++++++---------- kernel/trace/fgraph.c | 9 ++++----- kernel/trace/ftrace.c | 10 +++++++--- kernel/trace/trace_functions_graph.c | 21 ++++++++++++++++----- kernel/trace/trace_irqsoff.c | 18 +++++++----------- kernel/trace/trace_sched_wakeup.c | 16 +++++++--------- kernel/trace/trace_selftest.c | 8 ++++++-- 7 files changed, 58 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 98625f10d982..21c80491ccde 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -749,6 +749,11 @@ typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER +struct fgraph_ops { + trace_func_graph_ent_t entryfunc; + trace_func_graph_ret_t retfunc; +}; + /* * Stack of return addresses for functions * of a thread. @@ -792,8 +797,9 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 -extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, - trace_func_graph_ent_t entryfunc); + +extern int register_ftrace_graph(struct fgraph_ops *ops); +extern void unregister_ftrace_graph(struct fgraph_ops *ops); extern bool ftrace_graph_is_dead(void); extern void ftrace_graph_stop(void); @@ -802,8 +808,6 @@ extern void ftrace_graph_stop(void); extern trace_func_graph_ret_t ftrace_graph_return; extern trace_func_graph_ent_t ftrace_graph_entry; -extern void unregister_ftrace_graph(void); - extern void ftrace_graph_init_task(struct task_struct *t); extern void ftrace_graph_exit_task(struct task_struct *t); extern void ftrace_graph_init_idle_task(struct task_struct *t, int cpu); @@ -825,12 +829,9 @@ static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } static inline void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { } -static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc, - trace_func_graph_ent_t entryfunc) -{ - return -1; -} -static inline void unregister_ftrace_graph(void) { } +/* Define as macros as fgraph_ops may not be defined */ +#define register_ftrace_graph(ops) ({ -1; }) +#define unregister_ftrace_graph(ops) do { } while (0) static inline unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 374f3e42e29e..cc35606e9a3e 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -490,8 +490,7 @@ static int start_graph_tracing(void) return ret; } -int register_ftrace_graph(trace_func_graph_ret_t retfunc, - trace_func_graph_ent_t entryfunc) +int register_ftrace_graph(struct fgraph_ops *gops) { int ret = 0; @@ -512,7 +511,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, goto out; } - ftrace_graph_return = retfunc; + ftrace_graph_return = gops->retfunc; /* * Update the indirect function to the entryfunc, and the @@ -520,7 +519,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, * call the update fgraph entry function to determine if * the entryfunc should be called directly or not. */ - __ftrace_graph_entry = entryfunc; + __ftrace_graph_entry = gops->entryfunc; ftrace_graph_entry = ftrace_graph_entry_test; update_function_graph_func(); @@ -530,7 +529,7 @@ out: return ret; } -void unregister_ftrace_graph(void) +void unregister_ftrace_graph(struct fgraph_ops *gops) { mutex_lock(&ftrace_lock); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c53533b833cf..d06fe588e650 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -849,15 +849,19 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) local_irq_restore(flags); } +static struct fgraph_ops fprofiler_ops = { + .entryfunc = &profile_graph_entry, + .retfunc = &profile_graph_return, +}; + static int register_ftrace_profiler(void) { - return register_ftrace_graph(&profile_graph_return, - &profile_graph_entry); + return register_ftrace_graph(&fprofiler_ops); } static void unregister_ftrace_profiler(void) { - unregister_ftrace_graph(); + unregister_ftrace_graph(&fprofiler_ops); } #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 855c13c61e77..140b4b51ab34 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -345,17 +345,25 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) trace_graph_return(trace); } +static struct fgraph_ops funcgraph_thresh_ops = { + .entryfunc = &trace_graph_entry, + .retfunc = &trace_graph_thresh_return, +}; + +static struct fgraph_ops funcgraph_ops = { + .entryfunc = &trace_graph_entry, + .retfunc = &trace_graph_return, +}; + static int graph_trace_init(struct trace_array *tr) { int ret; set_graph_array(tr); if (tracing_thresh) - ret = register_ftrace_graph(&trace_graph_thresh_return, - &trace_graph_entry); + ret = register_ftrace_graph(&funcgraph_thresh_ops); else - ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); + ret = register_ftrace_graph(&funcgraph_ops); if (ret) return ret; tracing_start_cmdline_record(); @@ -366,7 +374,10 @@ static int graph_trace_init(struct trace_array *tr) static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); - unregister_ftrace_graph(); + if (tracing_thresh) + unregister_ftrace_graph(&funcgraph_thresh_ops); + else + unregister_ftrace_graph(&funcgraph_ops); } static int graph_trace_update_thresh(struct trace_array *tr) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 98ea6d28df15..d3294721f119 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -218,6 +218,11 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) atomic_dec(&data->disabled); } +static struct fgraph_ops fgraph_ops = { + .entryfunc = &irqsoff_graph_entry, + .retfunc = &irqsoff_graph_return, +}; + static void irqsoff_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) @@ -272,13 +277,6 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -#ifdef CONFIG_FUNCTION_TRACER -static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) -{ - return -1; -} -#endif - static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; @@ -288,7 +286,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { } static void irqsoff_trace_close(struct trace_iterator *iter) { } #ifdef CONFIG_FUNCTION_TRACER -static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } static void irqsoff_print_header(struct seq_file *s) { trace_default_header(s); @@ -468,8 +465,7 @@ static int register_irqsoff_function(struct trace_array *tr, int graph, int set) return 0; if (graph) - ret = register_ftrace_graph(&irqsoff_graph_return, - &irqsoff_graph_entry); + ret = register_ftrace_graph(&fgraph_ops); else ret = register_ftrace_function(tr->ops); @@ -485,7 +481,7 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph) return; if (graph) - unregister_ftrace_graph(); + unregister_ftrace_graph(&fgraph_ops); else unregister_ftrace_function(tr->ops); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 2ce78100b4d3..4ea7e6845efb 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -162,6 +162,11 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) return; } +static struct fgraph_ops fgraph_wakeup_ops = { + .entryfunc = &wakeup_graph_entry, + .retfunc = &wakeup_graph_return, +}; + static void wakeup_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) @@ -197,12 +202,6 @@ static void wakeup_print_header(struct seq_file *s) else trace_default_header(s); } -#else /* CONFIG_FUNCTION_GRAPH_TRACER */ -static int wakeup_graph_entry(struct ftrace_graph_ent *trace) -{ - return -1; -} -static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } #endif /* else CONFIG_FUNCTION_GRAPH_TRACER */ /* @@ -237,8 +236,7 @@ static int register_wakeup_function(struct trace_array *tr, int graph, int set) return 0; if (graph) - ret = register_ftrace_graph(&wakeup_graph_return, - &wakeup_graph_entry); + ret = register_ftrace_graph(&fgraph_wakeup_ops); else ret = register_ftrace_function(tr->ops); @@ -254,7 +252,7 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph) return; if (graph) - unregister_ftrace_graph(); + unregister_ftrace_graph(&fgraph_wakeup_ops); else unregister_ftrace_function(tr->ops); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 11e9daa4a568..9d402e7fc949 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -741,6 +741,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) return trace_graph_entry(trace); } +static struct fgraph_ops fgraph_ops __initdata = { + .entryfunc = &trace_graph_entry_watchdog, + .retfunc = &trace_graph_return, +}; + /* * Pretty much the same than for the function tracer from which the selftest * has been borrowed. @@ -765,8 +770,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, */ tracing_reset_online_cpus(&tr->trace_buffer); set_graph_array(tr); - ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry_watchdog); + ret = register_ftrace_graph(&fgraph_ops); if (ret) { warn_failed_init_tracer(trace, ret); goto out; -- cgit From 76b42b63ed0d004961097d3a3cd979129d4afd26 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 18 Nov 2018 18:36:19 -0500 Subject: function_graph: Move ftrace_graph_ret_addr() to fgraph.c Move the function function_graph_ret_addr() to fgraph.c, as the management of the curr_ret_stack is going to change, and all the accesses to ret_stack needs to be done in fgraph.c. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 55 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_functions_graph.c | 55 ------------------------------------ 2 files changed, 55 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index cc35606e9a3e..90fcefcaff2a 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -232,6 +232,61 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/** + * ftrace_graph_ret_addr - convert a potentially modified stack return address + * to its original value + * + * This function can be called by stack unwinding code to convert a found stack + * return address ('ret') to its original value, in case the function graph + * tracer has modified it to be 'return_to_handler'. If the address hasn't + * been modified, the unchanged value of 'ret' is returned. + * + * 'idx' is a state variable which should be initialized by the caller to zero + * before the first call. + * + * 'retp' is a pointer to the return address on the stack. It's ignored if + * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. + */ +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int index = task->curr_ret_stack; + int i; + + if (ret != (unsigned long)return_to_handler) + return ret; + + if (index < 0) + return ret; + + for (i = 0; i <= index; i++) + if (task->ret_stack[i].retp == retp) + return task->ret_stack[i].ret; + + return ret; +} +#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int task_idx; + + if (ret != (unsigned long)return_to_handler) + return ret; + + task_idx = task->curr_ret_stack; + + if (!task->ret_stack || task_idx < *idx) + return ret; + + task_idx -= *idx; + (*idx)++; + + return task->ret_stack[task_idx].ret; +} +#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ + static struct ftrace_ops graph_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 140b4b51ab34..c2af1560e856 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -94,61 +94,6 @@ static void print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); -/** - * ftrace_graph_ret_addr - convert a potentially modified stack return address - * to its original value - * - * This function can be called by stack unwinding code to convert a found stack - * return address ('ret') to its original value, in case the function graph - * tracer has modified it to be 'return_to_handler'. If the address hasn't - * been modified, the unchanged value of 'ret' is returned. - * - * 'idx' is a state variable which should be initialized by the caller to zero - * before the first call. - * - * 'retp' is a pointer to the return address on the stack. It's ignored if - * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. - */ -#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR -unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, - unsigned long ret, unsigned long *retp) -{ - int index = task->curr_ret_stack; - int i; - - if (ret != (unsigned long)return_to_handler) - return ret; - - if (index < 0) - return ret; - - for (i = 0; i <= index; i++) - if (task->ret_stack[i].retp == retp) - return task->ret_stack[i].ret; - - return ret; -} -#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ -unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, - unsigned long ret, unsigned long *retp) -{ - int task_idx; - - if (ret != (unsigned long)return_to_handler) - return ret; - - task_idx = task->curr_ret_stack; - - if (!task->ret_stack || task_idx < *idx) - return ret; - - task_idx -= *idx; - (*idx)++; - - return task->ret_stack[task_idx].ret; -} -#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ - int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, -- cgit From b0e21a61d3196762b61f43ae994ffd255f646774 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 19 Nov 2018 20:54:08 -0500 Subject: function_graph: Have profiler use new helper ftrace_graph_get_ret_stack() The ret_stack processing is going to change, and that is going to break anything that is accessing the ret_stack directly. One user is the function graph profiler. By using the ftrace_graph_get_ret_stack() helper function, the profiler can access the ret_stack entry without relying on the implementation details of the stack itself. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 3 +++ kernel/trace/fgraph.c | 11 +++++++++++ kernel/trace/ftrace.c | 21 +++++++++++---------- 3 files changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 21c80491ccde..98e141c71ad0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -785,6 +785,9 @@ extern int function_graph_enter(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp); +struct ftrace_ret_stack * +ftrace_graph_get_ret_stack(struct task_struct *task, int idx); + unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, unsigned long *retp); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 90fcefcaff2a..a3704ec8b599 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -232,6 +232,17 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +struct ftrace_ret_stack * +ftrace_graph_get_ret_stack(struct task_struct *task, int idx) +{ + idx = current->curr_ret_stack - idx; + + if (idx >= 0 && idx <= task->curr_ret_stack) + return ¤t->ret_stack[idx]; + + return NULL; +} + /** * ftrace_graph_ret_addr - convert a potentially modified stack return address * to its original value diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d06fe588e650..8ef9fc226037 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -792,7 +792,7 @@ void ftrace_graph_graph_time_control(bool enable) static int profile_graph_entry(struct ftrace_graph_ent *trace) { - int index = current->curr_ret_stack; + struct ftrace_ret_stack *ret_stack; function_profile_call(trace->func, 0, NULL, NULL); @@ -800,14 +800,16 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) if (!current->ret_stack) return 0; - if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) - current->ret_stack[index].subtime = 0; + ret_stack = ftrace_graph_get_ret_stack(current, 0); + if (ret_stack) + ret_stack->subtime = 0; return 1; } static void profile_graph_return(struct ftrace_graph_ret *trace) { + struct ftrace_ret_stack *ret_stack; struct ftrace_profile_stat *stat; unsigned long long calltime; struct ftrace_profile *rec; @@ -825,16 +827,15 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) calltime = trace->rettime - trace->calltime; if (!fgraph_graph_time) { - int index; - - index = current->curr_ret_stack; /* Append this call time to the parent time to subtract */ - if (index) - current->ret_stack[index - 1].subtime += calltime; + ret_stack = ftrace_graph_get_ret_stack(current, 1); + if (ret_stack) + ret_stack->subtime += calltime; - if (current->ret_stack[index].subtime < calltime) - calltime -= current->ret_stack[index].subtime; + ret_stack = ftrace_graph_get_ret_stack(current, 0); + if (ret_stack && ret_stack->subtime < calltime) + calltime -= ret_stack->subtime; else calltime = 0; } -- cgit From ca16b0fbb05242f18da9d810c07d3882ffed831c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Jun 2018 14:08:00 +0300 Subject: tracing: Have trace_stack nr_entries compare not be so subtle Dan Carpenter reviewed the trace_stack.c code and figured he found an off by one bug. "From reviewing the code, it seems possible for stack_trace_max.nr_entries to be set to .max_entries and in that case we would be reading one element beyond the end of the stack_dump_trace[] array. If it's not set to .max_entries then the bug doesn't affect runtime." Although it looks to be the case, it is not. Because we have: static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; struct stack_trace stack_trace_max = { .max_entries = STACK_TRACE_ENTRIES - 1, .entries = &stack_dump_trace[0], }; And: stack_trace_max.nr_entries = x; for (; x < i; x++) stack_dump_trace[x] = ULONG_MAX; Even if nr_entries equals max_entries, indexing with it into the stack_dump_trace[] array will not overflow the array. But if it is the case, the second part of the conditional that tests stack_dump_trace[nr_entries] to ULONG_MAX will always be true. By applying Dan's patch, it removes the subtle aspect of it and makes the if conditional slightly more efficient. Link: http://lkml.kernel.org/r/20180620110758.crunhd5bfep7zuiz@kili.mountain Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2b0d1ee3241c..e2a153fc1afc 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -286,7 +286,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n > stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; m->private = (void *)n; -- cgit From 2c2b0a78b373908926e4683ea5571332f63c0eb5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Nov 2018 20:32:26 -0500 Subject: ring-buffer: Add percentage of ring buffer full to wake up reader Instead of just waiting for a page to be full before waking up a pending reader, allow the reader to pass in a "percentage" of pages that have content before waking up a reader. This should help keep the process of reading the events not cause wake ups that constantly cause reading of the buffer. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 4 ++- kernel/trace/ring_buffer.c | 71 +++++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace.c | 8 ++--- 3 files changed, 73 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 0940fda59872..5b9ae62272bb 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -97,7 +97,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k __ring_buffer_alloc((size), (flags), &__key); \ }) -int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full); +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full); __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table); @@ -189,6 +189,8 @@ bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer); size_t ring_buffer_page_len(void *page); +size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu); +size_t ring_buffer_nr_dirty_pages(struct ring_buffer *buffer, int cpu); void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu); void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 65bd4616220d..9edb628603ab 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -487,6 +487,9 @@ struct ring_buffer_per_cpu { local_t dropped_events; local_t committing; local_t commits; + local_t pages_touched; + local_t pages_read; + size_t shortest_full; unsigned long read; unsigned long read_bytes; u64 write_stamp; @@ -529,6 +532,41 @@ struct ring_buffer_iter { u64 read_stamp; }; +/** + * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer + * @buffer: The ring_buffer to get the number of pages from + * @cpu: The cpu of the ring_buffer to get the number of pages from + * + * Returns the number of pages used by a per_cpu buffer of the ring buffer. + */ +size_t ring_buffer_nr_pages(struct ring_buffer *buffer, int cpu) +{ + return buffer->buffers[cpu]->nr_pages; +} + +/** + * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer + * @buffer: The ring_buffer to get the number of pages from + * @cpu: The cpu of the ring_buffer to get the number of pages from + * + * Returns the number of pages that have content in the ring buffer. + */ +size_t ring_buffer_nr_dirty_pages(struct ring_buffer *buffer, int cpu) +{ + size_t read; + size_t cnt; + + read = local_read(&buffer->buffers[cpu]->pages_read); + cnt = local_read(&buffer->buffers[cpu]->pages_touched); + /* The reader can read an empty page, but not more than that */ + if (cnt < read) { + WARN_ON_ONCE(read > cnt + 1); + return 0; + } + + return cnt - read; +} + /* * rb_wake_up_waiters - wake up tasks waiting for ring buffer input * @@ -556,7 +594,7 @@ static void rb_wake_up_waiters(struct irq_work *work) * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ -int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); DEFINE_WAIT(wait); @@ -571,7 +609,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) if (cpu == RING_BUFFER_ALL_CPUS) { work = &buffer->irq_work; /* Full only makes sense on per cpu reads */ - full = false; + full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -ENODEV; @@ -623,15 +661,22 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) !ring_buffer_empty_cpu(buffer, cpu)) { unsigned long flags; bool pagebusy; + size_t nr_pages; + size_t dirty; if (!full) break; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + nr_pages = cpu_buffer->nr_pages; + dirty = ring_buffer_nr_dirty_pages(buffer, cpu); + if (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full < full) + cpu_buffer->shortest_full = full; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - if (!pagebusy) + if (!pagebusy && + (!nr_pages || (dirty * 100) > full * nr_pages)) break; } @@ -1054,6 +1099,7 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); + local_inc(&cpu_buffer->pages_touched); /* * Just make sure we have seen our old_write and synchronize * with any interrupts that come in. @@ -2603,6 +2649,16 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + size_t nr_pages; + size_t dirty; + size_t full; + + full = cpu_buffer->shortest_full; + nr_pages = cpu_buffer->nr_pages; + dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu); + if (full && nr_pages && (dirty * 100) <= full * nr_pages) + return; + cpu_buffer->irq_work.wakeup_full = true; cpu_buffer->irq_work.full_waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ @@ -3732,13 +3788,15 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto spin; /* - * Yeah! We succeeded in replacing the page. + * Yay! We succeeded in replacing the page. * * Now make the new head point back to the reader page. */ rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + local_inc(&cpu_buffer->pages_read); + /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; cpu_buffer->reader_page->read = 0; @@ -4334,6 +4392,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); + local_set(&cpu_buffer->pages_touched, 0); + local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->shortest_full = 0; cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ff1c4b20cd0a..48d5eb22ff33 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1431,7 +1431,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static int wait_on_pipe(struct trace_iterator *iter, bool full) +static int wait_on_pipe(struct trace_iterator *iter, int full) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) @@ -5693,7 +5693,7 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - ret = wait_on_pipe(iter, false); + ret = wait_on_pipe(iter, 0); mutex_lock(&iter->mutex); @@ -6751,7 +6751,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; - ret = wait_on_pipe(iter, false); + ret = wait_on_pipe(iter, 0); if (ret) return ret; @@ -6948,7 +6948,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; - ret = wait_on_pipe(iter, true); + ret = wait_on_pipe(iter, 1); if (ret) goto out; -- cgit From 03329f9939781041424edd29487b9603a704fcd9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Nov 2018 21:38:42 -0500 Subject: tracing: Add tracefs file buffer_percentage Add a "buffer_percentage" file, that allows users to specify how much of the buffer (percentage of pages) need to be filled before waking up a task blocked on a per cpu trace_pipe_raw file. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 39 +++++++++++++++++++-------------- kernel/trace/trace.c | 54 +++++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 1 + 3 files changed, 77 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9edb628603ab..5434c16f2192 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -489,6 +489,7 @@ struct ring_buffer_per_cpu { local_t commits; local_t pages_touched; local_t pages_read; + long last_pages_touch; size_t shortest_full; unsigned long read; unsigned long read_bytes; @@ -2632,7 +2633,9 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, static __always_inline void rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { - bool pagebusy; + size_t nr_pages; + size_t dirty; + size_t full; if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; @@ -2646,24 +2649,27 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) irq_work_queue(&cpu_buffer->irq_work.work); } - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched)) + return; - if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { - size_t nr_pages; - size_t dirty; - size_t full; + if (cpu_buffer->reader_page == cpu_buffer->commit_page) + return; - full = cpu_buffer->shortest_full; - nr_pages = cpu_buffer->nr_pages; - dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu); - if (full && nr_pages && (dirty * 100) <= full * nr_pages) - return; + if (!cpu_buffer->irq_work.full_waiters_pending) + return; - cpu_buffer->irq_work.wakeup_full = true; - cpu_buffer->irq_work.full_waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } + cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched); + + full = cpu_buffer->shortest_full; + nr_pages = cpu_buffer->nr_pages; + dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu); + if (full && nr_pages && (dirty * 100) <= full * nr_pages) + return; + + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); } /* @@ -4394,6 +4400,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commits, 0); local_set(&cpu_buffer->pages_touched, 0); local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->last_pages_touch = 0; cpu_buffer->shortest_full = 0; cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 48d5eb22ff33..d382fd1aa4a6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6948,7 +6948,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; - ret = wait_on_pipe(iter, 1); + ret = wait_on_pipe(iter, iter->tr->buffer_percent); if (ret) goto out; @@ -7662,6 +7662,53 @@ static const struct file_operations rb_simple_fops = { .llseek = default_llseek, }; +static ssize_t +buffer_percent_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = tr->buffer_percent; + r = sprintf(buf, "%d\n", r); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +buffer_percent_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val > 100) + return -EINVAL; + + if (!val) + val = 1; + + tr->buffer_percent = val; + + (*ppos)++; + + return cnt; +} + +static const struct file_operations buffer_percent_fops = { + .open = tracing_open_generic_tr, + .read = buffer_percent_read, + .write = buffer_percent_write, + .release = tracing_release_generic_tr, + .llseek = default_llseek, +}; + struct dentry *trace_instance_dir; static void @@ -7970,6 +8017,11 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("timestamp_mode", 0444, d_tracer, tr, &trace_time_stamp_mode_fops); + tr->buffer_percent = 1; + + trace_create_file("buffer_percent", 0444, d_tracer, + tr, &buffer_percent_fops); + create_trace_options_dir(tr); #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ab16eca76e59..08900828d282 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -247,6 +247,7 @@ struct trace_array { int clock_id; int nr_topts; bool clear_trace; + int buffer_percent; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; -- cgit From a7b1d74e872a4299e1aef66a648316c9c23e5ab4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Nov 2018 22:36:47 -0500 Subject: tracing: Change default buffer_percent to 50 After running several tests, it appears that having the reader wait till half the buffer is full before starting to read (and causing its own events to fill up the ring buffer constantly), works well. It keeps trace-cmd (the main user of this interface) from dominating the traces it records. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d382fd1aa4a6..194c01838e3f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8017,7 +8017,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("timestamp_mode", 0444, d_tracer, tr, &trace_time_stamp_mode_fops); - tr->buffer_percent = 1; + tr->buffer_percent = 50; trace_create_file("buffer_percent", 0444, d_tracer, tr, &buffer_percent_fops); -- cgit From 547cd9eacd1c699c8d1fa77c95c6cdb33b2eba6a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:00:15 +0900 Subject: tracing/uprobes: Add busy check when cleanup all uprobes Add a busy check loop in cleanup_all_probes() before trying to remove all events in uprobe_events, the same way that kprobe_events does. Without this change, writing null to uprobe_events will try to remove events but if one of them is enabled, it will stop there leaving some events cleared and others not clceared. With this change, writing null to uprobe_events makes sure all events are not enabled before removing events. So, it clears all events, or returns an error (-EBUSY) with keeping all events. Link: http://lkml.kernel.org/r/154140841557.17322.12653952888762532401.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 31ea48eceda1..b708e4ff7ea7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -587,12 +587,19 @@ static int cleanup_all_probes(void) int ret = 0; mutex_lock(&uprobe_lock); + /* Ensure no probe is in use. */ + list_for_each_entry(tu, &uprobe_list, list) + if (trace_probe_is_enabled(&tu->tp)) { + ret = -EBUSY; + goto end; + } while (!list_empty(&uprobe_list)) { tu = list_entry(uprobe_list.next, struct trace_uprobe, list); ret = unregister_trace_uprobe(tu); if (ret) break; } +end: mutex_unlock(&uprobe_lock); return ret; } -- cgit From fc800a10be26017f8f338bc8e500d48e3e6429d9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:00:43 +0900 Subject: tracing: Lock event_mutex before synth_event_mutex synthetic event is using synth_event_mutex for protecting synth_event_list, and event_trigger_write() path acquires locks as below order. event_trigger_write(event_mutex) ->trigger_process_regex(trigger_cmd_mutex) ->event_hist_trigger_func(synth_event_mutex) On the other hand, synthetic event creation and deletion paths call trace_add_event_call() and trace_remove_event_call() which acquires event_mutex. In that case, if we keep the synth_event_mutex locked while registering/unregistering synthetic events, its dependency will be inversed. To avoid this issue, current synthetic event is using a 2 phase process to create/delete events. For example, it searches existing events under synth_event_mutex to check for event-name conflicts, and unlocks synth_event_mutex, then registers a new event under event_mutex locked. Finally, it locks synth_event_mutex and tries to add the new event to the list. But it can introduce complexity and a chance for name conflicts. To solve this simpler, this introduces trace_add_event_call_nolock() and trace_remove_event_call_nolock() which don't acquire event_mutex inside. synthetic event can lock event_mutex before synth_event_mutex to solve the lock dependency issue simpler. Link: http://lkml.kernel.org/r/154140844377.17322.13781091165954002713.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 2 ++ kernel/trace/trace_events.c | 34 ++++++++++++++++++++++++++++------ kernel/trace/trace_events_hist.c | 24 ++++++++++-------------- 3 files changed, 40 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 4130a5497d40..3aa05593a53f 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -529,6 +529,8 @@ extern int trace_event_raw_init(struct trace_event_call *call); extern int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type); +extern int trace_add_event_call_nolock(struct trace_event_call *call); +extern int trace_remove_event_call_nolock(struct trace_event_call *call); extern int trace_add_event_call(struct trace_event_call *call); extern int trace_remove_event_call(struct trace_event_call *call); extern int trace_event_get_offsets(struct trace_event_call *call); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f94be0c2827b..a3b157f689ee 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2305,11 +2305,11 @@ __trace_early_add_new_event(struct trace_event_call *call, struct ftrace_module_file_ops; static void __add_event_to_tracers(struct trace_event_call *call); -/* Add an additional event_call dynamically */ -int trace_add_event_call(struct trace_event_call *call) +int trace_add_event_call_nolock(struct trace_event_call *call) { int ret; - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); + mutex_lock(&trace_types_lock); ret = __register_event(call, NULL); @@ -2317,6 +2317,16 @@ int trace_add_event_call(struct trace_event_call *call) __add_event_to_tracers(call); mutex_unlock(&trace_types_lock); + return ret; +} + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct trace_event_call *call) +{ + int ret; + + mutex_lock(&event_mutex); + ret = trace_add_event_call_nolock(call); mutex_unlock(&event_mutex); return ret; } @@ -2366,17 +2376,29 @@ static int probe_remove_event_call(struct trace_event_call *call) return 0; } -/* Remove an event_call */ -int trace_remove_event_call(struct trace_event_call *call) +/* no event_mutex version */ +int trace_remove_event_call_nolock(struct trace_event_call *call) { int ret; - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); + mutex_lock(&trace_types_lock); down_write(&trace_event_sem); ret = probe_remove_event_call(call); up_write(&trace_event_sem); mutex_unlock(&trace_types_lock); + + return ret; +} + +/* Remove an event_call */ +int trace_remove_event_call(struct trace_event_call *call) +{ + int ret; + + mutex_lock(&event_mutex); + ret = trace_remove_event_call_nolock(call); mutex_unlock(&event_mutex); return ret; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index eb908ef2ecec..1670c65389fe 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -912,7 +912,7 @@ static int register_synth_event(struct synth_event *event) call->data = event; call->tp = event->tp; - ret = trace_add_event_call(call); + ret = trace_add_event_call_nolock(call); if (ret) { pr_warn("Failed to register synthetic event: %s\n", trace_event_name(call)); @@ -936,7 +936,7 @@ static int unregister_synth_event(struct synth_event *event) struct trace_event_call *call = &event->call; int ret; - ret = trace_remove_event_call(call); + ret = trace_remove_event_call_nolock(call); return ret; } @@ -1013,12 +1013,10 @@ static void add_or_delete_synth_event(struct synth_event *event, int delete) if (delete) free_synth_event(event); else { - mutex_lock(&synth_event_mutex); if (!find_synth_event(event->name)) list_add(&event->list, &synth_event_list); else free_synth_event(event); - mutex_unlock(&synth_event_mutex); } } @@ -1030,6 +1028,7 @@ static int create_synth_event(int argc, char **argv) int i, consumed = 0, n_fields = 0, ret = 0; char *name; + mutex_lock(&event_mutex); mutex_lock(&synth_event_mutex); /* @@ -1102,8 +1101,6 @@ static int create_synth_event(int argc, char **argv) goto err; } out: - mutex_unlock(&synth_event_mutex); - if (event) { if (delete_event) { ret = unregister_synth_event(event); @@ -1113,10 +1110,13 @@ static int create_synth_event(int argc, char **argv) add_or_delete_synth_event(event, ret); } } + mutex_unlock(&synth_event_mutex); + mutex_unlock(&event_mutex); return ret; err: mutex_unlock(&synth_event_mutex); + mutex_unlock(&event_mutex); for (i = 0; i < n_fields; i++) free_synth_field(fields[i]); @@ -1127,12 +1127,10 @@ static int create_synth_event(int argc, char **argv) static int release_all_synth_events(void) { - struct list_head release_events; struct synth_event *event, *e; int ret = 0; - INIT_LIST_HEAD(&release_events); - + mutex_lock(&event_mutex); mutex_lock(&synth_event_mutex); list_for_each_entry(event, &synth_event_list, list) { @@ -1142,16 +1140,14 @@ static int release_all_synth_events(void) } } - list_splice_init(&event->list, &release_events); - - mutex_unlock(&synth_event_mutex); - - list_for_each_entry_safe(event, e, &release_events, list) { + list_for_each_entry_safe(event, e, &synth_event_list, list) { list_del(&event->list); ret = unregister_synth_event(event); add_or_delete_synth_event(event, !ret); } + mutex_unlock(&synth_event_mutex); + mutex_unlock(&event_mutex); return ret; } -- cgit From faacb361f271be4baf2d807e2eeaba87e059225f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:01:12 +0900 Subject: tracing: Simplify creation and deletion of synthetic events Since the event_mutex and synth_event_mutex ordering issue is gone, we can skip existing event check when adding or deleting events, and some redundant code in error path. This changes release_all_synth_events() to abort the process when it hits any error and returns the error code. It succeeds only if it has no error. Link: http://lkml.kernel.org/r/154140847194.17322.17960275728005067803.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 53 ++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1670c65389fe..0feb7f460123 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1008,18 +1008,6 @@ struct hist_var_data { struct hist_trigger_data *hist_data; }; -static void add_or_delete_synth_event(struct synth_event *event, int delete) -{ - if (delete) - free_synth_event(event); - else { - if (!find_synth_event(event->name)) - list_add(&event->list, &synth_event_list); - else - free_synth_event(event); - } -} - static int create_synth_event(int argc, char **argv) { struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; @@ -1052,15 +1040,16 @@ static int create_synth_event(int argc, char **argv) if (event) { if (delete_event) { if (event->ref) { - event = NULL; ret = -EBUSY; goto out; } - list_del(&event->list); - goto out; - } - event = NULL; - ret = -EEXIST; + ret = unregister_synth_event(event); + if (!ret) { + list_del(&event->list); + free_synth_event(event); + } + } else + ret = -EEXIST; goto out; } else if (delete_event) { ret = -ENOENT; @@ -1100,29 +1089,21 @@ static int create_synth_event(int argc, char **argv) event = NULL; goto err; } + ret = register_synth_event(event); + if (!ret) + list_add(&event->list, &synth_event_list); + else + free_synth_event(event); out: - if (event) { - if (delete_event) { - ret = unregister_synth_event(event); - add_or_delete_synth_event(event, !ret); - } else { - ret = register_synth_event(event); - add_or_delete_synth_event(event, ret); - } - } mutex_unlock(&synth_event_mutex); mutex_unlock(&event_mutex); return ret; err: - mutex_unlock(&synth_event_mutex); - mutex_unlock(&event_mutex); - for (i = 0; i < n_fields; i++) free_synth_field(fields[i]); - free_synth_event(event); - return ret; + goto out; } static int release_all_synth_events(void) @@ -1141,10 +1122,12 @@ static int release_all_synth_events(void) } list_for_each_entry_safe(event, e, &synth_event_list, list) { - list_del(&event->list); - ret = unregister_synth_event(event); - add_or_delete_synth_event(event, !ret); + if (!ret) { + list_del(&event->list); + free_synth_event(event); + } else + break; } mutex_unlock(&synth_event_mutex); mutex_unlock(&event_mutex); -- cgit From d00bbea9456f35fb34310d454e561f05d68d07fe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:01:40 +0900 Subject: tracing: Integrate similar probe argument parsers Integrate similar argument parsers for kprobes and uprobes events into traceprobe_parse_probe_arg(). Link: http://lkml.kernel.org/r/154140850016.17322.9836787731210512176.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 48 ++------------------------------------------- kernel/trace/trace_probe.c | 47 +++++++++++++++++++++++++++++++++++++++++--- kernel/trace/trace_probe.h | 7 ++----- kernel/trace/trace_uprobe.c | 44 ++--------------------------------------- 4 files changed, 50 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index fec67188c4d2..d313bcc259dc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -548,7 +548,6 @@ static int create_trace_kprobe(int argc, char **argv) bool is_return = false, is_delete = false; char *symbol = NULL, *event = NULL, *group = NULL; int maxactive = 0; - char *arg; long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -676,53 +675,10 @@ static int create_trace_kprobe(int argc, char **argv) } /* parse arguments */ - ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - struct probe_arg *parg = &tk->tp.args[i]; - - /* Increment count for freeing args in error case */ - tk->tp.nr_args++; - - /* Parse argument name */ - arg = strchr(argv[i], '='); - if (arg) { - *arg++ = '\0'; - parg->name = kstrdup(argv[i], GFP_KERNEL); - } else { - arg = argv[i]; - /* If argument name is omitted, set "argN" */ - snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); - parg->name = kstrdup(buf, GFP_KERNEL); - } - - if (!parg->name) { - pr_info("Failed to allocate argument[%d] name.\n", i); - ret = -ENOMEM; - goto error; - } - - if (!is_good_name(parg->name)) { - pr_info("Invalid argument[%d] name: %s\n", - i, parg->name); - ret = -EINVAL; - goto error; - } - - if (traceprobe_conflict_field_name(parg->name, - tk->tp.args, i)) { - pr_info("Argument[%d] name '%s' conflicts with " - "another field.\n", i, argv[i]); - ret = -EINVAL; - goto error; - } - - /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, - flags); - if (ret) { - pr_info("Parse error at argument[%d]. (%d)\n", i, ret); + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags); + if (ret) goto error; - } } ret = register_trace_kprobe(tk); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index bd30e9398d2a..449150c6a87f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -348,7 +348,7 @@ static int __parse_bitfield_probe_arg(const char *bf, } /* String length checking wrapper */ -int traceprobe_parse_probe_arg(char *arg, ssize_t *size, +static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, struct probe_arg *parg, unsigned int flags) { struct fetch_insn *code, *scode, *tmp = NULL; @@ -491,8 +491,8 @@ fail: } /* Return 1 if name is reserved or already used by another argument */ -int traceprobe_conflict_field_name(const char *name, - struct probe_arg *args, int narg) +static int traceprobe_conflict_field_name(const char *name, + struct probe_arg *args, int narg) { int i; @@ -507,6 +507,47 @@ int traceprobe_conflict_field_name(const char *name, return 0; } +int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, + unsigned int flags) +{ + struct probe_arg *parg = &tp->args[i]; + char *body; + int ret; + + /* Increment count for freeing args in error case */ + tp->nr_args++; + + body = strchr(arg, '='); + if (body) { + parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); + body++; + } else { + /* If argument name is omitted, set "argN" */ + parg->name = kasprintf(GFP_KERNEL, "arg%d", i + 1); + body = arg; + } + if (!parg->name) + return -ENOMEM; + + if (!is_good_name(parg->name)) { + pr_info("Invalid argument[%d] name: %s\n", + i, parg->name); + return -EINVAL; + } + + if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { + pr_info("Argument[%d]: '%s' conflicts with another field.\n", + i, parg->name); + return -EINVAL; + } + + /* Parse fetch argument */ + ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags); + if (ret) + pr_info("Parse error at argument[%d]. (%d)\n", i, ret); + return ret; +} + void traceprobe_free_probe_arg(struct probe_arg *arg) { struct fetch_insn *code = arg->code; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 974afc1a3e73..feeec261b356 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -272,11 +272,8 @@ find_event_file_link(struct trace_probe *tp, struct trace_event_file *file) #define TPARG_FL_FENTRY BIT(2) #define TPARG_FL_MASK GENMASK(2, 0) -extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, unsigned int flags); - -extern int traceprobe_conflict_field_name(const char *name, - struct probe_arg *args, int narg); +extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, + char *arg, unsigned int flags); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b708e4ff7ea7..6eaaa2150685 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -517,51 +517,11 @@ static int create_trace_uprobe(int argc, char **argv) } /* parse arguments */ - ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - struct probe_arg *parg = &tu->tp.args[i]; - - /* Increment count for freeing args in error case */ - tu->tp.nr_args++; - - /* Parse argument name */ - arg = strchr(argv[i], '='); - if (arg) { - *arg++ = '\0'; - parg->name = kstrdup(argv[i], GFP_KERNEL); - } else { - arg = argv[i]; - /* If argument name is omitted, set "argN" */ - snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); - parg->name = kstrdup(buf, GFP_KERNEL); - } - - if (!parg->name) { - pr_info("Failed to allocate argument[%d] name.\n", i); - ret = -ENOMEM; - goto error; - } - - if (!is_good_name(parg->name)) { - pr_info("Invalid argument[%d] name: %s\n", i, parg->name); - ret = -EINVAL; - goto error; - } - - if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) { - pr_info("Argument[%d] name '%s' conflicts with " - "another field.\n", i, argv[i]); - ret = -EINVAL; - goto error; - } - - /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], is_return ? TPARG_FL_RETURN : 0); - if (ret) { - pr_info("Parse error at argument[%d]. (%d)\n", i, ret); + if (ret) goto error; - } } ret = register_trace_uprobe(tu); -- cgit From 5448d44c38557fc15d1c53b608a9c9f0e1ca8f86 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:02:08 +0900 Subject: tracing: Add unified dynamic event framework Add unified dynamic event framework for ftrace kprobes, uprobes and synthetic events. Those dynamic events can be co-exist on same file because those syntax doesn't overlap. This introduces a framework part which provides a unified tracefs interface and operations. Link: http://lkml.kernel.org/r/154140852824.17322.12250362185969352095.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 3 + kernel/trace/Makefile | 1 + kernel/trace/trace.c | 4 + kernel/trace/trace_dynevent.c | 210 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_dynevent.h | 119 ++++++++++++++++++++++++ 5 files changed, 337 insertions(+) create mode 100644 kernel/trace/trace_dynevent.c create mode 100644 kernel/trace/trace_dynevent.h (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5e3de28c7677..bf2e8a5a91f1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -518,6 +518,9 @@ config BPF_EVENTS help This allows the user to attach BPF programs to kprobe events. +config DYNAMIC_EVENTS + def_bool n + config PROBE_EVENTS def_bool n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c7ade7965464..c2b2148bb1d2 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -79,6 +79,7 @@ endif ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif +obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 194c01838e3f..7e0332f90ed4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4604,6 +4604,10 @@ static const char readme_msg[] = "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ +#ifdef CONFIG_DYNAMIC_EVENTS + " dynamic_events\t\t- Add/remove/show the generic dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif #ifdef CONFIG_KPROBE_EVENTS " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c new file mode 100644 index 000000000000..f17a887abb66 --- /dev/null +++ b/kernel/trace/trace_dynevent.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generic dynamic event control interface + * + * Copyright (C) 2018 Masami Hiramatsu + */ + +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_dynevent.h" + +static DEFINE_MUTEX(dyn_event_ops_mutex); +static LIST_HEAD(dyn_event_ops_list); + +int dyn_event_register(struct dyn_event_operations *ops) +{ + if (!ops || !ops->create || !ops->show || !ops->is_busy || + !ops->free || !ops->match) + return -EINVAL; + + INIT_LIST_HEAD(&ops->list); + mutex_lock(&dyn_event_ops_mutex); + list_add_tail(&ops->list, &dyn_event_ops_list); + mutex_unlock(&dyn_event_ops_mutex); + return 0; +} + +int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) +{ + struct dyn_event *pos, *n; + char *system = NULL, *event, *p; + int ret = -ENOENT; + + if (argv[0][1] != ':') + return -EINVAL; + + event = &argv[0][2]; + p = strchr(event, '/'); + if (p) { + system = event; + event = p + 1; + *p = '\0'; + } + if (event[0] == '\0') + return -EINVAL; + + mutex_lock(&event_mutex); + for_each_dyn_event_safe(pos, n) { + if (type && type != pos->ops) + continue; + if (pos->ops->match(system, event, pos)) { + ret = pos->ops->free(pos); + break; + } + } + mutex_unlock(&event_mutex); + + return ret; +} + +static int create_dyn_event(int argc, char **argv) +{ + struct dyn_event_operations *ops; + int ret; + + if (argv[0][0] == '-') + return dyn_event_release(argc, argv, NULL); + + mutex_lock(&dyn_event_ops_mutex); + list_for_each_entry(ops, &dyn_event_ops_list, list) { + ret = ops->create(argc, (const char **)argv); + if (!ret || ret != -ECANCELED) + break; + } + mutex_unlock(&dyn_event_ops_mutex); + if (ret == -ECANCELED) + ret = -EINVAL; + + return ret; +} + +/* Protected by event_mutex */ +LIST_HEAD(dyn_event_list); + +void *dyn_event_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&event_mutex); + return seq_list_start(&dyn_event_list, *pos); +} + +void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &dyn_event_list, pos); +} + +void dyn_event_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&event_mutex); +} + +static int dyn_event_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (ev && ev->ops) + return ev->ops->show(m, ev); + + return 0; +} + +static const struct seq_operations dyn_event_seq_op = { + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = dyn_event_seq_show +}; + +/* + * dyn_events_release_all - Release all specific events + * @type: the dyn_event_operations * which filters releasing events + * + * This releases all events which ->ops matches @type. If @type is NULL, + * all events are released. + * Return -EBUSY if any of them are in use, and return other errors when + * it failed to free the given event. Except for -EBUSY, event releasing + * process will be aborted at that point and there may be some other + * releasable events on the list. + */ +int dyn_events_release_all(struct dyn_event_operations *type) +{ + struct dyn_event *ev, *tmp; + int ret = 0; + + mutex_lock(&event_mutex); + for_each_dyn_event(ev) { + if (type && ev->ops != type) + continue; + if (ev->ops->is_busy(ev)) { + ret = -EBUSY; + goto out; + } + } + for_each_dyn_event_safe(ev, tmp) { + if (type && ev->ops != type) + continue; + ret = ev->ops->free(ev); + if (ret) + break; + } +out: + mutex_unlock(&event_mutex); + + return ret; +} + +static int dyn_event_open(struct inode *inode, struct file *file) +{ + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = dyn_events_release_all(NULL); + if (ret < 0) + return ret; + } + + return seq_open(file, &dyn_event_seq_op); +} + +static ssize_t dyn_event_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_dyn_event); +} + +static const struct file_operations dynamic_events_ops = { + .owner = THIS_MODULE, + .open = dyn_event_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = dyn_event_write, +}; + +/* Make a tracefs interface for controlling dynamic events */ +static __init int init_dynamic_event(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return 0; + + entry = tracefs_create_file("dynamic_events", 0644, d_tracer, + NULL, &dynamic_events_ops); + + /* Event list interface */ + if (!entry) + pr_warn("Could not create tracefs 'dynamic_events' entry\n"); + + return 0; +} +fs_initcall(init_dynamic_event); diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h new file mode 100644 index 000000000000..8c334064e4d6 --- /dev/null +++ b/kernel/trace/trace_dynevent.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common header file for generic dynamic events. + */ + +#ifndef _TRACE_DYNEVENT_H +#define _TRACE_DYNEVENT_H + +#include +#include +#include +#include + +#include "trace.h" + +struct dyn_event; + +/** + * struct dyn_event_operations - Methods for each type of dynamic events + * + * These methods must be set for each type, since there is no default method. + * Before using this for dyn_event_init(), it must be registered by + * dyn_event_register(). + * + * @create: Parse and create event method. This is invoked when user passes + * a event definition to dynamic_events interface. This must not destruct + * the arguments and return -ECANCELED if given arguments doesn't match its + * command prefix. + * @show: Showing method. This is invoked when user reads the event definitions + * via dynamic_events interface. + * @is_busy: Check whether given event is busy so that it can not be deleted. + * Return true if it is busy, otherwides false. + * @free: Delete the given event. Return 0 if success, otherwides error. + * @match: Check whether given event and system name match this event. + * Return true if it matches, otherwides false. + * + * Except for @create, these methods are called under holding event_mutex. + */ +struct dyn_event_operations { + struct list_head list; + int (*create)(int argc, const char *argv[]); + int (*show)(struct seq_file *m, struct dyn_event *ev); + bool (*is_busy)(struct dyn_event *ev); + int (*free)(struct dyn_event *ev); + bool (*match)(const char *system, const char *event, + struct dyn_event *ev); +}; + +/* Register new dyn_event type -- must be called at first */ +int dyn_event_register(struct dyn_event_operations *ops); + +/** + * struct dyn_event - Dynamic event list header + * + * The dyn_event structure encapsulates a list and a pointer to the operators + * for making a global list of dynamic events. + * User must includes this in each event structure, so that those events can + * be added/removed via dynamic_events interface. + */ +struct dyn_event { + struct list_head list; + struct dyn_event_operations *ops; +}; + +extern struct list_head dyn_event_list; + +static inline +int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops) +{ + if (!ev || !ops) + return -EINVAL; + + INIT_LIST_HEAD(&ev->list); + ev->ops = ops; + return 0; +} + +static inline int dyn_event_add(struct dyn_event *ev) +{ + lockdep_assert_held(&event_mutex); + + if (!ev || !ev->ops) + return -EINVAL; + + list_add_tail(&ev->list, &dyn_event_list); + return 0; +} + +static inline void dyn_event_remove(struct dyn_event *ev) +{ + lockdep_assert_held(&event_mutex); + list_del_init(&ev->list); +} + +void *dyn_event_seq_start(struct seq_file *m, loff_t *pos); +void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); +void dyn_event_seq_stop(struct seq_file *m, void *v); +int dyn_events_release_all(struct dyn_event_operations *type); +int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type); + +/* + * for_each_dyn_event - iterate over the dyn_event list + * @pos: the struct dyn_event * to use as a loop cursor + * + * This is just a basement of for_each macro. Wrap this for + * each actual event structure with ops filtering. + */ +#define for_each_dyn_event(pos) \ + list_for_each_entry(pos, &dyn_event_list, list) + +/* + * for_each_dyn_event - iterate over the dyn_event list safely + * @pos: the struct dyn_event * to use as a loop cursor + * @n: the struct dyn_event * to use as temporary storage + */ +#define for_each_dyn_event_safe(pos, n) \ + list_for_each_entry_safe(pos, n, &dyn_event_list, list) + +#endif -- cgit From 6212dd29683eec51d6d05374a66ac953e81250e9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:02:36 +0900 Subject: tracing/kprobes: Use dyn_event framework for kprobe events Use dyn_event framework for kprobe events. This shows kprobe events on "tracing/dynamic_events" file. User can also define new events via tracing/dynamic_events. Link: http://lkml.kernel.org/r/154140855646.17322.6619219995865980392.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/kprobetrace.rst | 3 + kernel/trace/Kconfig | 1 + kernel/trace/trace_kprobe.c | 319 ++++++++++++++++++++---------------- kernel/trace/trace_probe.c | 27 +++ kernel/trace/trace_probe.h | 2 + 5 files changed, 207 insertions(+), 145 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index 47e765c2f2c3..235ce2ab131a 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -20,6 +20,9 @@ current_tracer. Instead of that, add probe points via /sys/kernel/debug/tracing/kprobe_events, and enable it via /sys/kernel/debug/tracing/events/kprobes//enable. +You can also use /sys/kernel/debug/tracing/dynamic_events instead of +kprobe_events. That interface will provide unified access to other +dynamic events too. Synopsis of kprobe_events ------------------------- diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index bf2e8a5a91f1..c0f6b0105609 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -461,6 +461,7 @@ config KPROBE_EVENTS bool "Enable kprobes-based dynamic events" select TRACING select PROBE_EVENTS + select DYNAMIC_EVENTS default y help This allows the user to add tracing events (similar to tracepoints) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d313bcc259dc..bdf8c2ad5152 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -12,6 +12,7 @@ #include #include +#include "trace_dynevent.h" #include "trace_kprobe_selftest.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" @@ -19,17 +20,51 @@ #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 +static int trace_kprobe_create(int argc, const char **argv); +static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev); +static int trace_kprobe_release(struct dyn_event *ev); +static bool trace_kprobe_is_busy(struct dyn_event *ev); +static bool trace_kprobe_match(const char *system, const char *event, + struct dyn_event *ev); + +static struct dyn_event_operations trace_kprobe_ops = { + .create = trace_kprobe_create, + .show = trace_kprobe_show, + .is_busy = trace_kprobe_is_busy, + .free = trace_kprobe_release, + .match = trace_kprobe_match, +}; + /** * Kprobe event core functions */ struct trace_kprobe { - struct list_head list; + struct dyn_event devent; struct kretprobe rp; /* Use rp.kp for kprobe use */ unsigned long __percpu *nhit; const char *symbol; /* symbol name */ struct trace_probe tp; }; +static bool is_trace_kprobe(struct dyn_event *ev) +{ + return ev->ops == &trace_kprobe_ops; +} + +static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_kprobe, devent); +} + +/** + * for_each_trace_kprobe - iterate over the trace_kprobe list + * @pos: the struct trace_kprobe * for each entry + * @dpos: the struct dyn_event * to use as a loop cursor + */ +#define for_each_trace_kprobe(pos, dpos) \ + for_each_dyn_event(dpos) \ + if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos))) + #define SIZEOF_TRACE_KPROBE(n) \ (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) @@ -81,6 +116,22 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) return ret; } +static bool trace_kprobe_is_busy(struct dyn_event *ev) +{ + struct trace_kprobe *tk = to_trace_kprobe(ev); + + return trace_probe_is_enabled(&tk->tp); +} + +static bool trace_kprobe_match(const char *system, const char *event, + struct dyn_event *ev) +{ + struct trace_kprobe *tk = to_trace_kprobe(ev); + + return strcmp(trace_event_name(&tk->tp.call), event) == 0 && + (!system || strcmp(tk->tp.call.class->system, system) == 0); +} + static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) { unsigned long nhit = 0; @@ -128,9 +179,6 @@ bool trace_kprobe_error_injectable(struct trace_event_call *call) static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); -static DEFINE_MUTEX(probe_lock); -static LIST_HEAD(probe_list); - static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); @@ -192,7 +240,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, if (!tk->tp.class.system) goto error; - INIT_LIST_HEAD(&tk->list); + dyn_event_init(&tk->devent, &trace_kprobe_ops); INIT_LIST_HEAD(&tk->tp.files); return tk; error: @@ -207,6 +255,9 @@ static void free_trace_kprobe(struct trace_kprobe *tk) { int i; + if (!tk) + return; + for (i = 0; i < tk->tp.nr_args; i++) traceprobe_free_probe_arg(&tk->tp.args[i]); @@ -220,9 +271,10 @@ static void free_trace_kprobe(struct trace_kprobe *tk) static struct trace_kprobe *find_trace_kprobe(const char *event, const char *group) { + struct dyn_event *pos; struct trace_kprobe *tk; - list_for_each_entry(tk, &probe_list, list) + for_each_trace_kprobe(tk, pos) if (strcmp(trace_event_name(&tk->tp.call), event) == 0 && strcmp(tk->tp.call.class->system, group) == 0) return tk; @@ -321,7 +373,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) * created with perf_event_open. We don't need to wait for these * trace_kprobes */ - if (list_empty(&tk->list)) + if (list_empty(&tk->devent.list)) wait = 0; out: if (wait) { @@ -419,7 +471,7 @@ static void __unregister_trace_kprobe(struct trace_kprobe *tk) } } -/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +/* Unregister a trace_probe and probe_event */ static int unregister_trace_kprobe(struct trace_kprobe *tk) { /* Enabled event can not be unregistered */ @@ -431,7 +483,7 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk) return -EBUSY; __unregister_trace_kprobe(tk); - list_del(&tk->list); + dyn_event_remove(&tk->devent); return 0; } @@ -442,7 +494,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) struct trace_kprobe *old_tk; int ret; - mutex_lock(&probe_lock); + mutex_lock(&event_mutex); /* Delete old (same name) event if exist */ old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call), @@ -471,10 +523,10 @@ static int register_trace_kprobe(struct trace_kprobe *tk) if (ret < 0) unregister_kprobe_event(tk); else - list_add_tail(&tk->list, &probe_list); + dyn_event_add(&tk->devent); end: - mutex_unlock(&probe_lock); + mutex_unlock(&event_mutex); return ret; } @@ -483,6 +535,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) { struct module *mod = data; + struct dyn_event *pos; struct trace_kprobe *tk; int ret; @@ -490,8 +543,8 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; /* Update probes on coming module */ - mutex_lock(&probe_lock); - list_for_each_entry(tk, &probe_list, list) { + mutex_lock(&event_mutex); + for_each_trace_kprobe(tk, pos) { if (trace_kprobe_within_module(tk, mod)) { /* Don't need to check busy - this should have gone. */ __unregister_trace_kprobe(tk); @@ -502,7 +555,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, mod->name, ret); } } - mutex_unlock(&probe_lock); + mutex_unlock(&event_mutex); return NOTIFY_DONE; } @@ -520,7 +573,7 @@ static inline void sanitize_event_name(char *name) *name = '_'; } -static int create_trace_kprobe(int argc, char **argv) +static int trace_kprobe_create(int argc, const char *argv[]) { /* * Argument syntax: @@ -544,9 +597,10 @@ static int create_trace_kprobe(int argc, char **argv) * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_kprobe *tk; - int i, ret = 0; - bool is_return = false, is_delete = false; - char *symbol = NULL, *event = NULL, *group = NULL; + int i, len, ret = 0; + bool is_return = false; + char *symbol = NULL, *tmp = NULL; + const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; int maxactive = 0; long offset = 0; void *addr = NULL; @@ -554,26 +608,26 @@ static int create_trace_kprobe(int argc, char **argv) unsigned int flags = TPARG_FL_KERNEL; /* argc must be >= 1 */ - if (argv[0][0] == 'p') - is_return = false; - else if (argv[0][0] == 'r') { + if (argv[0][0] == 'r') { is_return = true; flags |= TPARG_FL_RETURN; - } else if (argv[0][0] == '-') - is_delete = true; - else { - pr_info("Probe definition must be started with 'p', 'r' or" - " '-'.\n"); - return -EINVAL; - } + } else if (argv[0][0] != 'p' || argc < 2) + return -ECANCELED; event = strchr(&argv[0][1], ':'); - if (event) { - event[0] = '\0'; + if (event) event++; - } + if (is_return && isdigit(argv[0][1])) { - ret = kstrtouint(&argv[0][1], 0, &maxactive); + if (event) + len = event - &argv[0][1] - 1; + else + len = strlen(&argv[0][1]); + if (len > MAX_EVENT_NAME_LEN - 1) + return -E2BIG; + memcpy(buf, &argv[0][1], len); + buf[len] = '\0'; + ret = kstrtouint(buf, 0, &maxactive); if (ret) { pr_info("Failed to parse maxactive.\n"); return ret; @@ -588,74 +642,37 @@ static int create_trace_kprobe(int argc, char **argv) } } - if (event) { - char *slash; - - slash = strchr(event, '/'); - if (slash) { - group = event; - event = slash + 1; - slash[0] = '\0'; - if (strlen(group) == 0) { - pr_info("Group name is not specified\n"); - return -EINVAL; - } - } - if (strlen(event) == 0) { - pr_info("Event name is not specified\n"); - return -EINVAL; - } - } - if (!group) - group = KPROBE_EVENT_SYSTEM; - - if (is_delete) { - if (!event) { - pr_info("Delete command needs an event name.\n"); - return -EINVAL; - } - mutex_lock(&probe_lock); - tk = find_trace_kprobe(event, group); - if (!tk) { - mutex_unlock(&probe_lock); - pr_info("Event %s/%s doesn't exist.\n", group, event); - return -ENOENT; - } - /* delete an event */ - ret = unregister_trace_kprobe(tk); - if (ret == 0) - free_trace_kprobe(tk); - mutex_unlock(&probe_lock); - return ret; - } - - if (argc < 2) { - pr_info("Probe point is not specified.\n"); - return -EINVAL; - } - /* try to parse an address. if that fails, try to read the * input as a symbol. */ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { + /* Check whether uprobe event specified */ + if (strchr(argv[1], '/') && strchr(argv[1], ':')) + return -ECANCELED; /* a symbol specified */ - symbol = argv[1]; + symbol = kstrdup(argv[1], GFP_KERNEL); + if (!symbol) + return -ENOMEM; /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { pr_info("Failed to parse either an address or a symbol.\n"); - return ret; + goto out; } if (kprobe_on_func_entry(NULL, symbol, offset)) flags |= TPARG_FL_FENTRY; if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { pr_info("Given offset is not valid for return probe.\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } } argc -= 2; argv += 2; - /* setup a probe */ - if (!event) { + if (event) { + ret = traceprobe_parse_event_name(&event, &group, buf); + if (ret) + goto out; + } else { /* Make a new event name */ if (symbol) snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", @@ -666,17 +683,27 @@ static int create_trace_kprobe(int argc, char **argv) sanitize_event_name(buf); event = buf; } + + /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, argc, is_return); if (IS_ERR(tk)) { pr_info("Failed to allocate trace_probe.(%d)\n", (int)PTR_ERR(tk)); - return PTR_ERR(tk); + ret = PTR_ERR(tk); + goto out; } /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags); + tmp = kstrdup(argv[i], GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto error; + } + + ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); + kfree(tmp); if (ret) goto error; } @@ -684,60 +711,39 @@ static int create_trace_kprobe(int argc, char **argv) ret = register_trace_kprobe(tk); if (ret) goto error; - return 0; +out: + kfree(symbol); + return ret; error: free_trace_kprobe(tk); - return ret; + goto out; } -static int release_all_trace_kprobes(void) +static int create_or_delete_trace_kprobe(int argc, char **argv) { - struct trace_kprobe *tk; - int ret = 0; - - mutex_lock(&probe_lock); - /* Ensure no probe is in use. */ - list_for_each_entry(tk, &probe_list, list) - if (trace_probe_is_enabled(&tk->tp)) { - ret = -EBUSY; - goto end; - } - /* TODO: Use batch unregistration */ - while (!list_empty(&probe_list)) { - tk = list_entry(probe_list.next, struct trace_kprobe, list); - ret = unregister_trace_kprobe(tk); - if (ret) - goto end; - free_trace_kprobe(tk); - } - -end: - mutex_unlock(&probe_lock); + int ret; - return ret; -} + if (argv[0][0] == '-') + return dyn_event_release(argc, argv, &trace_kprobe_ops); -/* Probes listing interfaces */ -static void *probes_seq_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&probe_lock); - return seq_list_start(&probe_list, *pos); + ret = trace_kprobe_create(argc, (const char **)argv); + return ret == -ECANCELED ? -EINVAL : ret; } -static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +static int trace_kprobe_release(struct dyn_event *ev) { - return seq_list_next(v, &probe_list, pos); -} + struct trace_kprobe *tk = to_trace_kprobe(ev); + int ret = unregister_trace_kprobe(tk); -static void probes_seq_stop(struct seq_file *m, void *v) -{ - mutex_unlock(&probe_lock); + if (!ret) + free_trace_kprobe(tk); + return ret; } -static int probes_seq_show(struct seq_file *m, void *v) +static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev) { - struct trace_kprobe *tk = v; + struct trace_kprobe *tk = to_trace_kprobe(ev); int i; seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p'); @@ -759,10 +765,20 @@ static int probes_seq_show(struct seq_file *m, void *v) return 0; } +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_trace_kprobe(ev)) + return 0; + + return trace_kprobe_show(m, ev); +} + static const struct seq_operations probes_seq_op = { - .start = probes_seq_start, - .next = probes_seq_next, - .stop = probes_seq_stop, + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, .show = probes_seq_show }; @@ -771,7 +787,7 @@ static int probes_open(struct inode *inode, struct file *file) int ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = release_all_trace_kprobes(); + ret = dyn_events_release_all(&trace_kprobe_ops); if (ret < 0) return ret; } @@ -783,7 +799,7 @@ static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { return trace_parse_run_command(file, buffer, count, ppos, - create_trace_kprobe); + create_or_delete_trace_kprobe); } static const struct file_operations kprobe_events_ops = { @@ -798,8 +814,13 @@ static const struct file_operations kprobe_events_ops = { /* Probes profiling interfaces */ static int probes_profile_seq_show(struct seq_file *m, void *v) { - struct trace_kprobe *tk = v; + struct dyn_event *ev = v; + struct trace_kprobe *tk; + if (!is_trace_kprobe(ev)) + return 0; + + tk = to_trace_kprobe(ev); seq_printf(m, " %-44s %15lu %15lu\n", trace_event_name(&tk->tp.call), trace_kprobe_nhit(tk), @@ -809,9 +830,9 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) } static const struct seq_operations profile_seq_op = { - .start = probes_seq_start, - .next = probes_seq_next, - .stop = probes_seq_stop, + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, .show = probes_profile_seq_show }; @@ -1332,7 +1353,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - ret = trace_add_event_call(call); + ret = trace_add_event_call_nolock(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", trace_event_name(call)); @@ -1347,7 +1368,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) int ret; /* tp->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tk->tp.call); + ret = trace_remove_event_call_nolock(&tk->tp.call); if (!ret) kfree(tk->tp.call.print_fmt); return ret; @@ -1364,7 +1385,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, char *event; /* - * local trace_kprobes are not added to probe_list, so they are never + * local trace_kprobes are not added to dyn_event, so they are never * searched in find_trace_kprobe(). Therefore, there is no concern of * duplicated name here. */ @@ -1422,6 +1443,11 @@ static __init int init_kprobe_trace(void) { struct dentry *d_tracer; struct dentry *entry; + int ret; + + ret = dyn_event_register(&trace_kprobe_ops); + if (ret) + return ret; if (register_module_notifier(&trace_kprobe_module_nb)) return -EINVAL; @@ -1479,9 +1505,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_info("Testing kprobe tracing: "); - ret = trace_run_command("p:testprobe kprobe_trace_selftest_target " - "$stack $stack0 +0($stack)", - create_trace_kprobe); + ret = trace_run_command("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)", + create_or_delete_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function entry.\n"); warn++; @@ -1501,8 +1526,8 @@ static __init int kprobe_trace_self_tests_init(void) } } - ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target " - "$retval", create_trace_kprobe); + ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target $retval", + create_or_delete_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function return.\n"); warn++; @@ -1572,20 +1597,24 @@ static __init int kprobe_trace_self_tests_init(void) disable_trace_kprobe(tk, file); } - ret = trace_run_command("-:testprobe", create_trace_kprobe); + ret = trace_run_command("-:testprobe", create_or_delete_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } - ret = trace_run_command("-:testprobe2", create_trace_kprobe); + ret = trace_run_command("-:testprobe2", create_or_delete_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } end: - release_all_trace_kprobes(); + ret = dyn_events_release_all(&trace_kprobe_ops); + if (WARN_ON_ONCE(ret)) { + pr_warn("error on cleaning up probes.\n"); + warn++; + } /* * Wait for the optimizer work to finish. Otherwise it might fiddle * with probes in already freed __init text. diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 449150c6a87f..ff86417c0149 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -154,6 +154,33 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) return 0; } +/* @buf must has MAX_EVENT_NAME_LEN size */ +int traceprobe_parse_event_name(const char **pevent, const char **pgroup, + char *buf) +{ + const char *slash, *event = *pevent; + + slash = strchr(event, '/'); + if (slash) { + if (slash == event) { + pr_info("Group name is not specified\n"); + return -EINVAL; + } + if (slash - event + 1 > MAX_EVENT_NAME_LEN) { + pr_info("Group name is too long\n"); + return -E2BIG; + } + strlcpy(buf, event, slash - event + 1); + *pgroup = buf; + *pevent = slash + 1; + } + if (strlen(event) == 0) { + pr_info("Event name is not specified\n"); + return -EINVAL; + } + return 0; +} + #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index feeec261b356..8a63f8bc01bc 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -279,6 +279,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); +extern int traceprobe_parse_event_name(const char **pevent, + const char **pgroup, char *buf); extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); -- cgit From 0597c49c69d53f00df99421d832453f4e92a5006 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:03:04 +0900 Subject: tracing/uprobes: Use dyn_event framework for uprobe events Use dyn_event framework for uprobe events. This shows uprobe events on "dynamic_events" file. User can also define new uprobe events via dynamic_events. Link: http://lkml.kernel.org/r/154140858481.17322.9091293846515154065.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/uprobetracer.rst | 4 + kernel/trace/Kconfig | 1 + kernel/trace/trace_uprobe.c | 278 +++++++++++++++++++---------------- 3 files changed, 153 insertions(+), 130 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/uprobetracer.rst b/Documentation/trace/uprobetracer.rst index d0822811527a..4c3bfde2ba47 100644 --- a/Documentation/trace/uprobetracer.rst +++ b/Documentation/trace/uprobetracer.rst @@ -18,6 +18,10 @@ current_tracer. Instead of that, add probe points via However unlike kprobe-event tracer, the uprobe event interface expects the user to calculate the offset of the probepoint in the object. +You can also use /sys/kernel/debug/tracing/dynamic_events instead of +uprobe_events. That interface will provide unified access to other +dynamic events too. + Synopsis of uprobe_tracer ------------------------- :: diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index c0f6b0105609..2cab3c5dfe2c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -501,6 +501,7 @@ config UPROBE_EVENTS depends on PERF_EVENTS select UPROBES select PROBE_EVENTS + select DYNAMIC_EVENTS select TRACING default y help diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 6eaaa2150685..4a7b21c891f3 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -7,6 +7,7 @@ */ #define pr_fmt(fmt) "trace_kprobe: " fmt +#include #include #include #include @@ -14,6 +15,7 @@ #include #include +#include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" @@ -37,11 +39,26 @@ struct trace_uprobe_filter { struct list_head perf_events; }; +static int trace_uprobe_create(int argc, const char **argv); +static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev); +static int trace_uprobe_release(struct dyn_event *ev); +static bool trace_uprobe_is_busy(struct dyn_event *ev); +static bool trace_uprobe_match(const char *system, const char *event, + struct dyn_event *ev); + +static struct dyn_event_operations trace_uprobe_ops = { + .create = trace_uprobe_create, + .show = trace_uprobe_show, + .is_busy = trace_uprobe_is_busy, + .free = trace_uprobe_release, + .match = trace_uprobe_match, +}; + /* * uprobe event core functions */ struct trace_uprobe { - struct list_head list; + struct dyn_event devent; struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct path path; @@ -53,6 +70,25 @@ struct trace_uprobe { struct trace_probe tp; }; +static bool is_trace_uprobe(struct dyn_event *ev) +{ + return ev->ops == &trace_uprobe_ops; +} + +static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_uprobe, devent); +} + +/** + * for_each_trace_uprobe - iterate over the trace_uprobe list + * @pos: the struct trace_uprobe * for each entry + * @dpos: the struct dyn_event * to use as a loop cursor + */ +#define for_each_trace_uprobe(pos, dpos) \ + for_each_dyn_event(dpos) \ + if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos))) + #define SIZEOF_TRACE_UPROBE(n) \ (offsetof(struct trace_uprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) @@ -60,9 +96,6 @@ struct trace_uprobe { static int register_uprobe_event(struct trace_uprobe *tu); static int unregister_uprobe_event(struct trace_uprobe *tu); -static DEFINE_MUTEX(uprobe_lock); -static LIST_HEAD(uprobe_list); - struct uprobe_dispatch_data { struct trace_uprobe *tu; unsigned long bp_addr; @@ -209,6 +242,22 @@ static inline bool is_ret_probe(struct trace_uprobe *tu) return tu->consumer.ret_handler != NULL; } +static bool trace_uprobe_is_busy(struct dyn_event *ev) +{ + struct trace_uprobe *tu = to_trace_uprobe(ev); + + return trace_probe_is_enabled(&tu->tp); +} + +static bool trace_uprobe_match(const char *system, const char *event, + struct dyn_event *ev) +{ + struct trace_uprobe *tu = to_trace_uprobe(ev); + + return strcmp(trace_event_name(&tu->tp.call), event) == 0 && + (!system || strcmp(tu->tp.call.class->system, system) == 0); +} + /* * Allocate new trace_uprobe and initialize it (including uprobes). */ @@ -236,7 +285,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu->tp.class.system) goto error; - INIT_LIST_HEAD(&tu->list); + dyn_event_init(&tu->devent, &trace_uprobe_ops); INIT_LIST_HEAD(&tu->tp.files); tu->consumer.handler = uprobe_dispatcher; if (is_ret) @@ -255,6 +304,9 @@ static void free_trace_uprobe(struct trace_uprobe *tu) { int i; + if (!tu) + return; + for (i = 0; i < tu->tp.nr_args; i++) traceprobe_free_probe_arg(&tu->tp.args[i]); @@ -267,9 +319,10 @@ static void free_trace_uprobe(struct trace_uprobe *tu) static struct trace_uprobe *find_probe_event(const char *event, const char *group) { + struct dyn_event *pos; struct trace_uprobe *tu; - list_for_each_entry(tu, &uprobe_list, list) + for_each_trace_uprobe(tu, pos) if (strcmp(trace_event_name(&tu->tp.call), event) == 0 && strcmp(tu->tp.call.class->system, group) == 0) return tu; @@ -277,7 +330,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou return NULL; } -/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ +/* Unregister a trace_uprobe and probe_event */ static int unregister_trace_uprobe(struct trace_uprobe *tu) { int ret; @@ -286,7 +339,7 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) if (ret) return ret; - list_del(&tu->list); + dyn_event_remove(&tu->devent); free_trace_uprobe(tu); return 0; } @@ -302,13 +355,14 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) */ static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new) { + struct dyn_event *pos; struct trace_uprobe *tmp, *old = NULL; struct inode *new_inode = d_real_inode(new->path.dentry); old = find_probe_event(trace_event_name(&new->tp.call), new->tp.call.class->system); - list_for_each_entry(tmp, &uprobe_list, list) { + for_each_trace_uprobe(tmp, pos) { if ((old ? old != tmp : true) && new_inode == d_real_inode(tmp->path.dentry) && new->offset == tmp->offset && @@ -326,7 +380,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) struct trace_uprobe *old_tu; int ret; - mutex_lock(&uprobe_lock); + mutex_lock(&event_mutex); /* register as an event */ old_tu = find_old_trace_uprobe(tu); @@ -348,10 +402,10 @@ static int register_trace_uprobe(struct trace_uprobe *tu) goto end; } - list_add_tail(&tu->list, &uprobe_list); + dyn_event_add(&tu->devent); end: - mutex_unlock(&uprobe_lock); + mutex_unlock(&event_mutex); return ret; } @@ -362,91 +416,49 @@ end: * * - Remove uprobe: -:[GRP/]EVENT */ -static int create_trace_uprobe(int argc, char **argv) +static int trace_uprobe_create(int argc, const char **argv) { struct trace_uprobe *tu; - char *arg, *event, *group, *filename, *rctr, *rctr_end; + const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; + char *arg, *filename, *rctr, *rctr_end, *tmp; char buf[MAX_EVENT_NAME_LEN]; struct path path; unsigned long offset, ref_ctr_offset; - bool is_delete, is_return; + bool is_return = false; int i, ret; ret = 0; - is_delete = false; - is_return = false; - event = NULL; - group = NULL; ref_ctr_offset = 0; /* argc must be >= 1 */ - if (argv[0][0] == '-') - is_delete = true; - else if (argv[0][0] == 'r') + if (argv[0][0] == 'r') is_return = true; - else if (argv[0][0] != 'p') { - pr_info("Probe definition must be started with 'p', 'r' or '-'.\n"); - return -EINVAL; - } + else if (argv[0][0] != 'p' || argc < 2) + return -ECANCELED; - if (argv[0][1] == ':') { + if (argv[0][1] == ':') event = &argv[0][2]; - arg = strchr(event, '/'); - if (arg) { - group = event; - event = arg + 1; - event[-1] = '\0'; + if (!strchr(argv[1], '/')) + return -ECANCELED; - if (strlen(group) == 0) { - pr_info("Group name is not specified\n"); - return -EINVAL; - } - } - if (strlen(event) == 0) { - pr_info("Event name is not specified\n"); - return -EINVAL; - } - } - if (!group) - group = UPROBE_EVENT_SYSTEM; - - if (is_delete) { - int ret; - - if (!event) { - pr_info("Delete command needs an event name.\n"); - return -EINVAL; - } - mutex_lock(&uprobe_lock); - tu = find_probe_event(event, group); - - if (!tu) { - mutex_unlock(&uprobe_lock); - pr_info("Event %s/%s doesn't exist.\n", group, event); - return -ENOENT; - } - /* delete an event */ - ret = unregister_trace_uprobe(tu); - mutex_unlock(&uprobe_lock); - return ret; - } + filename = kstrdup(argv[1], GFP_KERNEL); + if (!filename) + return -ENOMEM; - if (argc < 2) { - pr_info("Probe point is not specified.\n"); - return -EINVAL; - } /* Find the last occurrence, in case the path contains ':' too. */ - arg = strrchr(argv[1], ':'); - if (!arg) - return -EINVAL; + arg = strrchr(filename, ':'); + if (!arg || !isdigit(arg[1])) { + kfree(filename); + return -ECANCELED; + } *arg++ = '\0'; - filename = argv[1]; ret = kern_path(filename, LOOKUP_FOLLOW, &path); - if (ret) + if (ret) { + kfree(filename); return ret; - + } if (!d_is_reg(path.dentry)) { ret = -EINVAL; goto fail_address_parse; @@ -480,7 +492,11 @@ static int create_trace_uprobe(int argc, char **argv) argv += 2; /* setup a probe */ - if (!event) { + if (event) { + ret = traceprobe_parse_event_name(&event, &group, buf); + if (ret) + goto fail_address_parse; + } else { char *tail; char *ptr; @@ -508,18 +524,19 @@ static int create_trace_uprobe(int argc, char **argv) tu->offset = offset; tu->ref_ctr_offset = ref_ctr_offset; tu->path = path; - tu->filename = kstrdup(filename, GFP_KERNEL); - - if (!tu->filename) { - pr_info("Failed to allocate filename.\n"); - ret = -ENOMEM; - goto error; - } + tu->filename = filename; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], + tmp = kstrdup(argv[i], GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto error; + } + + ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, is_return ? TPARG_FL_RETURN : 0); + kfree(tmp); if (ret) goto error; } @@ -535,55 +552,35 @@ error: fail_address_parse: path_put(&path); + kfree(filename); pr_info("Failed to parse address or file.\n"); return ret; } -static int cleanup_all_probes(void) +static int create_or_delete_trace_uprobe(int argc, char **argv) { - struct trace_uprobe *tu; - int ret = 0; + int ret; - mutex_lock(&uprobe_lock); - /* Ensure no probe is in use. */ - list_for_each_entry(tu, &uprobe_list, list) - if (trace_probe_is_enabled(&tu->tp)) { - ret = -EBUSY; - goto end; - } - while (!list_empty(&uprobe_list)) { - tu = list_entry(uprobe_list.next, struct trace_uprobe, list); - ret = unregister_trace_uprobe(tu); - if (ret) - break; - } -end: - mutex_unlock(&uprobe_lock); - return ret; -} + if (argv[0][0] == '-') + return dyn_event_release(argc, argv, &trace_uprobe_ops); -/* Probes listing interfaces */ -static void *probes_seq_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&uprobe_lock); - return seq_list_start(&uprobe_list, *pos); + ret = trace_uprobe_create(argc, (const char **)argv); + return ret == -ECANCELED ? -EINVAL : ret; } -static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +static int trace_uprobe_release(struct dyn_event *ev) { - return seq_list_next(v, &uprobe_list, pos); -} + struct trace_uprobe *tu = to_trace_uprobe(ev); -static void probes_seq_stop(struct seq_file *m, void *v) -{ - mutex_unlock(&uprobe_lock); + return unregister_trace_uprobe(tu); } -static int probes_seq_show(struct seq_file *m, void *v) +/* Probes listing interfaces */ +static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev) { - struct trace_uprobe *tu = v; + struct trace_uprobe *tu = to_trace_uprobe(ev); char c = is_ret_probe(tu) ? 'r' : 'p'; int i; @@ -601,11 +598,21 @@ static int probes_seq_show(struct seq_file *m, void *v) return 0; } +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_trace_uprobe(ev)) + return 0; + + return trace_uprobe_show(m, ev); +} + static const struct seq_operations probes_seq_op = { - .start = probes_seq_start, - .next = probes_seq_next, - .stop = probes_seq_stop, - .show = probes_seq_show + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = probes_seq_show }; static int probes_open(struct inode *inode, struct file *file) @@ -613,7 +620,7 @@ static int probes_open(struct inode *inode, struct file *file) int ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = cleanup_all_probes(); + ret = dyn_events_release_all(&trace_uprobe_ops); if (ret) return ret; } @@ -624,7 +631,8 @@ static int probes_open(struct inode *inode, struct file *file) static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe); + return trace_parse_run_command(file, buffer, count, ppos, + create_or_delete_trace_uprobe); } static const struct file_operations uprobe_events_ops = { @@ -639,17 +647,22 @@ static const struct file_operations uprobe_events_ops = { /* Probes profiling interfaces */ static int probes_profile_seq_show(struct seq_file *m, void *v) { - struct trace_uprobe *tu = v; + struct dyn_event *ev = v; + struct trace_uprobe *tu; + + if (!is_trace_uprobe(ev)) + return 0; + tu = to_trace_uprobe(ev); seq_printf(m, " %s %-44s %15lu\n", tu->filename, trace_event_name(&tu->tp.call), tu->nhit); return 0; } static const struct seq_operations profile_seq_op = { - .start = probes_seq_start, - .next = probes_seq_next, - .stop = probes_seq_stop, + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, .show = probes_profile_seq_show }; @@ -1307,7 +1320,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } - ret = trace_add_event_call(call); + ret = trace_add_event_call_nolock(call); if (ret) { pr_info("Failed to register uprobe event: %s\n", @@ -1324,7 +1337,7 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) int ret; /* tu->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tu->tp.call); + ret = trace_remove_event_call_nolock(&tu->tp.call); if (ret) return ret; kfree(tu->tp.call.print_fmt); @@ -1351,7 +1364,7 @@ create_local_trace_uprobe(char *name, unsigned long offs, } /* - * local trace_kprobes are not added to probe_list, so they are never + * local trace_kprobes are not added to dyn_event, so they are never * searched in find_trace_kprobe(). Therefore, there is no concern of * duplicated name "DUMMY_EVENT" here. */ @@ -1399,6 +1412,11 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call) static __init int init_uprobe_trace(void) { struct dentry *d_tracer; + int ret; + + ret = dyn_event_register(&trace_uprobe_ops); + if (ret) + return ret; d_tracer = tracing_init_dentry(); if (IS_ERR(d_tracer)) -- cgit From 7bbab38d07f3185fddf6fce126e2239010efdfce Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:03:33 +0900 Subject: tracing: Use dyn_event framework for synthetic events Use dyn_event framework for synthetic events. This shows synthetic events on "tracing/dynamic_events" file in addition to tracing/synthetic_events interface. User can also define new events via tracing/dynamic_events with "s:" prefix. So, the new syntax is below; s:[synthetic/]EVENT_NAME TYPE ARG; [TYPE ARG;]... To remove events via tracing/dynamic_events, you can use "-:" prefix as same as other events. Link: http://lkml.kernel.org/r/154140861301.17322.15454611233735614508.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 1 + kernel/trace/trace.c | 8 ++ kernel/trace/trace_events_hist.c | 265 ++++++++++++++++++++++++--------------- 3 files changed, 176 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2cab3c5dfe2c..fa8b1fe824f3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -635,6 +635,7 @@ config HIST_TRIGGERS depends on ARCH_HAVE_NMI_SAFE_CMPXCHG select TRACING_MAP select TRACING + select DYNAMIC_EVENTS default n help Hist triggers allow one or more arbitrary trace event fields diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7e0332f90ed4..911470ad9e94 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4620,6 +4620,9 @@ static const char readme_msg[] = "\t accepts: event-definitions (one definition per line)\n" "\t Format: p[:[/]] []\n" "\t r[maxactive][:[/]] []\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t s:[synthetic/] []\n" +#endif "\t -:[/]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" @@ -4638,6 +4641,11 @@ static const char readme_msg[] = "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b@/,\n" "\t \\[\\]\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t field: ;\n" + "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" + "\t [unsigned] char/int/long\n" +#endif #endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0feb7f460123..414aabd67d1f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -15,6 +15,7 @@ #include "tracing_map.h" #include "trace.h" +#include "trace_dynevent.h" #define SYNTH_SYSTEM "synthetic" #define SYNTH_FIELDS_MAX 16 @@ -292,6 +293,21 @@ struct hist_trigger_data { unsigned int n_max_var_str; }; +static int synth_event_create(int argc, const char **argv); +static int synth_event_show(struct seq_file *m, struct dyn_event *ev); +static int synth_event_release(struct dyn_event *ev); +static bool synth_event_is_busy(struct dyn_event *ev); +static bool synth_event_match(const char *system, const char *event, + struct dyn_event *ev); + +static struct dyn_event_operations synth_event_ops = { + .create = synth_event_create, + .show = synth_event_show, + .is_busy = synth_event_is_busy, + .free = synth_event_release, + .match = synth_event_match, +}; + struct synth_field { char *type; char *name; @@ -301,7 +317,7 @@ struct synth_field { }; struct synth_event { - struct list_head list; + struct dyn_event devent; int ref; char *name; struct synth_field **fields; @@ -312,6 +328,32 @@ struct synth_event { struct tracepoint *tp; }; +static bool is_synth_event(struct dyn_event *ev) +{ + return ev->ops == &synth_event_ops; +} + +static struct synth_event *to_synth_event(struct dyn_event *ev) +{ + return container_of(ev, struct synth_event, devent); +} + +static bool synth_event_is_busy(struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + return event->ref != 0; +} + +static bool synth_event_match(const char *system, const char *event, + struct dyn_event *ev) +{ + struct synth_event *sev = to_synth_event(ev); + + return strcmp(sev->name, event) == 0 && + (!system || strcmp(system, SYNTH_SYSTEM) == 0); +} + struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, @@ -402,7 +444,6 @@ static bool have_hist_err(void) return false; } -static LIST_HEAD(synth_event_list); static DEFINE_MUTEX(synth_event_mutex); struct synth_trace_event { @@ -738,14 +779,12 @@ static void free_synth_field(struct synth_field *field) kfree(field); } -static struct synth_field *parse_synth_field(int argc, char **argv, +static struct synth_field *parse_synth_field(int argc, const char **argv, int *consumed) { struct synth_field *field; - const char *prefix = NULL; - char *field_type = argv[0], *field_name; + const char *prefix = NULL, *field_type = argv[0], *field_name, *array; int len, ret = 0; - char *array; if (field_type[0] == ';') field_type++; @@ -762,20 +801,31 @@ static struct synth_field *parse_synth_field(int argc, char **argv, *consumed = 2; } - len = strlen(field_name); - if (field_name[len - 1] == ';') - field_name[len - 1] = '\0'; - field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) return ERR_PTR(-ENOMEM); - len = strlen(field_type) + 1; + len = strlen(field_name); array = strchr(field_name, '['); + if (array) + len -= strlen(array); + else if (field_name[len - 1] == ';') + len--; + + field->name = kmemdup_nul(field_name, len, GFP_KERNEL); + if (!field->name) { + ret = -ENOMEM; + goto free; + } + + if (field_type[0] == ';') + field_type++; + len = strlen(field_type) + 1; if (array) len += strlen(array); if (prefix) len += strlen(prefix); + field->type = kzalloc(len, GFP_KERNEL); if (!field->type) { ret = -ENOMEM; @@ -786,7 +836,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv, strcat(field->type, field_type); if (array) { strcat(field->type, array); - *array = '\0'; + if (field->type[len - 1] == ';') + field->type[len - 1] = '\0'; } field->size = synth_field_size(field->type); @@ -800,11 +851,6 @@ static struct synth_field *parse_synth_field(int argc, char **argv, field->is_signed = synth_field_signed(field->type); - field->name = kstrdup(field_name, GFP_KERNEL); - if (!field->name) { - ret = -ENOMEM; - goto free; - } out: return field; free: @@ -868,9 +914,13 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, static struct synth_event *find_synth_event(const char *name) { + struct dyn_event *pos; struct synth_event *event; - list_for_each_entry(event, &synth_event_list, list) { + for_each_dyn_event(pos) { + if (!is_synth_event(pos)) + continue; + event = to_synth_event(pos); if (strcmp(event->name, name) == 0) return event; } @@ -921,7 +971,7 @@ static int register_synth_event(struct synth_event *event) ret = set_synth_event_print_fmt(call); if (ret < 0) { - trace_remove_event_call(call); + trace_remove_event_call_nolock(call); goto err; } out: @@ -959,7 +1009,7 @@ static void free_synth_event(struct synth_event *event) kfree(event); } -static struct synth_event *alloc_synth_event(char *event_name, int n_fields, +static struct synth_event *alloc_synth_event(const char *name, int n_fields, struct synth_field **fields) { struct synth_event *event; @@ -971,7 +1021,7 @@ static struct synth_event *alloc_synth_event(char *event_name, int n_fields, goto out; } - event->name = kstrdup(event_name, GFP_KERNEL); + event->name = kstrdup(name, GFP_KERNEL); if (!event->name) { kfree(event); event = ERR_PTR(-ENOMEM); @@ -985,6 +1035,8 @@ static struct synth_event *alloc_synth_event(char *event_name, int n_fields, goto out; } + dyn_event_init(&event->devent, &synth_event_ops); + for (i = 0; i < n_fields; i++) event->fields[i] = fields[i]; @@ -1008,16 +1060,11 @@ struct hist_var_data { struct hist_trigger_data *hist_data; }; -static int create_synth_event(int argc, char **argv) +static int __create_synth_event(int argc, const char *name, const char **argv) { struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; struct synth_event *event = NULL; - bool delete_event = false; int i, consumed = 0, n_fields = 0, ret = 0; - char *name; - - mutex_lock(&event_mutex); - mutex_lock(&synth_event_mutex); /* * Argument syntax: @@ -1025,43 +1072,20 @@ static int create_synth_event(int argc, char **argv) * - Remove synthetic event: ! field[;field] ... * where 'field' = type field_name */ - if (argc < 1) { - ret = -EINVAL; - goto out; - } - name = argv[0]; - if (name[0] == '!') { - delete_event = true; - name++; - } + if (name[0] == '\0' || argc < 1) + return -EINVAL; + + mutex_lock(&event_mutex); + mutex_lock(&synth_event_mutex); event = find_synth_event(name); if (event) { - if (delete_event) { - if (event->ref) { - ret = -EBUSY; - goto out; - } - ret = unregister_synth_event(event); - if (!ret) { - list_del(&event->list); - free_synth_event(event); - } - } else - ret = -EEXIST; - goto out; - } else if (delete_event) { - ret = -ENOENT; + ret = -EEXIST; goto out; } - if (argc < 2) { - ret = -EINVAL; - goto out; - } - - for (i = 1; i < argc - 1; i++) { + for (i = 0; i < argc - 1; i++) { if (strcmp(argv[i], ";") == 0) continue; if (n_fields == SYNTH_FIELDS_MAX) { @@ -1091,7 +1115,7 @@ static int create_synth_event(int argc, char **argv) } ret = register_synth_event(event); if (!ret) - list_add(&event->list, &synth_event_list); + dyn_event_add(&event->devent); else free_synth_event(event); out: @@ -1106,57 +1130,77 @@ static int create_synth_event(int argc, char **argv) goto out; } -static int release_all_synth_events(void) +static int create_or_delete_synth_event(int argc, char **argv) { - struct synth_event *event, *e; - int ret = 0; - - mutex_lock(&event_mutex); - mutex_lock(&synth_event_mutex); - - list_for_each_entry(event, &synth_event_list, list) { - if (event->ref) { - mutex_unlock(&synth_event_mutex); - return -EBUSY; - } - } + const char *name = argv[0]; + struct synth_event *event = NULL; + int ret; - list_for_each_entry_safe(event, e, &synth_event_list, list) { - ret = unregister_synth_event(event); - if (!ret) { - list_del(&event->list); - free_synth_event(event); + /* trace_run_command() ensures argc != 0 */ + if (name[0] == '!') { + mutex_lock(&event_mutex); + mutex_lock(&synth_event_mutex); + event = find_synth_event(name + 1); + if (event) { + if (event->ref) + ret = -EBUSY; + else { + ret = unregister_synth_event(event); + if (!ret) { + dyn_event_remove(&event->devent); + free_synth_event(event); + } + } } else - break; + ret = -ENOENT; + mutex_unlock(&synth_event_mutex); + mutex_unlock(&event_mutex); + return ret; } - mutex_unlock(&synth_event_mutex); - mutex_unlock(&event_mutex); - return ret; + ret = __create_synth_event(argc - 1, name, (const char **)argv + 1); + return ret == -ECANCELED ? -EINVAL : ret; } - -static void *synth_events_seq_start(struct seq_file *m, loff_t *pos) +static int synth_event_create(int argc, const char **argv) { - mutex_lock(&synth_event_mutex); + const char *name = argv[0]; + int len; - return seq_list_start(&synth_event_list, *pos); -} + if (name[0] != 's' || name[1] != ':') + return -ECANCELED; + name += 2; -static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos) -{ - return seq_list_next(v, &synth_event_list, pos); + /* This interface accepts group name prefix */ + if (strchr(name, '/')) { + len = sizeof(SYNTH_SYSTEM "/") - 1; + if (strncmp(name, SYNTH_SYSTEM "/", len)) + return -EINVAL; + name += len; + } + return __create_synth_event(argc - 1, name, argv + 1); } -static void synth_events_seq_stop(struct seq_file *m, void *v) +static int synth_event_release(struct dyn_event *ev) { - mutex_unlock(&synth_event_mutex); + struct synth_event *event = to_synth_event(ev); + int ret; + + if (event->ref) + return -EBUSY; + + ret = unregister_synth_event(event); + if (ret) + return ret; + + dyn_event_remove(ev); + free_synth_event(event); + return 0; } -static int synth_events_seq_show(struct seq_file *m, void *v) +static int __synth_event_show(struct seq_file *m, struct synth_event *event) { struct synth_field *field; - struct synth_event *event = v; unsigned int i; seq_printf(m, "%s\t", event->name); @@ -1174,11 +1218,30 @@ static int synth_events_seq_show(struct seq_file *m, void *v) return 0; } +static int synth_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + seq_printf(m, "s:%s/", event->class.system); + + return __synth_event_show(m, event); +} + +static int synth_events_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_synth_event(ev)) + return 0; + + return __synth_event_show(m, to_synth_event(ev)); +} + static const struct seq_operations synth_events_seq_op = { - .start = synth_events_seq_start, - .next = synth_events_seq_next, - .stop = synth_events_seq_stop, - .show = synth_events_seq_show + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = synth_events_seq_show, }; static int synth_events_open(struct inode *inode, struct file *file) @@ -1186,7 +1249,7 @@ static int synth_events_open(struct inode *inode, struct file *file) int ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = release_all_synth_events(); + ret = dyn_events_release_all(&synth_event_ops); if (ret < 0) return ret; } @@ -1199,7 +1262,7 @@ static ssize_t synth_events_write(struct file *file, size_t count, loff_t *ppos) { return trace_parse_run_command(file, buffer, count, ppos, - create_synth_event); + create_or_delete_synth_event); } static const struct file_operations synth_events_fops = { @@ -5791,6 +5854,12 @@ static __init int trace_events_hist_init(void) struct dentry *d_tracer; int err = 0; + err = dyn_event_register(&synth_event_ops); + if (err) { + pr_warn("Could not register synth_event_ops\n"); + return err; + } + d_tracer = tracing_init_dentry(); if (IS_ERR(d_tracer)) { err = PTR_ERR(d_tracer); -- cgit From 0e2b81f7b52a1c1a8c46986f9ca01eb7b3c421f8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:04:01 +0900 Subject: tracing: Remove unneeded synth_event_mutex Rmove unneeded synth_event_mutex. This mutex protects the reference count in synth_event, however, those operational points are already protected by event_mutex. 1. In __create_synth_event() and create_or_delete_synth_event(), those synth_event_mutex clearly obtained right after event_mutex. 2. event_hist_trigger_func() is trigger_hist_cmd.func() which is called by trigger_process_regex(), which is a part of event_trigger_regex_write() and this function takes event_mutex. 3. hist_unreg_all() is trigger_hist_cmd.unreg_all() which is called by event_trigger_regex_open() and it takes event_mutex. 4. onmatch_destroy() and onmatch_create() have long call tree, but both are finally invoked from event_trigger_regex_write() and event_trace_del_tracer(), former takes event_mutex, and latter ensures called under event_mutex locked. Finally, I ensured there is no resource conflict. For safety, I added lockdep_assert_held(&event_mutex) for each function. Link: http://lkml.kernel.org/r/154140864134.17322.4796059721306031894.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 414aabd67d1f..21e4954375a1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -444,8 +444,6 @@ static bool have_hist_err(void) return false; } -static DEFINE_MUTEX(synth_event_mutex); - struct synth_trace_event { struct trace_entry ent; u64 fields[]; @@ -1077,7 +1075,6 @@ static int __create_synth_event(int argc, const char *name, const char **argv) return -EINVAL; mutex_lock(&event_mutex); - mutex_lock(&synth_event_mutex); event = find_synth_event(name); if (event) { @@ -1119,7 +1116,6 @@ static int __create_synth_event(int argc, const char *name, const char **argv) else free_synth_event(event); out: - mutex_unlock(&synth_event_mutex); mutex_unlock(&event_mutex); return ret; @@ -1139,7 +1135,6 @@ static int create_or_delete_synth_event(int argc, char **argv) /* trace_run_command() ensures argc != 0 */ if (name[0] == '!') { mutex_lock(&event_mutex); - mutex_lock(&synth_event_mutex); event = find_synth_event(name + 1); if (event) { if (event->ref) @@ -1153,7 +1148,6 @@ static int create_or_delete_synth_event(int argc, char **argv) } } else ret = -ENOENT; - mutex_unlock(&synth_event_mutex); mutex_unlock(&event_mutex); return ret; } @@ -3535,7 +3529,7 @@ static void onmatch_destroy(struct action_data *data) { unsigned int i; - mutex_lock(&synth_event_mutex); + lockdep_assert_held(&event_mutex); kfree(data->onmatch.match_event); kfree(data->onmatch.match_event_system); @@ -3548,8 +3542,6 @@ static void onmatch_destroy(struct action_data *data) data->onmatch.synth_event->ref--; kfree(data); - - mutex_unlock(&synth_event_mutex); } static void destroy_field_var(struct field_var *field_var) @@ -3700,15 +3692,14 @@ static int onmatch_create(struct hist_trigger_data *hist_data, struct synth_event *event; int ret = 0; - mutex_lock(&synth_event_mutex); + lockdep_assert_held(&event_mutex); + event = find_synth_event(data->onmatch.synth_event_name); if (!event) { hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name); - mutex_unlock(&synth_event_mutex); return -EINVAL; } event->ref++; - mutex_unlock(&synth_event_mutex); var_ref_idx = hist_data->n_var_refs; @@ -3782,9 +3773,7 @@ static int onmatch_create(struct hist_trigger_data *hist_data, out: return ret; err: - mutex_lock(&synth_event_mutex); event->ref--; - mutex_unlock(&synth_event_mutex); goto out; } @@ -5492,6 +5481,8 @@ static void hist_unreg_all(struct trace_event_file *file) struct synth_event *se; const char *se_name; + lockdep_assert_held(&event_mutex); + if (hist_file_check_refs(file)) return; @@ -5501,12 +5492,10 @@ static void hist_unreg_all(struct trace_event_file *file) list_del_rcu(&test->list); trace_event_trigger_enable_disable(file, 0); - mutex_lock(&synth_event_mutex); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) se->ref--; - mutex_unlock(&synth_event_mutex); update_cond_flag(file); if (hist_data->enable_timestamps) @@ -5532,6 +5521,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, char *trigger, *p; int ret = 0; + lockdep_assert_held(&event_mutex); + if (glob && strlen(glob)) { last_cmd_set(param); hist_err_clear(); @@ -5622,14 +5613,10 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, } cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); - - mutex_lock(&synth_event_mutex); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) se->ref--; - mutex_unlock(&synth_event_mutex); - ret = 0; goto out_free; } @@ -5665,13 +5652,10 @@ enable: if (ret) goto out_unreg; - mutex_lock(&synth_event_mutex); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) se->ref++; - mutex_unlock(&synth_event_mutex); - /* Just return zero, not the number of registered triggers */ ret = 0; out: -- cgit From c454a46b5efd8eff8880e88ece2976e60a26bf35 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 7 Dec 2018 16:42:25 -0800 Subject: bpf: Add bpf_line_info support This patch adds bpf_line_info support. It accepts an array of bpf_line_info objects during BPF_PROG_LOAD. The "line_info", "line_info_cnt" and "line_info_rec_size" are added to the "union bpf_attr". The "line_info_rec_size" makes bpf_line_info extensible in the future. The new "check_btf_line()" ensures the userspace line_info is valid for the kernel to use. When the verifier is translating/patching the bpf_prog (through "bpf_patch_insn_single()"), the line_infos' insn_off is also adjusted by the newly added "bpf_adj_linfo()". If the bpf_prog is jited, this patch also provides the jited addrs (in aux->jited_linfo) for the corresponding line_info.insn_off. "bpf_prog_fill_jited_linfo()" is added to fill the aux->jited_linfo. It is currently called by the x86 jit. Other jits can also use "bpf_prog_fill_jited_linfo()" and it will be done in the followup patches. In the future, if it deemed necessary, a particular jit could also provide its own "bpf_prog_fill_jited_linfo()" implementation. A few "*line_info*" fields are added to the bpf_prog_info such that the user can get the xlated line_info back (i.e. the line_info with its insn_off reflecting the translated prog). The jited_line_info is available if the prog is jited. It is an array of __u64. If the prog is not jited, jited_line_info_cnt is 0. The verifier's verbose log with line_info will be done in a follow up patch. Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 2 + include/linux/bpf.h | 21 +++++ include/linux/bpf_verifier.h | 1 + include/linux/btf.h | 1 + include/linux/filter.h | 7 ++ include/uapi/linux/bpf.h | 19 +++++ kernel/bpf/btf.c | 2 +- kernel/bpf/core.c | 118 +++++++++++++++++++++++++- kernel/bpf/syscall.c | 83 ++++++++++++++++-- kernel/bpf/verifier.c | 198 +++++++++++++++++++++++++++++++++++++------ 10 files changed, 419 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2580cd2e98b1..5542303c43d9 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1181,6 +1181,8 @@ out_image: } if (!image || !prog->is_func || extra_pass) { + if (image) + bpf_prog_fill_jited_linfo(prog, addrs); out_addrs: kfree(addrs); kfree(jit_data); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e82b7039fc66..0c992b86eb2c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -319,7 +319,28 @@ struct bpf_prog_aux { struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; + /* bpf_line_info loaded from userspace. linfo->insn_off + * has the xlated insn offset. + * Both the main and sub prog share the same linfo. + * The subprog can access its first linfo by + * using the linfo_idx. + */ + struct bpf_line_info *linfo; + /* jited_linfo is the jited addr of the linfo. It has a + * one to one mapping to linfo: + * jited_linfo[i] is the jited addr for the linfo[i]->insn_off. + * Both the main and sub prog share the same jited_linfo. + * The subprog can access its first jited_linfo by + * using the linfo_idx. + */ + void **jited_linfo; u32 func_info_cnt; + u32 nr_linfo; + /* subprog can use linfo_idx to access its first linfo and + * jited_linfo. + * main prog always has linfo_idx == 0 + */ + u32 linfo_idx; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 11f5df1092d9..c736945be7c5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -203,6 +203,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ + u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ }; diff --git a/include/linux/btf.h b/include/linux/btf.h index 8c2199b5d250..b98405a56383 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -46,6 +46,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); +bool btf_name_offset_valid(const struct btf *btf, u32 offset); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); diff --git a/include/linux/filter.h b/include/linux/filter.h index d16deead65c6..29f21f9d7f68 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -718,6 +718,13 @@ void bpf_prog_free(struct bpf_prog *fp); bool bpf_opcode_in_insntable(u8 code); +void bpf_prog_free_linfo(struct bpf_prog *prog); +void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, + const u32 *insn_to_jit_off); +int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog); +void bpf_prog_free_jited_linfo(struct bpf_prog *prog); +void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog); + struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a84fd232d934..7a66db8d15d5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -356,6 +356,9 @@ union bpf_attr { __u32 func_info_rec_size; /* userspace bpf_func_info size */ __aligned_u64 func_info; /* func info */ __u32 func_info_cnt; /* number of bpf_func_info records */ + __u32 line_info_rec_size; /* userspace bpf_line_info size */ + __aligned_u64 line_info; /* line info */ + __u32 line_info_cnt; /* number of bpf_line_info records */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -2679,6 +2682,12 @@ struct bpf_prog_info { __u32 func_info_rec_size; __aligned_u64 func_info; __u32 func_info_cnt; + __u32 line_info_cnt; + __aligned_u64 line_info; + __aligned_u64 jited_line_info; + __u32 jited_line_info_cnt; + __u32 line_info_rec_size; + __u32 jited_line_info_rec_size; } __attribute__((aligned(8))); struct bpf_map_info { @@ -2995,4 +3004,14 @@ struct bpf_func_info { __u32 type_id; }; +#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) +#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) + +struct bpf_line_info { + __u32 insn_off; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a09b2f94ab25..e0a827f95e19 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -444,7 +444,7 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) return kind_ops[BTF_INFO_KIND(t->info)]; } -static bool btf_name_offset_valid(const struct btf *btf, u32 offset) +bool btf_name_offset_valid(const struct btf *btf, u32 offset) { return BTF_STR_OFFSET_VALID(offset) && offset < btf->hdr.str_len; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a5b223ef7131..5cdd8da0e7f2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -105,6 +105,91 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) } EXPORT_SYMBOL_GPL(bpf_prog_alloc); +int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) +{ + if (!prog->aux->nr_linfo || !prog->jit_requested) + return 0; + + prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo, + sizeof(*prog->aux->jited_linfo), + GFP_KERNEL | __GFP_NOWARN); + if (!prog->aux->jited_linfo) + return -ENOMEM; + + return 0; +} + +void bpf_prog_free_jited_linfo(struct bpf_prog *prog) +{ + kfree(prog->aux->jited_linfo); + prog->aux->jited_linfo = NULL; +} + +void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog) +{ + if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0]) + bpf_prog_free_jited_linfo(prog); +} + +/* The jit engine is responsible to provide an array + * for insn_off to the jited_off mapping (insn_to_jit_off). + * + * The idx to this array is the insn_off. Hence, the insn_off + * here is relative to the prog itself instead of the main prog. + * This array has one entry for each xlated bpf insn. + * + * jited_off is the byte off to the last byte of the jited insn. + * + * Hence, with + * insn_start: + * The first bpf insn off of the prog. The insn off + * here is relative to the main prog. + * e.g. if prog is a subprog, insn_start > 0 + * linfo_idx: + * The prog's idx to prog->aux->linfo and jited_linfo + * + * jited_linfo[linfo_idx] = prog->bpf_func + * + * For i > linfo_idx, + * + * jited_linfo[i] = prog->bpf_func + + * insn_to_jit_off[linfo[i].insn_off - insn_start - 1] + */ +void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, + const u32 *insn_to_jit_off) +{ + u32 linfo_idx, insn_start, insn_end, nr_linfo, i; + const struct bpf_line_info *linfo; + void **jited_linfo; + + if (!prog->aux->jited_linfo) + /* Userspace did not provide linfo */ + return; + + linfo_idx = prog->aux->linfo_idx; + linfo = &prog->aux->linfo[linfo_idx]; + insn_start = linfo[0].insn_off; + insn_end = insn_start + prog->len; + + jited_linfo = &prog->aux->jited_linfo[linfo_idx]; + jited_linfo[0] = prog->bpf_func; + + nr_linfo = prog->aux->nr_linfo - linfo_idx; + + for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++) + /* The verifier ensures that linfo[i].insn_off is + * strictly increasing + */ + jited_linfo[i] = prog->bpf_func + + insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; +} + +void bpf_prog_free_linfo(struct bpf_prog *prog) +{ + bpf_prog_free_jited_linfo(prog); + kvfree(prog->aux->linfo); +} + struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { @@ -294,6 +379,26 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, return ret; } +static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta) +{ + struct bpf_line_info *linfo; + u32 i, nr_linfo; + + nr_linfo = prog->aux->nr_linfo; + if (!nr_linfo || !delta) + return; + + linfo = prog->aux->linfo; + + for (i = 0; i < nr_linfo; i++) + if (off < linfo[i].insn_off) + break; + + /* Push all off < linfo[i].insn_off by delta */ + for (; i < nr_linfo; i++) + linfo[i].insn_off += delta; +} + struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len) { @@ -349,6 +454,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, */ BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false)); + bpf_adj_linfo(prog_adj, off, insn_delta); + return prog_adj; } @@ -1591,13 +1698,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * be JITed, but falls back to the interpreter. */ if (!bpf_prog_is_dev_bound(fp->aux)) { + *err = bpf_prog_alloc_jited_linfo(fp); + if (*err) + return fp; + fp = bpf_int_jit_compile(fp); -#ifdef CONFIG_BPF_JIT_ALWAYS_ON if (!fp->jited) { + bpf_prog_free_jited_linfo(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON *err = -ENOTSUPP; return fp; - } #endif + } else { + bpf_prog_free_unused_jited_linfo(fp); + } } else { *err = bpf_prog_offload_compile(fp); if (*err) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index aa05aa38f4a8..19c88cff7880 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1215,6 +1215,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); kvfree(prog->aux->func_info); + bpf_prog_free_linfo(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -1439,7 +1440,7 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD func_info_cnt +#define BPF_PROG_LOAD_LAST_FIELD line_info_cnt static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -1560,6 +1561,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return err; free_used_maps: + bpf_prog_free_linfo(prog); kvfree(prog->aux->func_info); btf_put(prog->aux->btf); bpf_prog_kallsyms_del_subprogs(prog); @@ -2041,6 +2043,37 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) return insns; } +static int set_info_rec_size(struct bpf_prog_info *info) +{ + /* + * Ensure info.*_rec_size is the same as kernel expected size + * + * or + * + * Only allow zero *_rec_size if both _rec_size and _cnt are + * zero. In this case, the kernel will set the expected + * _rec_size back to the info. + */ + + if ((info->func_info_cnt || info->func_info_rec_size) && + info->func_info_rec_size != sizeof(struct bpf_func_info)) + return -EINVAL; + + if ((info->line_info_cnt || info->line_info_rec_size) && + info->line_info_rec_size != sizeof(struct bpf_line_info)) + return -EINVAL; + + if ((info->jited_line_info_cnt || info->jited_line_info_rec_size) && + info->jited_line_info_rec_size != sizeof(__u64)) + return -EINVAL; + + info->func_info_rec_size = sizeof(struct bpf_func_info); + info->line_info_rec_size = sizeof(struct bpf_line_info); + info->jited_line_info_rec_size = sizeof(__u64); + + return 0; +} + static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -2083,11 +2116,9 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } - if ((info.func_info_cnt || info.func_info_rec_size) && - info.func_info_rec_size != sizeof(struct bpf_func_info)) - return -EINVAL; - - info.func_info_rec_size = sizeof(struct bpf_func_info); + err = set_info_rec_size(&info); + if (err) + return err; if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; @@ -2095,6 +2126,8 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.nr_jited_ksyms = 0; info.nr_jited_func_lens = 0; info.func_info_cnt = 0; + info.line_info_cnt = 0; + info.jited_line_info_cnt = 0; goto done; } @@ -2251,6 +2284,44 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } } + ulen = info.line_info_cnt; + info.line_info_cnt = prog->aux->nr_linfo; + if (info.line_info_cnt && ulen) { + if (bpf_dump_raw_ok()) { + __u8 __user *user_linfo; + + user_linfo = u64_to_user_ptr(info.line_info); + ulen = min_t(u32, info.line_info_cnt, ulen); + if (copy_to_user(user_linfo, prog->aux->linfo, + info.line_info_rec_size * ulen)) + return -EFAULT; + } else { + info.line_info = 0; + } + } + + ulen = info.jited_line_info_cnt; + if (prog->aux->jited_linfo) + info.jited_line_info_cnt = prog->aux->nr_linfo; + else + info.jited_line_info_cnt = 0; + if (info.jited_line_info_cnt && ulen) { + if (bpf_dump_raw_ok()) { + __u64 __user *user_linfo; + u32 i; + + user_linfo = u64_to_user_ptr(info.jited_line_info); + ulen = min_t(u32, info.jited_line_info_cnt, ulen); + for (i = 0; i < ulen; i++) { + if (put_user((__u64)(long)prog->aux->jited_linfo[i], + &user_linfo[i])) + return -EFAULT; + } + } else { + info.jited_line_info = 0; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2752d35ad073..9d25506bd55a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4640,15 +4640,17 @@ err_free: #define MIN_BPF_FUNCINFO_SIZE 8 #define MAX_FUNCINFO_REC_SIZE 252 -static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, - union bpf_attr *attr, union bpf_attr __user *uattr) +static int check_btf_func(struct bpf_verifier_env *env, + const union bpf_attr *attr, + union bpf_attr __user *uattr) { u32 i, nfuncs, urec_size, min_size, prev_offset; u32 krec_size = sizeof(struct bpf_func_info); - struct bpf_func_info *krecord = NULL; + struct bpf_func_info *krecord; const struct btf_type *type; + struct bpf_prog *prog; + const struct btf *btf; void __user *urecord; - struct btf *btf; int ret = 0; nfuncs = attr->func_info_cnt; @@ -4668,20 +4670,15 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, return -EINVAL; } - btf = btf_get_by_fd(attr->prog_btf_fd); - if (IS_ERR(btf)) { - verbose(env, "unable to get btf from fd\n"); - return PTR_ERR(btf); - } + prog = env->prog; + btf = prog->aux->btf; urecord = u64_to_user_ptr(attr->func_info); min_size = min_t(u32, krec_size, urec_size); krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); - if (!krecord) { - ret = -ENOMEM; - goto free_btf; - } + if (!krecord) + return -ENOMEM; for (i = 0; i < nfuncs; i++) { ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); @@ -4694,12 +4691,12 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, if (put_user(min_size, &uattr->func_info_rec_size)) ret = -EFAULT; } - goto free_btf; + goto err_free; } if (copy_from_user(&krecord[i], urecord, min_size)) { ret = -EFAULT; - goto free_btf; + goto err_free; } /* check insn_off */ @@ -4709,20 +4706,20 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, "nonzero insn_off %u for the first func info record", krecord[i].insn_off); ret = -EINVAL; - goto free_btf; + goto err_free; } } else if (krecord[i].insn_off <= prev_offset) { verbose(env, "same or smaller insn offset (%u) than previous func info record (%u)", krecord[i].insn_off, prev_offset); ret = -EINVAL; - goto free_btf; + goto err_free; } if (env->subprog_info[i].start != krecord[i].insn_off) { verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); ret = -EINVAL; - goto free_btf; + goto err_free; } /* check type_id */ @@ -4731,20 +4728,18 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, verbose(env, "invalid type id %d in func info", krecord[i].type_id); ret = -EINVAL; - goto free_btf; + goto err_free; } prev_offset = krecord[i].insn_off; urecord += urec_size; } - prog->aux->btf = btf; prog->aux->func_info = krecord; prog->aux->func_info_cnt = nfuncs; return 0; -free_btf: - btf_put(btf); +err_free: kvfree(krecord); return ret; } @@ -4760,6 +4755,150 @@ static void adjust_btf_func(struct bpf_verifier_env *env) env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start; } +#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \ + sizeof(((struct bpf_line_info *)(0))->line_col)) +#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE + +static int check_btf_line(struct bpf_verifier_env *env, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0; + struct bpf_subprog_info *sub; + struct bpf_line_info *linfo; + struct bpf_prog *prog; + const struct btf *btf; + void __user *ulinfo; + int err; + + nr_linfo = attr->line_info_cnt; + if (!nr_linfo) + return 0; + + rec_size = attr->line_info_rec_size; + if (rec_size < MIN_BPF_LINEINFO_SIZE || + rec_size > MAX_LINEINFO_REC_SIZE || + rec_size & (sizeof(u32) - 1)) + return -EINVAL; + + /* Need to zero it in case the userspace may + * pass in a smaller bpf_line_info object. + */ + linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info), + GFP_KERNEL | __GFP_NOWARN); + if (!linfo) + return -ENOMEM; + + prog = env->prog; + btf = prog->aux->btf; + + s = 0; + sub = env->subprog_info; + ulinfo = u64_to_user_ptr(attr->line_info); + expected_size = sizeof(struct bpf_line_info); + ncopy = min_t(u32, expected_size, rec_size); + for (i = 0; i < nr_linfo; i++) { + err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size); + if (err) { + if (err == -E2BIG) { + verbose(env, "nonzero tailing record in line_info"); + if (put_user(expected_size, + &uattr->line_info_rec_size)) + err = -EFAULT; + } + goto err_free; + } + + if (copy_from_user(&linfo[i], ulinfo, ncopy)) { + err = -EFAULT; + goto err_free; + } + + /* + * Check insn_off to ensure + * 1) strictly increasing AND + * 2) bounded by prog->len + * + * The linfo[0].insn_off == 0 check logically falls into + * the later "missing bpf_line_info for func..." case + * because the first linfo[0].insn_off must be the + * first sub also and the first sub must have + * subprog_info[0].start == 0. + */ + if ((i && linfo[i].insn_off <= prev_offset) || + linfo[i].insn_off >= prog->len) { + verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n", + i, linfo[i].insn_off, prev_offset, + prog->len); + err = -EINVAL; + goto err_free; + } + + if (!btf_name_offset_valid(btf, linfo[i].line_off) || + !btf_name_offset_valid(btf, linfo[i].file_name_off)) { + verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); + err = -EINVAL; + goto err_free; + } + + if (s != env->subprog_cnt) { + if (linfo[i].insn_off == sub[s].start) { + sub[s].linfo_idx = i; + s++; + } else if (sub[s].start < linfo[i].insn_off) { + verbose(env, "missing bpf_line_info for func#%u\n", s); + err = -EINVAL; + goto err_free; + } + } + + prev_offset = linfo[i].insn_off; + ulinfo += rec_size; + } + + if (s != env->subprog_cnt) { + verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n", + env->subprog_cnt - s, s); + err = -EINVAL; + goto err_free; + } + + prog->aux->linfo = linfo; + prog->aux->nr_linfo = nr_linfo; + + return 0; + +err_free: + kvfree(linfo); + return err; +} + +static int check_btf_info(struct bpf_verifier_env *env, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct btf *btf; + int err; + + if (!attr->func_info_cnt && !attr->line_info_cnt) + return 0; + + btf = btf_get_by_fd(attr->prog_btf_fd); + if (IS_ERR(btf)) + return PTR_ERR(btf); + env->prog->aux->btf = btf; + + err = check_btf_func(env, attr, uattr); + if (err) + return err; + + err = check_btf_line(env, attr, uattr); + if (err) + return err; + + return 0; +} + /* check %cur's range satisfies %old's */ static bool range_within(struct bpf_reg_state *old, struct bpf_reg_state *cur) @@ -6004,7 +6143,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) int i, j, subprog_start, subprog_end = 0, len, subprog; struct bpf_insn *insn; void *old_bpf_func; - int err = -ENOMEM; + int err; if (env->subprog_cnt <= 1) return 0; @@ -6035,6 +6174,11 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->imm = 1; } + err = bpf_prog_alloc_jited_linfo(prog); + if (err) + goto out_undo_insn; + + err = -ENOMEM; func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); if (!func) goto out_undo_insn; @@ -6065,6 +6209,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->aux->linfo = prog->aux->linfo; + func[i]->aux->nr_linfo = prog->aux->nr_linfo; + func[i]->aux->jited_linfo = prog->aux->jited_linfo; + func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -6138,6 +6286,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->bpf_func = func[0]->bpf_func; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; + bpf_prog_free_unused_jited_linfo(prog); return 0; out_free: for (i = 0; i < env->subprog_cnt; i++) @@ -6154,6 +6303,7 @@ out_undo_insn: insn->off = 0; insn->imm = env->insn_aux_data[i].call_imm; } + bpf_prog_free_jited_linfo(prog); return err; } @@ -6526,7 +6676,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; - ret = check_btf_func(env->prog, env, attr, uattr); + ret = check_btf_info(env, attr, uattr); if (ret < 0) goto skip_full_check; -- cgit From e80c1a9d5f514ce5134c6c4263a11607341466c9 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 4 Dec 2018 19:00:01 +0900 Subject: printk: fix printk_time race. Since printk_time can be toggled via /sys/module/printk/parameters/time , it is not safe to assume that output length does not change across multiple msg_print_text() calls. If we hit this race, we can observe failures such as SYSLOG_ACTION_READ_ALL writes more bytes than userspace has supplied, SYSLOG_ACTION_SIZE_UNREAD returns -EFAULT when succeeded, SYSLOG_ACTION_READ reads garbage memory or even triggers an kernel oops at _copy_to_user() due to integer overflow. To close this race, get a snapshot value of printk_time and pass it to SYSLOG_ACTION_READ, SYSLOG_ACTION_READ_ALL, SYSLOG_ACTION_SIZE_UNREAD and kmsg_dump_get_buffer(). Link: http://lkml.kernel.org/r/555af37c-b9e0-f940-cb73-a78eba2d4944@i-love.sakura.ne.jp To: Sergey Senozhatsky Cc: linux-kernel@vger.kernel.org Signed-off-by: Tetsuo Handa Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 70 ++++++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a1d88212a5d2..c7d217764db3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -404,6 +404,7 @@ DECLARE_WAIT_QUEUE_HEAD(log_wait); static u64 syslog_seq; static u32 syslog_idx; static size_t syslog_partial; +static bool syslog_time; /* index and sequence number of the first record stored in the buffer */ static u64 log_first_seq; @@ -1229,12 +1230,7 @@ module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_time(u64 ts, char *buf) { - unsigned long rem_nsec; - - if (!printk_time) - return 0; - - rem_nsec = do_div(ts, 1000000000); + unsigned long rem_nsec = do_div(ts, 1000000000); if (!buf) return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts); @@ -1243,7 +1239,8 @@ static size_t print_time(u64 ts, char *buf) (unsigned long)ts, rem_nsec / 1000); } -static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf) +static size_t print_prefix(const struct printk_log *msg, bool syslog, + bool time, char *buf) { size_t len = 0; unsigned int prefix = (msg->facility << 3) | msg->level; @@ -1262,11 +1259,13 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf) } } - len += print_time(msg->ts_nsec, buf ? buf + len : NULL); + if (time) + len += print_time(msg->ts_nsec, buf ? buf + len : NULL); return len; } -static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) +static size_t msg_print_text(const struct printk_log *msg, bool syslog, + bool time, char *buf, size_t size) { const char *text = log_text(msg); size_t text_size = msg->text_len; @@ -1285,17 +1284,17 @@ static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *bu } if (buf) { - if (print_prefix(msg, syslog, NULL) + + if (print_prefix(msg, syslog, time, NULL) + text_len + 1 >= size - len) break; - len += print_prefix(msg, syslog, buf + len); + len += print_prefix(msg, syslog, time, buf + len); memcpy(buf + len, text, text_len); len += text_len; buf[len++] = '\n'; } else { /* SYSLOG_ACTION_* buffer size only calculation */ - len += print_prefix(msg, syslog, NULL); + len += print_prefix(msg, syslog, time, NULL); len += text_len; len++; } @@ -1332,9 +1331,17 @@ static int syslog_print(char __user *buf, int size) break; } + /* + * To keep reading/counting partial line consistent, + * use printk_time value as of the beginning of a line. + */ + if (!syslog_partial) + syslog_time = printk_time; + skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX); + n = msg_print_text(msg, true, syslog_time, text, + LOG_LINE_MAX + PREFIX_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); @@ -1374,11 +1381,13 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 next_seq; u64 seq; u32 idx; + bool time; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; + time = printk_time; logbuf_lock_irq(); /* * Find first record that fits, including all following records, @@ -1389,7 +1398,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - len += msg_print_text(msg, true, NULL, 0); + len += msg_print_text(msg, true, time, NULL, 0); idx = log_next(idx); seq++; } @@ -1400,7 +1409,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) while (len > size && seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - len -= msg_print_text(msg, true, NULL, 0); + len -= msg_print_text(msg, true, time, NULL, 0); idx = log_next(idx); seq++; } @@ -1411,14 +1420,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear) len = 0; while (len >= 0 && seq < next_seq) { struct printk_log *msg = log_from_idx(idx); - int textlen; + int textlen = msg_print_text(msg, true, time, text, + LOG_LINE_MAX + PREFIX_MAX); - textlen = msg_print_text(msg, true, text, - LOG_LINE_MAX + PREFIX_MAX); - if (textlen < 0) { - len = textlen; - break; - } idx = log_next(idx); seq++; @@ -1542,11 +1546,14 @@ int do_syslog(int type, char __user *buf, int len, int source) } else { u64 seq = syslog_seq; u32 idx = syslog_idx; + bool time = syslog_partial ? syslog_time : printk_time; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - error += msg_print_text(msg, true, NULL, 0); + error += msg_print_text(msg, true, time, NULL, + 0); + time = printk_time; idx = log_next(idx); seq++; } @@ -2004,6 +2011,7 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 +#define printk_time false static u64 syslog_seq; static u32 syslog_idx; @@ -2027,8 +2035,8 @@ static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} -static size_t msg_print_text(const struct printk_log *msg, - bool syslog, char *buf, size_t size) { return 0; } +static size_t msg_print_text(const struct printk_log *msg, bool syslog, + bool time, char *buf, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2386,8 +2394,7 @@ skip: len += msg_print_text(msg, console_msg_format & MSG_FORMAT_SYSLOG, - text + len, - sizeof(text) - len); + printk_time, text + len, sizeof(text) - len); if (nr_ext_console_drivers) { ext_len = msg_print_ext_header(ext_text, sizeof(ext_text), @@ -3111,7 +3118,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, goto out; msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, syslog, line, size); + l = msg_print_text(msg, syslog, printk_time, line, size); dumper->cur_idx = log_next(dumper->cur_idx); dumper->cur_seq++; @@ -3182,6 +3189,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, u32 next_idx; size_t l = 0; bool ret = false; + bool time = printk_time; if (!dumper->active) goto out; @@ -3205,7 +3213,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l += msg_print_text(msg, true, NULL, 0); + l += msg_print_text(msg, true, time, NULL, 0); idx = log_next(idx); seq++; } @@ -3216,7 +3224,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, while (l > size && seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l -= msg_print_text(msg, true, NULL, 0); + l -= msg_print_text(msg, true, time, NULL, 0); idx = log_next(idx); seq++; } @@ -3229,7 +3237,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l += msg_print_text(msg, syslog, buf + l, size - l); + l += msg_print_text(msg, syslog, time, buf + l, size - l); idx = log_next(idx); seq++; } -- cgit From 7e1413edd6194a9807aa5f3ac0378b9b4b9da879 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Dec 2018 13:35:45 -0500 Subject: tracing: Consolidate trace_add/remove_event_call back to the nolock functions The trace_add/remove_event_call_nolock() functions were added to allow the tace_add/remove_event_call() code be called when the event_mutex lock was already taken. Now that all callers are done within the event_mutex, there's no reason to have two different interfaces. Remove the current wrapper trace_add/remove_event_call()s and rename the _nolock versions back to the original names. Link: http://lkml.kernel.org/r/154140866955.17322.2081425494660638846.stgit@devbox Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 2 -- kernel/trace/trace_events.c | 30 ++++-------------------------- kernel/trace/trace_events_hist.c | 6 +++--- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_uprobe.c | 4 ++-- 5 files changed, 11 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 3aa05593a53f..4130a5497d40 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -529,8 +529,6 @@ extern int trace_event_raw_init(struct trace_event_call *call); extern int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type); -extern int trace_add_event_call_nolock(struct trace_event_call *call); -extern int trace_remove_event_call_nolock(struct trace_event_call *call); extern int trace_add_event_call(struct trace_event_call *call); extern int trace_remove_event_call(struct trace_event_call *call); extern int trace_event_get_offsets(struct trace_event_call *call); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a3b157f689ee..bd0162c0467c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2305,7 +2305,8 @@ __trace_early_add_new_event(struct trace_event_call *call, struct ftrace_module_file_ops; static void __add_event_to_tracers(struct trace_event_call *call); -int trace_add_event_call_nolock(struct trace_event_call *call) +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct trace_event_call *call) { int ret; lockdep_assert_held(&event_mutex); @@ -2320,17 +2321,6 @@ int trace_add_event_call_nolock(struct trace_event_call *call) return ret; } -/* Add an additional event_call dynamically */ -int trace_add_event_call(struct trace_event_call *call) -{ - int ret; - - mutex_lock(&event_mutex); - ret = trace_add_event_call_nolock(call); - mutex_unlock(&event_mutex); - return ret; -} - /* * Must be called under locking of trace_types_lock, event_mutex and * trace_event_sem. @@ -2376,8 +2366,8 @@ static int probe_remove_event_call(struct trace_event_call *call) return 0; } -/* no event_mutex version */ -int trace_remove_event_call_nolock(struct trace_event_call *call) +/* Remove an event_call */ +int trace_remove_event_call(struct trace_event_call *call) { int ret; @@ -2392,18 +2382,6 @@ int trace_remove_event_call_nolock(struct trace_event_call *call) return ret; } -/* Remove an event_call */ -int trace_remove_event_call(struct trace_event_call *call) -{ - int ret; - - mutex_lock(&event_mutex); - ret = trace_remove_event_call_nolock(call); - mutex_unlock(&event_mutex); - - return ret; -} - #define for_each_event(event, start, end) \ for (event = start; \ (unsigned long)event < (unsigned long)end; \ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 21e4954375a1..82e72c48a5a9 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -960,7 +960,7 @@ static int register_synth_event(struct synth_event *event) call->data = event; call->tp = event->tp; - ret = trace_add_event_call_nolock(call); + ret = trace_add_event_call(call); if (ret) { pr_warn("Failed to register synthetic event: %s\n", trace_event_name(call)); @@ -969,7 +969,7 @@ static int register_synth_event(struct synth_event *event) ret = set_synth_event_print_fmt(call); if (ret < 0) { - trace_remove_event_call_nolock(call); + trace_remove_event_call(call); goto err; } out: @@ -984,7 +984,7 @@ static int unregister_synth_event(struct synth_event *event) struct trace_event_call *call = &event->call; int ret; - ret = trace_remove_event_call_nolock(call); + ret = trace_remove_event_call(call); return ret; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index bdf8c2ad5152..0e0f7b8024fb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1353,7 +1353,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - ret = trace_add_event_call_nolock(call); + ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", trace_event_name(call)); @@ -1368,7 +1368,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) int ret; /* tp->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call_nolock(&tk->tp.call); + ret = trace_remove_event_call(&tk->tp.call); if (!ret) kfree(tk->tp.call.print_fmt); return ret; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4a7b21c891f3..e335576b9411 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1320,7 +1320,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } - ret = trace_add_event_call_nolock(call); + ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register uprobe event: %s\n", @@ -1337,7 +1337,7 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) int ret; /* tu->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call_nolock(&tu->tp.call); + ret = trace_remove_event_call(&tu->tp.call); if (ret) return ret; kfree(tu->tp.call.print_fmt); -- cgit From 1ce25e9f6fff7633955e8ce776e5e9d7a780d34b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Nov 2018 18:04:57 +0900 Subject: tracing: Add generic event-name based remove event method Add a generic method to remove event from dynamic event list. This is same as other system under ftrace. You just need to pass the event name with '!', e.g. # echo p:new_grp/new_event _do_fork > dynamic_events This creates an event, and # echo '!p:new_grp/new_event _do_fork' > dynamic_events Or, # echo '!p:new_grp/new_event' > dynamic_events will remove new_grp/new_event event. Note that this doesn't check the event prefix (e.g. "p:") strictly, because the "group/event" name must be unique. Link: http://lkml.kernel.org/r/154140869774.17322.8887303560398645347.stgit@devbox Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index f17a887abb66..dd1f43588d70 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -37,10 +37,17 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) char *system = NULL, *event, *p; int ret = -ENOENT; - if (argv[0][1] != ':') - return -EINVAL; + if (argv[0][0] == '-') { + if (argv[0][1] != ':') + return -EINVAL; + event = &argv[0][2]; + } else { + event = strchr(argv[0], ':'); + if (!event) + return -EINVAL; + event++; + } - event = &argv[0][2]; p = strchr(event, '/'); if (p) { system = event; @@ -69,7 +76,7 @@ static int create_dyn_event(int argc, char **argv) struct dyn_event_operations *ops; int ret; - if (argv[0][0] == '-') + if (argv[0][0] == '-' || argv[0][0] == '!') return dyn_event_release(argc, argv, NULL); mutex_lock(&dyn_event_ops_mutex); -- cgit From a0572f687fb3c46e15554f4789797a077cc393b4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 5 Dec 2018 12:48:53 -0500 Subject: ftrace: Allow ftrace_replace_code() to be schedulable The function ftrace_replace_code() is the ftrace engine that does the work to modify all the nops into the calls to the function callback in all the functions being traced. The generic version which is normally called from stop machine, but an architecture can implement a non stop machine version and still use the generic ftrace_replace_code(). When an architecture does this, ftrace_replace_code() may be called from a schedulable context, where it can allow the code to be preemptible, and schedule out. In order to allow an architecture to make ftrace_replace_code() schedulable, a new command flag is added called: FTRACE_MAY_SLEEP Which can be or'd to the command that is passed to ftrace_modify_all_code() that calls ftrace_replace_code() and will have it call cond_resched() in the loop that modifies the nops into the calls to the ftrace trampolines. Link: http://lkml.kernel.org/r/20181204192903.8193-1-anders.roxell@linaro.org Link: http://lkml.kernel.org/r/20181205183303.828422192@goodmis.org Reported-by: Anders Roxell Tested-by: Anders Roxell Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 98e141c71ad0..13485a19e964 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -389,6 +389,7 @@ enum { FTRACE_UPDATE_TRACE_FUNC = (1 << 2), FTRACE_START_FUNC_RET = (1 << 3), FTRACE_STOP_FUNC_RET = (1 << 4), + FTRACE_MAY_SLEEP = (1 << 5), }; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8ef9fc226037..ab3e8b995e12 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -77,6 +77,11 @@ #define ASSIGN_OPS_HASH(opsname, val) #endif +enum { + FTRACE_MODIFY_ENABLE_FL = (1 << 0), + FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), +}; + struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, @@ -2389,10 +2394,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return -1; /* unknow ftrace bug */ } -void __weak ftrace_replace_code(int enable) +void __weak ftrace_replace_code(int mod_flags) { struct dyn_ftrace *rec; struct ftrace_page *pg; + int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; + int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL; int failed; if (unlikely(ftrace_disabled)) @@ -2409,6 +2416,8 @@ void __weak ftrace_replace_code(int enable) /* Stop processing */ return; } + if (schedulable) + cond_resched(); } while_for_each_ftrace_rec(); } @@ -2522,8 +2531,12 @@ int __weak ftrace_arch_code_modify_post_process(void) void ftrace_modify_all_code(int command) { int update = command & FTRACE_UPDATE_TRACE_FUNC; + int mod_flags = 0; int err = 0; + if (command & FTRACE_MAY_SLEEP) + mod_flags = FTRACE_MODIFY_MAY_SLEEP_FL; + /* * If the ftrace_caller calls a ftrace_ops func directly, * we need to make sure that it only traces functions it @@ -2541,9 +2554,9 @@ void ftrace_modify_all_code(int command) } if (command & FTRACE_UPDATE_CALLS) - ftrace_replace_code(1); + ftrace_replace_code(mod_flags | FTRACE_MODIFY_ENABLE_FL); else if (command & FTRACE_DISABLE_CALLS) - ftrace_replace_code(0); + ftrace_replace_code(mod_flags); if (update && ftrace_trace_function != ftrace_ops_list_func) { function_trace_op = set_function_trace_op; -- cgit From 45fe439bc3699851802062c05acf4cbac4d672bc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 7 Dec 2018 12:25:52 -0500 Subject: fgraph: Add comment to describe ftrace_graph_get_ret_stack The ret_stack should not be accessed directly via the curr_ret_stack variable on the task_struct. This is because the ret_stack is going to be converted into a series of longs and not an array of ret_stack structures. The way that a ret_stack should be retrieved is via the ftrace_graph_get_ret_stack structure, but it needs to be documented on how to use it. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index a3704ec8b599..d4f04f0ca646 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -232,6 +232,17 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/** + * ftrace_graph_get_ret_stack - return the entry of the shadow stack + * @task: The task to read the shadow stack from + * @idx: Index down the shadow stack + * + * Return the ret_struct on the shadow stack of the @task at the + * call graph at @idx starting with zero. If @idx is zero, it + * will return the last saved ret_stack entry. If it is greater than + * zero, it will return the corresponding ret_stack for the depth + * of saved return addresses. + */ struct ftrace_ret_stack * ftrace_graph_get_ret_stack(struct task_struct *task, int idx) { -- cgit From e434b8cdf788568ba65a0a0fd9f3cb41f3ca1803 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 7 Dec 2018 12:16:18 -0500 Subject: bpf: relax verifier restriction on BPF_MOV | BPF_ALU Currently, the destination register is marked as unknown for 32-bit sub-register move (BPF_MOV | BPF_ALU) whenever the source register type is SCALAR_VALUE. This is too conservative that some valid cases will be rejected. Especially, this may turn a constant scalar value into unknown value that could break some assumptions of verifier. For example, test_l4lb_noinline.c has the following C code: struct real_definition *dst 1: if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6)) 2: return TC_ACT_SHOT; 3: 4: if (dst->flags & F_IPV6) { get_packet_dst is responsible for initializing "dst" into valid pointer and return true (1), otherwise return false (0). The compiled instruction sequence using alu32 will be: 412: (54) (u32) r7 &= (u32) 1 413: (bc) (u32) r0 = (u32) r7 414: (95) exit insn 413, a BPF_MOV | BPF_ALU, however will turn r0 into unknown value even r7 contains SCALAR_VALUE 1. This causes trouble when verifier is walking the code path that hasn't initialized "dst" inside get_packet_dst, for which case 0 is returned and we would then expect verifier concluding line 1 in the above C code pass the "if" check, therefore would skip fall through path starting at line 4. Now, because r0 returned from callee has became unknown value, so verifier won't skip analyzing path starting at line 4 and "dst->flags" requires dereferencing the pointer "dst" which actually hasn't be initialized for this path. This patch relaxed the code marking sub-register move destination. For a SCALAR_VALUE, it is safe to just copy the value from source then truncate it into 32-bit. A unit test also included to demonstrate this issue. This test will fail before this patch. This relaxation could let verifier skipping more paths for conditional comparison against immediate. It also let verifier recording a more accurate/strict value for one register at one state, if this state end up with going through exit without rejection and it is used for state comparison later, then it is possible an inaccurate/permissive value is better. So the real impact on verifier processed insn number is complex. But in all, without this fix, valid program could be rejected. >From real benchmarking on kernel selftests and Cilium bpf tests, there is no impact on processed instruction number when tests ares compiled with default compilation options. There is slightly improvements when they are compiled with -mattr=+alu32 after this patch. Also, test_xdp_noinline/-mattr=+alu32 now passed verification. It is rejected before this fix. Insn processed before/after this patch: default -mattr=+alu32 Kernel selftest === test_xdp.o 371/371 369/369 test_l4lb.o 6345/6345 5623/5623 test_xdp_noinline.o 2971/2971 rejected/2727 test_tcp_estates.o 429/429 430/430 Cilium bpf === bpf_lb-DLB_L3.o: 2085/2085 1685/1687 bpf_lb-DLB_L4.o: 2287/2287 1986/1982 bpf_lb-DUNKNOWN.o: 690/690 622/622 bpf_lxc.o: 95033/95033 N/A bpf_netdev.o: 7245/7245 N/A bpf_overlay.o: 2898/2898 3085/2947 NOTE: - bpf_lxc.o and bpf_netdev.o compiled by -mattr=+alu32 are rejected by verifier due to another issue inside verifier on supporting alu32 binary. - Each cilium bpf program could generate several processed insn number, above number is sum of them. v1->v2: - Restrict the change on SCALAR_VALUE. - Update benchmark numbers on Cilium bpf tests. Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 16 ++++++++++++---- tools/testing/selftests/bpf/test_verifier.c | 13 +++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9d25506bd55a..2e70b813a1a7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3583,12 +3583,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (BPF_SRC(insn->code) == BPF_X) { + struct bpf_reg_state *src_reg = regs + insn->src_reg; + struct bpf_reg_state *dst_reg = regs + insn->dst_reg; + if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 * copy register state to dest reg */ - regs[insn->dst_reg] = regs[insn->src_reg]; - regs[insn->dst_reg].live |= REG_LIVE_WRITTEN; + *dst_reg = *src_reg; + dst_reg->live |= REG_LIVE_WRITTEN; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { @@ -3596,9 +3599,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; + } else if (src_reg->type == SCALAR_VALUE) { + *dst_reg = *src_reg; + dst_reg->live |= REG_LIVE_WRITTEN; + } else { + mark_reg_unknown(env, regs, + insn->dst_reg); } - mark_reg_unknown(env, regs, insn->dst_reg); - coerce_reg_to_size(®s[insn->dst_reg], 4); + coerce_reg_to_size(dst_reg, 4); } } else { /* case: R = imm diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 36ce58b4933e..957e4711c46c 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2959,6 +2959,19 @@ static struct bpf_test tests[] = { .result_unpriv = REJECT, .result = ACCEPT, }, + { + "alu32: mov u32 const", + .insns = { + BPF_MOV32_IMM(BPF_REG_7, 0), + BPF_ALU32_IMM(BPF_AND, BPF_REG_7, 1), + BPF_MOV32_REG(BPF_REG_0, BPF_REG_7), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, { "unpriv: partial copy of pointer", .insns = { -- cgit From 7a5725ddc6e18aab61f647c0996d3b7eb03ff5cb Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 10 Dec 2018 11:17:50 -0800 Subject: bpf: clean up bpf_prog_get_info_by_fd() info.nr_jited_ksyms and info.nr_jited_func_lens cannot be 0 in these two statements, so we don't need to check them. Signed-off-by: Song Liu Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 19c88cff7880..a99a23bf5910 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2209,7 +2209,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; - if (info.nr_jited_ksyms && ulen) { + if (ulen) { if (bpf_dump_raw_ok()) { unsigned long ksym_addr; u64 __user *user_ksyms; @@ -2240,7 +2240,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; - if (info.nr_jited_func_lens && ulen) { + if (ulen) { if (bpf_dump_raw_ok()) { u32 __user *user_lens; u32 func_len, i; -- cgit From 11d8b82d2222cade12caad2c125f23023777dcbc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 10 Dec 2018 14:14:08 -0800 Subject: bpf: rename *_info_cnt to nr_*_info in bpf_prog_info In uapi bpf.h, currently we have the following fields in the struct bpf_prog_info: __u32 func_info_cnt; __u32 line_info_cnt; __u32 jited_line_info_cnt; The above field names "func_info_cnt" and "line_info_cnt" also appear in union bpf_attr for program loading. The original intention is to keep the names the same between bpf_prog_info and bpf_attr so it will imply what we returned to user space will be the same as what the user space passed to the kernel. Such a naming convention in bpf_prog_info is not consistent with other fields like: __u32 nr_jited_ksyms; __u32 nr_jited_func_lens; This patch made this adjustment so in bpf_prog_info newly introduced *_info_cnt becomes nr_*_info. Acked-by: Song Liu Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 6 +++--- kernel/bpf/syscall.c | 38 +++++++++++++++++++------------------- 2 files changed, 22 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1bee1135866a..f943ed803309 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2696,11 +2696,11 @@ struct bpf_prog_info { __u32 btf_id; __u32 func_info_rec_size; __aligned_u64 func_info; - __u32 func_info_cnt; - __u32 line_info_cnt; + __u32 nr_func_info; + __u32 nr_line_info; __aligned_u64 line_info; __aligned_u64 jited_line_info; - __u32 jited_line_info_cnt; + __u32 nr_jited_line_info; __u32 line_info_rec_size; __u32 jited_line_info_rec_size; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a99a23bf5910..5745c7837621 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2055,15 +2055,15 @@ static int set_info_rec_size(struct bpf_prog_info *info) * _rec_size back to the info. */ - if ((info->func_info_cnt || info->func_info_rec_size) && + if ((info->nr_func_info || info->func_info_rec_size) && info->func_info_rec_size != sizeof(struct bpf_func_info)) return -EINVAL; - if ((info->line_info_cnt || info->line_info_rec_size) && + if ((info->nr_line_info || info->line_info_rec_size) && info->line_info_rec_size != sizeof(struct bpf_line_info)) return -EINVAL; - if ((info->jited_line_info_cnt || info->jited_line_info_rec_size) && + if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && info->jited_line_info_rec_size != sizeof(__u64)) return -EINVAL; @@ -2125,9 +2125,9 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; info.nr_jited_func_lens = 0; - info.func_info_cnt = 0; - info.line_info_cnt = 0; - info.jited_line_info_cnt = 0; + info.nr_func_info = 0; + info.nr_line_info = 0; + info.nr_jited_line_info = 0; goto done; } @@ -2268,14 +2268,14 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, if (prog->aux->btf) info.btf_id = btf_id(prog->aux->btf); - ulen = info.func_info_cnt; - info.func_info_cnt = prog->aux->func_info_cnt; - if (info.func_info_cnt && ulen) { + ulen = info.nr_func_info; + info.nr_func_info = prog->aux->func_info_cnt; + if (info.nr_func_info && ulen) { if (bpf_dump_raw_ok()) { char __user *user_finfo; user_finfo = u64_to_user_ptr(info.func_info); - ulen = min_t(u32, info.func_info_cnt, ulen); + ulen = min_t(u32, info.nr_func_info, ulen); if (copy_to_user(user_finfo, prog->aux->func_info, info.func_info_rec_size * ulen)) return -EFAULT; @@ -2284,14 +2284,14 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } } - ulen = info.line_info_cnt; - info.line_info_cnt = prog->aux->nr_linfo; - if (info.line_info_cnt && ulen) { + ulen = info.nr_line_info; + info.nr_line_info = prog->aux->nr_linfo; + if (info.nr_line_info && ulen) { if (bpf_dump_raw_ok()) { __u8 __user *user_linfo; user_linfo = u64_to_user_ptr(info.line_info); - ulen = min_t(u32, info.line_info_cnt, ulen); + ulen = min_t(u32, info.nr_line_info, ulen); if (copy_to_user(user_linfo, prog->aux->linfo, info.line_info_rec_size * ulen)) return -EFAULT; @@ -2300,18 +2300,18 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } } - ulen = info.jited_line_info_cnt; + ulen = info.nr_jited_line_info; if (prog->aux->jited_linfo) - info.jited_line_info_cnt = prog->aux->nr_linfo; + info.nr_jited_line_info = prog->aux->nr_linfo; else - info.jited_line_info_cnt = 0; - if (info.jited_line_info_cnt && ulen) { + info.nr_jited_line_info = 0; + if (info.nr_jited_line_info && ulen) { if (bpf_dump_raw_ok()) { __u64 __user *user_linfo; u32 i; user_linfo = u64_to_user_ptr(info.jited_line_info); - ulen = min_t(u32, info.jited_line_info_cnt, ulen); + ulen = min_t(u32, info.nr_jited_line_info, ulen); for (i = 0; i < ulen; i++) { if (put_user((__u64)(long)prog->aux->jited_linfo[i], &user_linfo[i])) -- cgit From 108c35a908d484df094f46a1e9d961d732737013 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 3 Dec 2018 11:29:29 +0100 Subject: sched/cpufreq: Add the SPDX tags The SPDX tags are not present in cpufreq.c and cpufreq_schedutil.c. Add them and remove the license descriptions Signed-off-by: Daniel Lezcano Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq.c | 5 +---- kernel/sched/cpufreq_schedutil.c | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 5e54cbcae673..22bd8980f32f 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Scheduler code and data structures related to cpufreq. * * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include "sched.h" diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3fffad3bc8a8..626ddd4ffa43 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * CPUFreq governor based on scheduler-provided CPU utilization data. * * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -- cgit From 9f191555ba4ba8fc82e589670e46a7f79b72a157 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 10 Dec 2018 14:00:28 +0000 Subject: dma-debug: Expose nr_total_entries in debugfs Expose nr_total_entries in debugfs, so that {num,min}_free_entries become even more meaningful to users interested in current/maximum utilisation. This becomes even more relevant once nr_total_entries may change at runtime beyond just the existing AMD GART debug code. Suggested-by: John Garry Signed-off-by: Robin Murphy Tested-by: Qian Cai Signed-off-by: Christoph Hellwig --- Documentation/DMA-API.txt | 3 +++ kernel/dma/debug.c | 7 +++++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index ac66ae2509a9..6bdb095393b0 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -723,6 +723,9 @@ dma-api/min_free_entries This read-only file can be read to get the dma-api/num_free_entries The current number of free dma_debug_entries in the allocator. +dma-api/nr_total_entries The total number of dma_debug_entries in the + allocator, both free and used. + dma-api/driver-filter You can write a name of a driver into this file to limit the debug output to requests from that particular driver. Write an empty string to diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 231ca4628062..f6a141eb9438 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -142,6 +142,7 @@ static struct dentry *show_all_errors_dent __read_mostly; static struct dentry *show_num_errors_dent __read_mostly; static struct dentry *num_free_entries_dent __read_mostly; static struct dentry *min_free_entries_dent __read_mostly; +static struct dentry *nr_total_entries_dent __read_mostly; static struct dentry *filter_dent __read_mostly; /* per-driver filter related state */ @@ -926,6 +927,12 @@ static int dma_debug_fs_init(void) if (!min_free_entries_dent) goto out_err; + nr_total_entries_dent = debugfs_create_u32("nr_total_entries", 0444, + dma_debug_dent, + &nr_total_entries); + if (!nr_total_entries_dent) + goto out_err; + filter_dent = debugfs_create_file("driver_filter", 0644, dma_debug_dent, NULL, &filter_fops); if (!filter_dent) -- cgit From f737b095c60c635db260e02fdb9f0efb9f3360c4 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 10 Dec 2018 14:00:27 +0000 Subject: dma-debug: Use pr_fmt() Use pr_fmt() to generate the "DMA-API: " prefix consistently. This results in it being added to a couple of pr_*() messages which were missing it before, and for the err_printk() calls moves it to the actual start of the message instead of somewhere in the middle. Signed-off-by: Robin Murphy Tested-by: Qian Cai Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 74 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index f6a141eb9438..29486eb9d1dc 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -17,6 +17,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "DMA-API: " fmt + #include #include #include @@ -235,7 +237,7 @@ static bool driver_filter(struct device *dev) error_count += 1; \ if (driver_filter(dev) && \ (show_all_errors || show_num_errors > 0)) { \ - WARN(1, "%s %s: " format, \ + WARN(1, pr_fmt("%s %s: ") format, \ dev ? dev_driver_string(dev) : "NULL", \ dev ? dev_name(dev) : "NULL", ## arg); \ dump_entry_trace(entry); \ @@ -520,7 +522,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln) * prematurely. */ WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, - "DMA-API: exceeded %d overlapping mappings of cacheline %pa\n", + pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"), ACTIVE_CACHELINE_MAX_OVERLAP, &cln); } @@ -615,7 +617,7 @@ void debug_dma_assert_idle(struct page *page) cln = to_cacheline_number(entry); err_printk(entry->dev, entry, - "DMA-API: cpu touching an active dma mapped cacheline [cln=%pa]\n", + "cpu touching an active dma mapped cacheline [cln=%pa]\n", &cln); } @@ -635,7 +637,7 @@ static void add_dma_entry(struct dma_debug_entry *entry) rc = active_cacheline_insert(entry); if (rc == -ENOMEM) { - pr_err("DMA-API: cacheline tracking ENOMEM, dma-debug disabled\n"); + pr_err("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; } @@ -674,7 +676,7 @@ static struct dma_debug_entry *dma_entry_alloc(void) if (list_empty(&free_entries)) { global_disable = true; spin_unlock_irqrestore(&free_entries_lock, flags); - pr_err("DMA-API: debugging out of memory - disabling\n"); + pr_err("debugging out of memory - disabling\n"); return NULL; } @@ -778,7 +780,7 @@ static int prealloc_memory(u32 num_entries) num_free_entries = num_entries; min_free_entries = num_entries; - pr_info("DMA-API: preallocated %d debug entries\n", num_entries); + pr_info("preallocated %d debug entries\n", num_entries); return 0; @@ -851,7 +853,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf, * switched off. */ if (current_driver_name[0]) - pr_info("DMA-API: switching off dma-debug driver filter\n"); + pr_info("switching off dma-debug driver filter\n"); current_driver_name[0] = 0; current_driver = NULL; goto out_unlock; @@ -869,7 +871,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf, current_driver_name[i] = 0; current_driver = NULL; - pr_info("DMA-API: enable driver filter for driver [%s]\n", + pr_info("enable driver filter for driver [%s]\n", current_driver_name); out_unlock: @@ -888,7 +890,7 @@ static int dma_debug_fs_init(void) { dma_debug_dent = debugfs_create_dir("dma-api", NULL); if (!dma_debug_dent) { - pr_err("DMA-API: can not create debugfs directory\n"); + pr_err("can not create debugfs directory\n"); return -ENOMEM; } @@ -980,7 +982,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti count = device_dma_allocations(dev, &entry); if (count == 0) break; - err_printk(dev, entry, "DMA-API: device driver has pending " + err_printk(dev, entry, "device driver has pending " "DMA allocations while released from device " "[count=%d]\n" "One of leaked entries details: " @@ -1030,14 +1032,14 @@ static int dma_debug_init(void) } if (dma_debug_fs_init() != 0) { - pr_err("DMA-API: error creating debugfs entries - disabling\n"); + pr_err("error creating debugfs entries - disabling\n"); global_disable = true; return 0; } if (prealloc_memory(nr_prealloc_entries) != 0) { - pr_err("DMA-API: debugging out of memory error - disabled\n"); + pr_err("debugging out of memory error - disabled\n"); global_disable = true; return 0; @@ -1047,7 +1049,7 @@ static int dma_debug_init(void) dma_debug_initialized = true; - pr_info("DMA-API: debugging enabled by kernel config\n"); + pr_info("debugging enabled by kernel config\n"); return 0; } core_initcall(dma_debug_init); @@ -1058,7 +1060,7 @@ static __init int dma_debug_cmdline(char *str) return -EINVAL; if (strncmp(str, "off", 3) == 0) { - pr_info("DMA-API: debugging disabled on kernel command line\n"); + pr_info("debugging disabled on kernel command line\n"); global_disable = true; } @@ -1092,11 +1094,11 @@ static void check_unmap(struct dma_debug_entry *ref) if (dma_mapping_error(ref->dev, ref->dev_addr)) { err_printk(ref->dev, NULL, - "DMA-API: device driver tries to free an " + "device driver tries to free an " "invalid DMA memory address\n"); } else { err_printk(ref->dev, NULL, - "DMA-API: device driver tries to free DMA " + "device driver tries to free DMA " "memory it has not allocated [device " "address=0x%016llx] [size=%llu bytes]\n", ref->dev_addr, ref->size); @@ -1105,7 +1107,7 @@ static void check_unmap(struct dma_debug_entry *ref) } if (ref->size != entry->size) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " + err_printk(ref->dev, entry, "device driver frees " "DMA memory with different size " "[device address=0x%016llx] [map size=%llu bytes] " "[unmap size=%llu bytes]\n", @@ -1113,7 +1115,7 @@ static void check_unmap(struct dma_debug_entry *ref) } if (ref->type != entry->type) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " + err_printk(ref->dev, entry, "device driver frees " "DMA memory with wrong function " "[device address=0x%016llx] [size=%llu bytes] " "[mapped as %s] [unmapped as %s]\n", @@ -1121,7 +1123,7 @@ static void check_unmap(struct dma_debug_entry *ref) type2name[entry->type], type2name[ref->type]); } else if ((entry->type == dma_debug_coherent) && (phys_addr(ref) != phys_addr(entry))) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " + err_printk(ref->dev, entry, "device driver frees " "DMA memory with different CPU address " "[device address=0x%016llx] [size=%llu bytes] " "[cpu alloc address=0x%016llx] " @@ -1133,7 +1135,7 @@ static void check_unmap(struct dma_debug_entry *ref) if (ref->sg_call_ents && ref->type == dma_debug_sg && ref->sg_call_ents != entry->sg_call_ents) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " + err_printk(ref->dev, entry, "device driver frees " "DMA sg list with different entry count " "[map count=%d] [unmap count=%d]\n", entry->sg_call_ents, ref->sg_call_ents); @@ -1144,7 +1146,7 @@ static void check_unmap(struct dma_debug_entry *ref) * DMA API don't handle this properly, so check for it here */ if (ref->direction != entry->direction) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " + err_printk(ref->dev, entry, "device driver frees " "DMA memory with different direction " "[device address=0x%016llx] [size=%llu bytes] " "[mapped with %s] [unmapped with %s]\n", @@ -1160,7 +1162,7 @@ static void check_unmap(struct dma_debug_entry *ref) */ if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { err_printk(ref->dev, entry, - "DMA-API: device driver failed to check map error" + "device driver failed to check map error" "[device address=0x%016llx] [size=%llu bytes] " "[mapped as %s]", ref->dev_addr, ref->size, @@ -1185,7 +1187,7 @@ static void check_for_stack(struct device *dev, return; addr = page_address(page) + offset; if (object_is_on_stack(addr)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr); + err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr); } else { /* Stack is vmalloced. */ int i; @@ -1195,7 +1197,7 @@ static void check_for_stack(struct device *dev, continue; addr = (u8 *)current->stack + i * PAGE_SIZE + offset; - err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr); + err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr); break; } } @@ -1215,7 +1217,7 @@ static void check_for_illegal_area(struct device *dev, void *addr, unsigned long { if (overlap(addr, len, _stext, _etext) || overlap(addr, len, __start_rodata, __end_rodata)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len); + err_printk(dev, NULL, "device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len); } static void check_sync(struct device *dev, @@ -1231,7 +1233,7 @@ static void check_sync(struct device *dev, entry = bucket_find_contain(&bucket, ref, &flags); if (!entry) { - err_printk(dev, NULL, "DMA-API: device driver tries " + err_printk(dev, NULL, "device driver tries " "to sync DMA memory it has not allocated " "[device address=0x%016llx] [size=%llu bytes]\n", (unsigned long long)ref->dev_addr, ref->size); @@ -1239,7 +1241,7 @@ static void check_sync(struct device *dev, } if (ref->size > entry->size) { - err_printk(dev, entry, "DMA-API: device driver syncs" + err_printk(dev, entry, "device driver syncs" " DMA memory outside allocated range " "[device address=0x%016llx] " "[allocation size=%llu bytes] " @@ -1252,7 +1254,7 @@ static void check_sync(struct device *dev, goto out; if (ref->direction != entry->direction) { - err_printk(dev, entry, "DMA-API: device driver syncs " + err_printk(dev, entry, "device driver syncs " "DMA memory with different direction " "[device address=0x%016llx] [size=%llu bytes] " "[mapped with %s] [synced with %s]\n", @@ -1263,7 +1265,7 @@ static void check_sync(struct device *dev, if (to_cpu && !(entry->direction == DMA_FROM_DEVICE) && !(ref->direction == DMA_TO_DEVICE)) - err_printk(dev, entry, "DMA-API: device driver syncs " + err_printk(dev, entry, "device driver syncs " "device read-only DMA memory for cpu " "[device address=0x%016llx] [size=%llu bytes] " "[mapped with %s] [synced with %s]\n", @@ -1273,7 +1275,7 @@ static void check_sync(struct device *dev, if (!to_cpu && !(entry->direction == DMA_TO_DEVICE) && !(ref->direction == DMA_FROM_DEVICE)) - err_printk(dev, entry, "DMA-API: device driver syncs " + err_printk(dev, entry, "device driver syncs " "device write-only DMA memory to device " "[device address=0x%016llx] [size=%llu bytes] " "[mapped with %s] [synced with %s]\n", @@ -1283,7 +1285,7 @@ static void check_sync(struct device *dev, if (ref->sg_call_ents && ref->type == dma_debug_sg && ref->sg_call_ents != entry->sg_call_ents) { - err_printk(ref->dev, entry, "DMA-API: device driver syncs " + err_printk(ref->dev, entry, "device driver syncs " "DMA sg list with different entry count " "[map count=%d] [sync count=%d]\n", entry->sg_call_ents, ref->sg_call_ents); @@ -1304,7 +1306,7 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg) * whoever generated the list forgot to check them. */ if (sg->length > max_seg) - err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n", + err_printk(dev, NULL, "mapping sg segment longer than device claims to support [len=%u] [max=%u]\n", sg->length, max_seg); /* * In some cases this could potentially be the DMA API @@ -1314,7 +1316,7 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg) start = sg_dma_address(sg); end = start + sg_dma_len(sg) - 1; if ((start ^ end) & ~boundary) - err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n", + err_printk(dev, NULL, "mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n", start, end, boundary); #endif } @@ -1326,11 +1328,11 @@ void debug_dma_map_single(struct device *dev, const void *addr, return; if (!virt_addr_valid(addr)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from invalid area [addr=%p] [len=%lu]\n", + err_printk(dev, NULL, "device driver maps memory from invalid area [addr=%p] [len=%lu]\n", addr, len); if (is_vmalloc_addr(addr)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from vmalloc area [addr=%p] [len=%lu]\n", + err_printk(dev, NULL, "device driver maps memory from vmalloc area [addr=%p] [len=%lu]\n", addr, len); } EXPORT_SYMBOL(debug_dma_map_single); @@ -1787,7 +1789,7 @@ static int __init dma_debug_driver_setup(char *str) } if (current_driver_name[0]) - pr_info("DMA-API: enable driver filter for driver [%s]\n", + pr_info("enable driver filter for driver [%s]\n", current_driver_name); -- cgit From 2b9d9ac02b9d8d32c515c82bb17401c429f160ab Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 10 Dec 2018 14:00:29 +0000 Subject: dma-debug: Dynamically expand the dma_debug_entry pool Certain drivers such as large multi-queue network adapters can use pools of mapped DMA buffers larger than the default dma_debug_entry pool of 65536 entries, with the result that merely probing such a device can cause DMA debug to disable itself during boot unless explicitly given an appropriate "dma_debug_entries=..." option. Developers trying to debug some other driver on such a system may not be immediately aware of this, and at worst it can hide bugs if they fail to realise that dma-debug has already disabled itself unexpectedly by the time their code of interest gets to run. Even once they do realise, it can be a bit of a pain to emprirically determine a suitable number of preallocated entries to configure, short of massively over-allocating. There's really no need for such a static limit, though, since we can quite easily expand the pool at runtime in those rare cases that the preallocated entries are insufficient, which is arguably the least surprising and most useful behaviour. To that end, refactor the prealloc_memory() logic a little bit to generalise it for runtime reallocations as well. Signed-off-by: Robin Murphy Tested-by: Qian Cai Signed-off-by: Christoph Hellwig --- Documentation/DMA-API.txt | 11 +++---- kernel/dma/debug.c | 79 ++++++++++++++++++++++++----------------------- 2 files changed, 46 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 6bdb095393b0..0fcb7561af1e 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -717,8 +717,8 @@ dma-api/num_errors The number in this file shows how many dma-api/min_free_entries This read-only file can be read to get the minimum number of free dma_debug_entries the allocator has ever seen. If this value goes - down to zero the code will disable itself - because it is not longer reliable. + down to zero the code will attempt to increase + nr_total_entries to compensate. dma-api/num_free_entries The current number of free dma_debug_entries in the allocator. @@ -745,10 +745,9 @@ driver filter at boot time. The debug code will only print errors for that driver afterwards. This filter can be disabled or changed later using debugfs. When the code disables itself at runtime this is most likely because it ran -out of dma_debug_entries. These entries are preallocated at boot. The number -of preallocated entries is defined per architecture. If it is too low for you -boot with 'dma_debug_entries=' to overwrite the -architectural default. +out of dma_debug_entries and was unable to allocate more on-demand. 65536 +entries are preallocated at boot - if this is too low for you boot with +'dma_debug_entries=' to overwrite the default. :: diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 29486eb9d1dc..ef7c90b7a346 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -47,6 +47,8 @@ #ifndef PREALLOC_DMA_DEBUG_ENTRIES #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) #endif +/* If the pool runs out, add this many new entries at once */ +#define DMA_DEBUG_DYNAMIC_ENTRIES 256 enum { dma_debug_single, @@ -646,6 +648,34 @@ static void add_dma_entry(struct dma_debug_entry *entry) */ } +static int dma_debug_create_entries(u32 num_entries, gfp_t gfp) +{ + struct dma_debug_entry *entry, *next_entry; + int i; + + for (i = 0; i < num_entries; ++i) { + entry = kzalloc(sizeof(*entry), gfp); + if (!entry) + goto out_err; + + list_add_tail(&entry->list, &free_entries); + } + + num_free_entries += num_entries; + nr_total_entries += num_entries; + + return 0; + +out_err: + + list_for_each_entry_safe(entry, next_entry, &free_entries, list) { + list_del(&entry->list); + kfree(entry); + } + + return -ENOMEM; +} + static struct dma_debug_entry *__dma_entry_alloc(void) { struct dma_debug_entry *entry; @@ -672,12 +702,14 @@ static struct dma_debug_entry *dma_entry_alloc(void) unsigned long flags; spin_lock_irqsave(&free_entries_lock, flags); - - if (list_empty(&free_entries)) { - global_disable = true; - spin_unlock_irqrestore(&free_entries_lock, flags); - pr_err("debugging out of memory - disabling\n"); - return NULL; + if (num_free_entries == 0) { + if (dma_debug_create_entries(DMA_DEBUG_DYNAMIC_ENTRIES, + GFP_ATOMIC)) { + global_disable = true; + spin_unlock_irqrestore(&free_entries_lock, flags); + pr_err("debugging out of memory - disabling\n"); + return NULL; + } } entry = __dma_entry_alloc(); @@ -764,36 +796,6 @@ int dma_debug_resize_entries(u32 num_entries) * 2. Preallocate a given number of dma_debug_entry structs */ -static int prealloc_memory(u32 num_entries) -{ - struct dma_debug_entry *entry, *next_entry; - int i; - - for (i = 0; i < num_entries; ++i) { - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto out_err; - - list_add_tail(&entry->list, &free_entries); - } - - num_free_entries = num_entries; - min_free_entries = num_entries; - - pr_info("preallocated %d debug entries\n", num_entries); - - return 0; - -out_err: - - list_for_each_entry_safe(entry, next_entry, &free_entries, list) { - list_del(&entry->list); - kfree(entry); - } - - return -ENOMEM; -} - static ssize_t filter_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -1038,14 +1040,15 @@ static int dma_debug_init(void) return 0; } - if (prealloc_memory(nr_prealloc_entries) != 0) { + if (dma_debug_create_entries(nr_prealloc_entries, GFP_KERNEL) != 0) { pr_err("debugging out of memory error - disabled\n"); global_disable = true; return 0; } - nr_total_entries = num_free_entries; + min_free_entries = num_free_entries; + pr_info("preallocated %d debug entries\n", nr_total_entries); dma_debug_initialized = true; -- cgit From ceb51173b2b5bd7af0d99079f6bbacdcfc58fe08 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 10 Dec 2018 14:00:30 +0000 Subject: dma-debug: Make leak-like behaviour apparent Now that we can dynamically allocate DMA debug entries to cope with drivers maintaining excessively large numbers of live mappings, a driver which *does* actually have a bug leaking mappings (and is not unloaded) will no longer trigger the "DMA-API: debugging out of memory - disabling" message until it gets to actual kernel OOM conditions, which means it could go unnoticed for a while. To that end, let's inform the user each time the pool has grown to a multiple of its initial size, which should make it apparent that they either have a leak or might want to increase the preallocation size. Signed-off-by: Robin Murphy Tested-by: Qian Cai Signed-off-by: Christoph Hellwig --- Documentation/DMA-API.txt | 6 +++++- kernel/dma/debug.c | 13 +++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 0fcb7561af1e..7a7d8a415ce8 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -747,7 +747,11 @@ driver afterwards. This filter can be disabled or changed later using debugfs. When the code disables itself at runtime this is most likely because it ran out of dma_debug_entries and was unable to allocate more on-demand. 65536 entries are preallocated at boot - if this is too low for you boot with -'dma_debug_entries=' to overwrite the default. +'dma_debug_entries=' to overwrite the default. The +code will print to the kernel log each time it has dynamically allocated +as many entries as were initially preallocated. This is to indicate that a +larger preallocation size may be appropriate, or if it happens continually +that a driver may be leaking mappings. :: diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index ef7c90b7a346..912c23f4c177 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -691,6 +691,18 @@ static struct dma_debug_entry *__dma_entry_alloc(void) return entry; } +void __dma_entry_alloc_check_leak(void) +{ + u32 tmp = nr_total_entries % nr_prealloc_entries; + + /* Shout each time we tick over some multiple of the initial pool */ + if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) { + pr_info("dma_debug_entry pool grown to %u (%u00%%)\n", + nr_total_entries, + (nr_total_entries / nr_prealloc_entries)); + } +} + /* struct dma_entry allocator * * The next two functions implement the allocator for @@ -710,6 +722,7 @@ static struct dma_debug_entry *dma_entry_alloc(void) pr_err("debugging out of memory - disabling\n"); return NULL; } + __dma_entry_alloc_check_leak(); } entry = __dma_entry_alloc(); -- cgit From 0cb0e25e421436a83ee39857923e4213b983e463 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 10 Dec 2018 14:00:32 +0000 Subject: dma/debug: Remove dma_debug_resize_entries() With the only caller now gone, we can clean up this part of dma-debug's exposed internals and make way to tweak the allocation behaviour. Signed-off-by: Robin Murphy Tested-by: Qian Cai Signed-off-by: Christoph Hellwig --- include/linux/dma-debug.h | 7 ------- kernel/dma/debug.c | 46 ---------------------------------------------- 2 files changed, 53 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 30213adbb6b9..46e6131a72b6 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -30,8 +30,6 @@ struct bus_type; extern void dma_debug_add_bus(struct bus_type *bus); -extern int dma_debug_resize_entries(u32 num_entries); - extern void debug_dma_map_single(struct device *dev, const void *addr, unsigned long len); @@ -101,11 +99,6 @@ static inline void dma_debug_add_bus(struct bus_type *bus) { } -static inline int dma_debug_resize_entries(u32 num_entries) -{ - return 0; -} - static inline void debug_dma_map_single(struct device *dev, const void *addr, unsigned long len) { diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 912c23f4c177..36a42874b05f 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -755,52 +755,6 @@ static void dma_entry_free(struct dma_debug_entry *entry) spin_unlock_irqrestore(&free_entries_lock, flags); } -int dma_debug_resize_entries(u32 num_entries) -{ - int i, delta, ret = 0; - unsigned long flags; - struct dma_debug_entry *entry; - LIST_HEAD(tmp); - - spin_lock_irqsave(&free_entries_lock, flags); - - if (nr_total_entries < num_entries) { - delta = num_entries - nr_total_entries; - - spin_unlock_irqrestore(&free_entries_lock, flags); - - for (i = 0; i < delta; i++) { - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - break; - - list_add_tail(&entry->list, &tmp); - } - - spin_lock_irqsave(&free_entries_lock, flags); - - list_splice(&tmp, &free_entries); - nr_total_entries += i; - num_free_entries += i; - } else { - delta = nr_total_entries - num_entries; - - for (i = 0; i < delta && !list_empty(&free_entries); i++) { - entry = __dma_entry_alloc(); - kfree(entry); - } - - nr_total_entries -= i; - } - - if (nr_total_entries != num_entries) - ret = 1; - - spin_unlock_irqrestore(&free_entries_lock, flags); - - return ret; -} - /* * DMA-API debugging init code * -- cgit From ad78dee0b630527bdfed809d1f5ed95c601886ae Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 10 Dec 2018 14:00:33 +0000 Subject: dma-debug: Batch dma_debug_entry allocation DMA debug entries are one of those things which aren't that useful individually - we will always want some larger quantity of them - and which we don't really need to manage the exact number of - we only care about having 'enough'. In that regard, the current behaviour of creating them one-by-one leads to a lot of unwarranted function call overhead and memory wasted on alignment padding. Now that we don't have to worry about freeing anything via dma_debug_resize_entries(), we can optimise the allocation behaviour by grabbing whole pages at once, which will save considerably on the aforementioned overheads, and probably offer a little more cache/TLB locality benefit for traversing the lists under normal operation. This should also give even less reason for an architecture-level override of the preallocation size, so make the definition unconditional - if there is still any desire to change the compile-time value for some platforms it would be better off as a Kconfig option anyway. Since freeing a whole page of entries at once becomes enough of a challenge that it's not really worth complicating dma_debug_init(), we may as well tweak the preallocation behaviour such that as long as we manage to allocate *some* pages, we can leave debugging enabled on a best-effort basis rather than otherwise wasting them. Signed-off-by: Robin Murphy Tested-by: Qian Cai Signed-off-by: Christoph Hellwig --- Documentation/DMA-API.txt | 4 +++- kernel/dma/debug.c | 50 ++++++++++++++++++++--------------------------- 2 files changed, 24 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 7a7d8a415ce8..016eb6909b8a 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -747,7 +747,9 @@ driver afterwards. This filter can be disabled or changed later using debugfs. When the code disables itself at runtime this is most likely because it ran out of dma_debug_entries and was unable to allocate more on-demand. 65536 entries are preallocated at boot - if this is too low for you boot with -'dma_debug_entries=' to overwrite the default. The +'dma_debug_entries=' to overwrite the default. Note +that the code allocates entries in batches, so the exact number of +preallocated entries may be greater than the actual number requested. The code will print to the kernel log each time it has dynamically allocated as many entries as were initially preallocated. This is to indicate that a larger preallocation size may be appropriate, or if it happens continually diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 36a42874b05f..20ab0f6c1b70 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -43,12 +43,9 @@ #define HASH_FN_SHIFT 13 #define HASH_FN_MASK (HASH_SIZE - 1) -/* allow architectures to override this if absolutely required */ -#ifndef PREALLOC_DMA_DEBUG_ENTRIES #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) -#endif /* If the pool runs out, add this many new entries at once */ -#define DMA_DEBUG_DYNAMIC_ENTRIES 256 +#define DMA_DEBUG_DYNAMIC_ENTRIES (PAGE_SIZE / sizeof(struct dma_debug_entry)) enum { dma_debug_single, @@ -648,32 +645,22 @@ static void add_dma_entry(struct dma_debug_entry *entry) */ } -static int dma_debug_create_entries(u32 num_entries, gfp_t gfp) +static int dma_debug_create_entries(gfp_t gfp) { - struct dma_debug_entry *entry, *next_entry; + struct dma_debug_entry *entry; int i; - for (i = 0; i < num_entries; ++i) { - entry = kzalloc(sizeof(*entry), gfp); - if (!entry) - goto out_err; + entry = (void *)get_zeroed_page(gfp); + if (!entry) + return -ENOMEM; - list_add_tail(&entry->list, &free_entries); - } + for (i = 0; i < DMA_DEBUG_DYNAMIC_ENTRIES; i++) + list_add_tail(&entry[i].list, &free_entries); - num_free_entries += num_entries; - nr_total_entries += num_entries; + num_free_entries += DMA_DEBUG_DYNAMIC_ENTRIES; + nr_total_entries += DMA_DEBUG_DYNAMIC_ENTRIES; return 0; - -out_err: - - list_for_each_entry_safe(entry, next_entry, &free_entries, list) { - list_del(&entry->list); - kfree(entry); - } - - return -ENOMEM; } static struct dma_debug_entry *__dma_entry_alloc(void) @@ -715,8 +702,7 @@ static struct dma_debug_entry *dma_entry_alloc(void) spin_lock_irqsave(&free_entries_lock, flags); if (num_free_entries == 0) { - if (dma_debug_create_entries(DMA_DEBUG_DYNAMIC_ENTRIES, - GFP_ATOMIC)) { + if (dma_debug_create_entries(GFP_ATOMIC)) { global_disable = true; spin_unlock_irqrestore(&free_entries_lock, flags); pr_err("debugging out of memory - disabling\n"); @@ -987,7 +973,7 @@ void dma_debug_add_bus(struct bus_type *bus) static int dma_debug_init(void) { - int i; + int i, nr_pages; /* Do not use dma_debug_initialized here, since we really want to be * called to set dma_debug_initialized @@ -1007,15 +993,21 @@ static int dma_debug_init(void) return 0; } - if (dma_debug_create_entries(nr_prealloc_entries, GFP_KERNEL) != 0) { + nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES); + for (i = 0; i < nr_pages; ++i) + dma_debug_create_entries(GFP_KERNEL); + if (num_free_entries >= nr_prealloc_entries) { + pr_info("preallocated %d debug entries\n", nr_total_entries); + } else if (num_free_entries > 0) { + pr_warn("%d debug entries requested but only %d allocated\n", + nr_prealloc_entries, nr_total_entries); + } else { pr_err("debugging out of memory error - disabled\n"); global_disable = true; return 0; } - min_free_entries = num_free_entries; - pr_info("preallocated %d debug entries\n", nr_total_entries); dma_debug_initialized = true; -- cgit From 1431a5d2cfa18d7006d9b0e7ab4548d9bb19ce55 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 6 Dec 2018 17:11:32 -0800 Subject: locking/lockdep: Declare local symbols static This patch avoids that sparse complains about a missing declaration for the lock_classes array when building with CONFIG_DEBUG_LOCKDEP=n. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Johannes Berg Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Cc: Waiman Long Cc: johannes.berg@intel.com Cc: tj@kernel.org Link: https://lkml.kernel.org/r/20181207011148.251812-9-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1efada2dd9dd..7434a00b2b2f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -138,6 +138,9 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; * get freed - this significantly simplifies the debugging code. */ unsigned long nr_lock_classes; +#ifndef CONFIG_DEBUG_LOCKDEP +static +#endif struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; static inline struct lock_class *hlock_class(struct held_lock *hlock) -- cgit From d35568bdb6ce4be3f885f8f189bbde5adc7e0160 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 6 Dec 2018 17:11:33 -0800 Subject: locking/lockdep: Inline __lockdep_init_map() Since the function __lockdep_init_map() only has one caller, inline it into its caller. This patch does not change any functionality. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Johannes Berg Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Cc: Waiman Long Cc: johannes.berg@intel.com Cc: tj@kernel.org Link: https://lkml.kernel.org/r/20181207011148.251812-10-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7434a00b2b2f..b5c8fcb6c070 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3091,7 +3091,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Initialize a lock instance's lock-class mapping info: */ -static void __lockdep_init_map(struct lockdep_map *lock, const char *name, +void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { int i; @@ -3147,12 +3147,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, raw_local_irq_restore(flags); } } - -void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) -{ - __lockdep_init_map(lock, name, key, subclass); -} EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; -- cgit From 2904d9fa45d3ce7153f1e10d78c570ecf7f19c35 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 6 Dec 2018 17:11:34 -0800 Subject: locking/lockdep: Introduce lock_class_cache_is_registered() This patch does not change any functionality but makes the lockdep_reset_lock() function easier to read. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Johannes Berg Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Cc: Waiman Long Cc: johannes.berg@intel.com Cc: tj@kernel.org Link: https://lkml.kernel.org/r/20181207011148.251812-11-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 50 +++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b5c8fcb6c070..81388d028ac7 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4201,13 +4201,33 @@ void lockdep_free_key_range(void *start, unsigned long size) */ } -void lockdep_reset_lock(struct lockdep_map *lock) +/* + * Check whether any element of the @lock->class_cache[] array refers to a + * registered lock class. The caller must hold either the graph lock or the + * RCU read lock. + */ +static bool lock_class_cache_is_registered(struct lockdep_map *lock) { struct lock_class *class; struct hlist_head *head; - unsigned long flags; int i, j; - int locked; + + for (i = 0; i < CLASSHASH_SIZE; i++) { + head = classhash_table + i; + hlist_for_each_entry_rcu(class, head, hash_entry) { + for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) + if (lock->class_cache[j] == class) + return true; + } + } + return false; +} + +void lockdep_reset_lock(struct lockdep_map *lock) +{ + struct lock_class *class; + unsigned long flags; + int j, locked; raw_local_irq_save(flags); @@ -4227,24 +4247,14 @@ void lockdep_reset_lock(struct lockdep_map *lock) * be gone. */ locked = graph_lock(); - for (i = 0; i < CLASSHASH_SIZE; i++) { - head = classhash_table + i; - hlist_for_each_entry_rcu(class, head, hash_entry) { - int match = 0; - - for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) - match |= class == lock->class_cache[j]; - - if (unlikely(match)) { - if (debug_locks_off_graph_unlock()) { - /* - * We all just reset everything, how did it match? - */ - WARN_ON(1); - } - goto out_restore; - } + if (unlikely(lock_class_cache_is_registered(lock))) { + if (debug_locks_off_graph_unlock()) { + /* + * We all just reset everything, how did it match? + */ + WARN_ON(1); } + goto out_restore; } if (locked) graph_unlock(); -- cgit From a66b6922dc6a5ece60ea9326153da3b062977a4d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 6 Dec 2018 17:11:35 -0800 Subject: locking/lockdep: Remove a superfluous INIT_LIST_HEAD() statement Initializing a list entry just before it is passed to list_add_tail_rcu() is not necessary because list_add_tail_rcu() overwrites the next and prev pointers anyway. Hence remove the INIT_LIST_HEAD() statement. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Johannes Berg Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Cc: Waiman Long Cc: johannes.berg@intel.com Cc: tj@kernel.org Link: https://lkml.kernel.org/r/20181207011148.251812-12-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 81388d028ac7..346b5a1fd062 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -792,7 +792,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) class->key = key; class->name = lock->name; class->subclass = subclass; - INIT_LIST_HEAD(&class->lock_entry); INIT_LIST_HEAD(&class->locks_before); INIT_LIST_HEAD(&class->locks_after); class->name_version = count_matching_names(class); -- cgit From 786fa29e9cb6810e21ab0d9c41a81d81d54d1d1b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 6 Dec 2018 17:11:36 -0800 Subject: locking/lockdep: Make concurrent lockdep_reset_lock() calls safe Since zap_class() removes items from the all_lock_classes list and the classhash_table, protect all zap_class() calls against concurrent data structure modifications with the graph lock. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Johannes Berg Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Cc: Waiman Long Cc: johannes.berg@intel.com Cc: tj@kernel.org Link: https://lkml.kernel.org/r/20181207011148.251812-13-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 346b5a1fd062..737d2dd3ea56 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4122,6 +4122,9 @@ void lockdep_reset(void) raw_local_irq_restore(flags); } +/* + * Remove all references to a lock class. The caller must hold the graph lock. + */ static void zap_class(struct lock_class *class) { int i; @@ -4229,6 +4232,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) int j, locked; raw_local_irq_save(flags); + locked = graph_lock(); /* * Remove all classes this lock might have: @@ -4245,7 +4249,6 @@ void lockdep_reset_lock(struct lockdep_map *lock) * Debug check: in the end all mapped classes should * be gone. */ - locked = graph_lock(); if (unlikely(lock_class_cache_is_registered(lock))) { if (debug_locks_off_graph_unlock()) { /* -- cgit From fe27b0de8dfcdf8482558ce5d25e697fe74d851e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 6 Dec 2018 17:11:37 -0800 Subject: locking/lockdep: Stop using RCU primitives to access 'all_lock_classes' Due to the previous patch all code that accesses the 'all_lock_classes' list holds the graph lock. Hence use regular list primitives instead of their RCU variants to access this list. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Johannes Berg Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Cc: Waiman Long Cc: johannes.berg@intel.com Cc: tj@kernel.org Link: https://lkml.kernel.org/r/20181207011148.251812-14-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 737d2dd3ea56..5c837a537273 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -629,7 +629,8 @@ static int static_obj(void *obj) /* * To make lock name printouts unique, we calculate a unique - * class->name_version generation counter: + * class->name_version generation counter. The caller must hold the graph + * lock. */ static int count_matching_names(struct lock_class *new_class) { @@ -639,7 +640,7 @@ static int count_matching_names(struct lock_class *new_class) if (!new_class->name) return 0; - list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) { + list_for_each_entry(class, &all_lock_classes, lock_entry) { if (new_class->key - new_class->subclass == class->key) return class->name_version; if (class->name && !strcmp(class->name, new_class->name)) @@ -803,7 +804,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) /* * Add it to the global list of classes: */ - list_add_tail_rcu(&class->lock_entry, &all_lock_classes); + list_add_tail(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); @@ -4141,7 +4142,7 @@ static void zap_class(struct lock_class *class) * Unhash the class and remove it from the all_lock_classes list: */ hlist_del_rcu(&class->hash_entry); - list_del_rcu(&class->lock_entry); + list_del(&class->lock_entry); RCU_INIT_POINTER(class->key, NULL); RCU_INIT_POINTER(class->name, NULL); -- cgit From 80eb865768703c0f85a0603762742ae1dedf21f0 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Tue, 27 Nov 2018 12:01:10 +0100 Subject: sched/fair: Clean up comment in nohz_idle_balance() Concerning the comment associated to the atomic_fetch_andnot() in nohz_idle_balance(), Vincent explains [1]: "[...] the comment is useless and can be removed [...] it was referring to a line code above the comment that was present in a previous iteration of the patchset. This line disappeared in final version but the comment has stayed." So remove the comment. Vincent also points out that the full ordering associated to the atomic_fetch_andnot() primitive could be relaxed, but this patch insists on the current more conservative/fully ordered solution: "Performance" isn't a concern, stay away from "correctness"/subtle relaxed (re)ordering if possible..., just make sure not to confuse the next reader with misleading/out-of-date comments. [1] http://lkml.kernel.org/r/CAKfTPtBjA-oCBRkO6__npQwL3+HLjzk7riCcPU1R7YdO-EpuZg@mail.gmail.com Suggested-by: Vincent Guittot Signed-off-by: Andrea Parri Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20181127110110.5533-1-andrea.parri@amarulasolutions.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ac855b2f4774..db514993565b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9533,9 +9533,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) return false; } - /* - * barrier, pairs with nohz_balance_enter_idle(), ensures ... - */ + /* could be _relaxed() */ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); if (!(flags & NOHZ_KICK_MASK)) return false; -- cgit From 765d0af19f5f388a34bf4533378f8398b72ded46 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 29 Aug 2018 15:19:11 +0200 Subject: sched/topology: Remove the ::smt_gain field from 'struct sched_domain' ::smt_gain is used to compute the capacity of CPUs of a SMT core with the constraint 1 < ::smt_gain < 2 in order to be able to compute number of CPUs per core. The field has_free_capacity of struct numa_stat, which was the last user of this computation of number of CPUs per core, has been removed by: 2d4056fafa19 ("sched/numa: Remove numa_has_capacity()") We can now remove this constraint on core capacity and use the defautl value SCHED_CAPACITY_SCALE for SMT CPUs. With this remove, SCHED_CAPACITY_SCALE becomes the maximum compute capacity of CPUs on every systems. This should help to simplify some code and remove fields like rd->max_cpu_capacity Furthermore, arch_scale_cpu_capacity() is used with a NULL sd in several other places in the code when it wants the capacity of a CPUs to scale some metrics like in pelt, deadline or schedutil. In case on SMT, the value returned is not the capacity of SMT CPUs but default SCHED_CAPACITY_SCALE. So remove it. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1535548752-4434-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 1 - kernel/sched/sched.h | 3 --- kernel/sched/topology.c | 2 -- 3 files changed, 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 6b9976180c1e..7fa0bc17cd8c 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -89,7 +89,6 @@ struct sched_domain { unsigned int newidle_idx; unsigned int wake_idx; unsigned int forkexec_idx; - unsigned int smt_gain; int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9bde60a11805..ceb896404869 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1864,9 +1864,6 @@ unsigned long arch_scale_freq_capacity(int cpu) static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) - return sd->smt_gain / sd->span_weight; - return SCHED_CAPACITY_SCALE; } #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8d7f15ba5916..7364e0b427b7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1133,7 +1133,6 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, - .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, .child = child, @@ -1164,7 +1163,6 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; - sd->smt_gain = 1178; /* ~15% */ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->imbalance_pct = 117; -- cgit From 9ebc6053814d37b9de8cc291fba28f30a729c929 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 3 Nov 2018 13:26:02 -0400 Subject: sched/core: Remove unnecessary unlikely() in push_*_task() WARN_ON() already contains an unlikely(), so it's not necessary to use WARN_ON(1). Signed-off-by: Yangtao Li Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181103172602.1917-1-tiny.windzz@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 4 +--- kernel/sched/rt.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b32bc1f7cd14..fb8b7b5d745d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2042,10 +2042,8 @@ static int push_dl_task(struct rq *rq) return 0; retry: - if (unlikely(next_task == rq->curr)) { - WARN_ON(1); + if (WARN_ON(next_task == rq->curr)) return 0; - } /* * If next_task preempts rq->curr, and rq->curr diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9aa3287ce301..e4f398ad9e73 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1813,10 +1813,8 @@ static int push_rt_task(struct rq *rq) return 0; retry: - if (unlikely(next_task == rq->curr)) { - WARN_ON(1); + if (WARN_ON(next_task == rq->curr)) return 0; - } /* * It's possible that the next_task slipped in of -- cgit From 5bd0988be12733a42a1a3d50e3e2ddfd79e57518 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:14 +0000 Subject: sched/topology: Relocate arch_scale_cpu_capacity() to the internal header By default, arch_scale_cpu_capacity() is only visible from within the kernel/sched folder. Relocate it to include/linux/sched/topology.h to make it visible to other clients needing to know about the capacity of CPUs, such as the Energy Model framework. This also shrinks the public header. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-2-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 16 ++++++++++++++++ kernel/sched/sched.h | 18 ------------------ 2 files changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7fa0bc17cd8c..c31d3a47a47c 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -201,6 +201,14 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl); # define SD_INIT_NAME(type) #endif +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + #else /* CONFIG_SMP */ struct sched_domain_attr; @@ -216,6 +224,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + #endif /* !CONFIG_SMP */ static inline int task_node(const struct task_struct *p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ceb896404869..66067152a831 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1859,24 +1859,6 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif -#ifdef CONFIG_SMP -#ifndef arch_scale_cpu_capacity -static __always_inline -unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - return SCHED_CAPACITY_SCALE; -} -#endif -#else -#ifndef arch_scale_cpu_capacity -static __always_inline -unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) -{ - return SCHED_CAPACITY_SCALE; -} -#endif -#endif - #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT -- cgit From 938e5e4b0d1502a93e787985cb95b136b40717b7 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:15 +0000 Subject: sched/cpufreq: Prepare schedutil for Energy Aware Scheduling Schedutil requests frequency by aggregating utilization signals from the scheduler (CFS, RT, DL, IRQ) and applying a 25% margin on top of them. Since Energy Aware Scheduling (EAS) needs to be able to predict the frequency requests, it needs to forecast the decisions made by the governor. In order to prepare the introduction of EAS, introduce schedutil_freq_util() to centralize the aforementioned signal aggregation and make it available to both schedutil and EAS. Since frequency selection and energy estimation still need to deal with RT and DL signals slightly differently, schedutil_freq_util() is called with a different 'type' parameter in those two contexts, and returns an aggregated utilization signal accordingly. While at it, introduce the map_util_freq() function which is designed to make schedutil's 25% margin usable easily for both sugov and EAS. As EAS will be able to predict schedutil's frequency requests more accurately than any other governor by design, it'd be sensible to make sure EAS cannot be used without schedutil. This will be done later, once EAS has actually been introduced. Suggested-by: Peter Zijlstra Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-3-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched/cpufreq.h | 6 +++++ kernel/sched/cpufreq_schedutil.c | 53 ++++++++++++++++++++++++++++------------ kernel/sched/sched.h | 30 +++++++++++++++++++++++ 3 files changed, 74 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 59667444669f..afa940cd50dc 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -20,6 +20,12 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, void (*func)(struct update_util_data *data, u64 time, unsigned int flags)); void cpufreq_remove_update_util_hook(int cpu); + +static inline unsigned long map_util_freq(unsigned long util, + unsigned long freq, unsigned long cap) +{ + return (freq + (freq >> 2)) * util / cap; +} #endif /* CONFIG_CPU_FREQ */ #endif /* _LINUX_SCHED_CPUFREQ_H */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3fffad3bc8a8..90128be27712 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,7 @@ #include "sched.h" +#include #include struct sugov_tunables { @@ -167,7 +168,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; - freq = (freq + (freq >> 2)) * util / max; + freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; @@ -197,15 +198,13 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, * based on the task model parameters and gives the minimal utilization * required to meet deadlines. */ -static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type) { - struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util, irq, max; + unsigned long dl_util, util, irq; + struct rq *rq = cpu_rq(cpu); - sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); - sg_cpu->bw_dl = cpu_bw_dl(rq); - - if (rt_rq_is_runnable(&rq->rt)) + if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) return max; /* @@ -223,21 +222,30 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * utilization (PELT windows are synchronized) we can directly add them * to obtain the CPU's actual utilization. */ - util = cpu_util_cfs(rq); + util = util_cfs; util += cpu_util_rt(rq); + dl_util = cpu_util_dl(rq); + /* - * We do not make cpu_util_dl() a permanent part of this sum because we - * want to use cpu_bw_dl() later on, but we need to check if the - * CFS+RT+DL sum is saturated (ie. no idle time) such that we select - * f_max when there is no idle time. + * For frequency selection we do not make cpu_util_dl() a permanent part + * of this sum because we want to use cpu_bw_dl() later on, but we need + * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such + * that we select f_max when there is no idle time. * * NOTE: numerical errors or stop class might cause us to not quite hit * saturation when we should -- something for later. */ - if ((util + cpu_util_dl(rq)) >= max) + if (util + dl_util >= max) return max; + /* + * OTOH, for energy computation we need the estimated running time, so + * include util_dl and ignore dl_bw. + */ + if (type == ENERGY_UTIL) + util += dl_util; + /* * There is still idle time; further improve the number by using the * irq metric. Because IRQ/steal time is hidden from the task clock we @@ -260,7 +268,22 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * bw_dl as requested freq. However, cpufreq is not yet ready for such * an interface. So, we only do the latter for now. */ - return min(max, util + sg_cpu->bw_dl); + if (type == FREQUENCY_UTIL) + util += cpu_bw_dl(rq); + + return min(max, util); +} + +static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +{ + struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util = cpu_util_cfs(rq); + unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + + sg_cpu->max = max; + sg_cpu->bw_dl = cpu_bw_dl(rq); + + return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); } /** diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 66067152a831..2eafa228aebf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2191,6 +2191,31 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +/** + * enum schedutil_type - CPU utilization type + * @FREQUENCY_UTIL: Utilization used to select frequency + * @ENERGY_UTIL: Utilization used during energy calculation + * + * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time + * need to be aggregated differently depending on the usage made of them. This + * enum is used within schedutil_freq_util() to differentiate the types of + * utilization expected by the callers, and adjust the aggregation accordingly. + */ +enum schedutil_type { + FREQUENCY_UTIL, + ENERGY_UTIL, +}; + +unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type); + +static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +{ + unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + + return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); +} + static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -2217,6 +2242,11 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } +#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +{ + return cfs; +} #endif #ifdef CONFIG_HAVE_SCHED_AVG_IRQ -- cgit From 27871f7a8a341ef5c636a337856369acf8013e4e Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:16 +0000 Subject: PM: Introduce an Energy Model management framework Several subsystems in the kernel (task scheduler and/or thermal at the time of writing) can benefit from knowing about the energy consumed by CPUs. Yet, this information can come from different sources (DT or firmware for example), in different formats, hence making it hard to exploit without a standard API. As an attempt to address this, introduce a centralized Energy Model (EM) management framework which aggregates the power values provided by drivers into a table for each performance domain in the system. The power cost tables are made available to interested clients (e.g. task scheduler or thermal) via platform-agnostic APIs. The overall design is represented by the diagram below (focused on Arm-related drivers as an example, but applicable to any architecture): +---------------+ +-----------------+ +-------------+ | Thermal (IPA) | | Scheduler (EAS) | | Other | +---------------+ +-----------------+ +-------------+ | | em_pd_energy() | | | em_cpu_get() | +-----------+ | +--------+ | | | v v v +---------------------+ | | | Energy Model | | | | Framework | | | +---------------------+ ^ ^ ^ | | | em_register_perf_domain() +----------+ | +---------+ | | | +---------------+ +---------------+ +--------------+ | cpufreq-dt | | arm_scmi | | Other | +---------------+ +---------------+ +--------------+ ^ ^ ^ | | | +--------------+ +---------------+ +--------------+ | Device Tree | | Firmware | | ? | +--------------+ +---------------+ +--------------+ Drivers (typically, but not limited to, CPUFreq drivers) can register data in the EM framework using the em_register_perf_domain() API. The calling driver must provide a callback function with a standardized signature that will be used by the EM framework to build the power cost tables of the performance domain. This design should offer a lot of flexibility to calling drivers which are free of reading information from any location and to use any technique to compute power costs. Moreover, the capacity states registered by drivers in the EM framework are not required to match real performance states of the target. This is particularly important on targets where the performance states are not known by the OS. The power cost coefficients managed by the EM framework are specified in milli-watts. Although the two potential users of those coefficients (IPA and EAS) only need relative correctness, IPA specifically needs to compare the power of CPUs with the power of other components (GPUs, for example), which are still expressed in absolute terms in their respective subsystems. Hence, specifying the power of CPUs in milli-watts should help transitioning IPA to using the EM framework without introducing new problems by keeping units comparable across sub-systems. On the longer term, the EM of other devices than CPUs could also be managed by the EM framework, which would enable to remove the absolute unit. However, this is not absolutely required as a first step, so this extension of the EM framework is left for later. On the client side, the EM framework offers APIs to access the power cost tables of a CPU (em_cpu_get()), and to estimate the energy consumed by the CPUs of a performance domain (em_pd_energy()). Clients such as the task scheduler can then use these APIs to access the shared data structures holding the Energy Model of CPUs. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-4-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- include/linux/energy_model.h | 187 ++++++++++++++++++++++++++++++++++++++++ kernel/power/Kconfig | 15 ++++ kernel/power/Makefile | 2 + kernel/power/energy_model.c | 201 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 405 insertions(+) create mode 100644 include/linux/energy_model.h create mode 100644 kernel/power/energy_model.c (limited to 'kernel') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h new file mode 100644 index 000000000000..aa027f7bcb3e --- /dev/null +++ b/include/linux/energy_model.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ENERGY_MODEL_H +#define _LINUX_ENERGY_MODEL_H +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_ENERGY_MODEL +/** + * em_cap_state - Capacity state of a performance domain + * @frequency: The CPU frequency in KHz, for consistency with CPUFreq + * @power: The power consumed by 1 CPU at this level, in milli-watts + * @cost: The cost coefficient associated with this level, used during + * energy calculation. Equal to: power * max_frequency / frequency + */ +struct em_cap_state { + unsigned long frequency; + unsigned long power; + unsigned long cost; +}; + +/** + * em_perf_domain - Performance domain + * @table: List of capacity states, in ascending order + * @nr_cap_states: Number of capacity states + * @cpus: Cpumask covering the CPUs of the domain + * + * A "performance domain" represents a group of CPUs whose performance is + * scaled together. All CPUs of a performance domain must have the same + * micro-architecture. Performance domains often have a 1-to-1 mapping with + * CPUFreq policies. + */ +struct em_perf_domain { + struct em_cap_state *table; + int nr_cap_states; + unsigned long cpus[0]; +}; + +#define EM_CPU_MAX_POWER 0xFFFF + +struct em_data_callback { + /** + * active_power() - Provide power at the next capacity state of a CPU + * @power : Active power at the capacity state in mW (modified) + * @freq : Frequency at the capacity state in kHz (modified) + * @cpu : CPU for which we do this operation + * + * active_power() must find the lowest capacity state of 'cpu' above + * 'freq' and update 'power' and 'freq' to the matching active power + * and frequency. + * + * The power is the one of a single CPU in the domain, expressed in + * milli-watts. It is expected to fit in the [0, EM_CPU_MAX_POWER] + * range. + * + * Return 0 on success. + */ + int (*active_power)(unsigned long *power, unsigned long *freq, int cpu); +}; +#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb } + +struct em_perf_domain *em_cpu_get(int cpu); +int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, + struct em_data_callback *cb); + +/** + * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain + * @pd : performance domain for which energy has to be estimated + * @max_util : highest utilization among CPUs of the domain + * @sum_util : sum of the utilization of all CPUs in the domain + * + * Return: the sum of the energy consumed by the CPUs of the domain assuming + * a capacity state satisfying the max utilization of the domain. + */ +static inline unsigned long em_pd_energy(struct em_perf_domain *pd, + unsigned long max_util, unsigned long sum_util) +{ + unsigned long freq, scale_cpu; + struct em_cap_state *cs; + int i, cpu; + + /* + * In order to predict the capacity state, map the utilization of the + * most utilized CPU of the performance domain to a requested frequency, + * like schedutil. + */ + cpu = cpumask_first(to_cpumask(pd->cpus)); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + cs = &pd->table[pd->nr_cap_states - 1]; + freq = map_util_freq(max_util, cs->frequency, scale_cpu); + + /* + * Find the lowest capacity state of the Energy Model above the + * requested frequency. + */ + for (i = 0; i < pd->nr_cap_states; i++) { + cs = &pd->table[i]; + if (cs->frequency >= freq) + break; + } + + /* + * The capacity of a CPU in the domain at that capacity state (cs) + * can be computed as: + * + * cs->freq * scale_cpu + * cs->cap = -------------------- (1) + * cpu_max_freq + * + * So, ignoring the costs of idle states (which are not available in + * the EM), the energy consumed by this CPU at that capacity state is + * estimated as: + * + * cs->power * cpu_util + * cpu_nrg = -------------------- (2) + * cs->cap + * + * since 'cpu_util / cs->cap' represents its percentage of busy time. + * + * NOTE: Although the result of this computation actually is in + * units of power, it can be manipulated as an energy value + * over a scheduling period, since it is assumed to be + * constant during that interval. + * + * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product + * of two terms: + * + * cs->power * cpu_max_freq cpu_util + * cpu_nrg = ------------------------ * --------- (3) + * cs->freq scale_cpu + * + * The first term is static, and is stored in the em_cap_state struct + * as 'cs->cost'. + * + * Since all CPUs of the domain have the same micro-architecture, they + * share the same 'cs->cost', and the same CPU capacity. Hence, the + * total energy of the domain (which is the simple sum of the energy of + * all of its CPUs) can be factorized as: + * + * cs->cost * \Sum cpu_util + * pd_nrg = ------------------------ (4) + * scale_cpu + */ + return cs->cost * sum_util / scale_cpu; +} + +/** + * em_pd_nr_cap_states() - Get the number of capacity states of a perf. domain + * @pd : performance domain for which this must be done + * + * Return: the number of capacity states in the performance domain table + */ +static inline int em_pd_nr_cap_states(struct em_perf_domain *pd) +{ + return pd->nr_cap_states; +} + +#else +struct em_perf_domain {}; +struct em_data_callback {}; +#define EM_DATA_CB(_active_power_cb) { } + +static inline int em_register_perf_domain(cpumask_t *span, + unsigned int nr_states, struct em_data_callback *cb) +{ + return -EINVAL; +} +static inline struct em_perf_domain *em_cpu_get(int cpu) +{ + return NULL; +} +static inline unsigned long em_pd_energy(struct em_perf_domain *pd, + unsigned long max_util, unsigned long sum_util) +{ + return 0; +} +static inline int em_pd_nr_cap_states(struct em_perf_domain *pd) +{ + return 0; +} +#endif + +#endif diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 3a6c2f87699e..f8fe57d1022e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -298,3 +298,18 @@ config PM_GENERIC_DOMAINS_OF config CPU_PM bool + +config ENERGY_MODEL + bool "Energy Model for CPUs" + depends on SMP + depends on CPU_FREQ + default n + help + Several subsystems (thermal and/or the task scheduler for example) + can leverage information about the energy consumed by CPUs to make + smarter decisions. This config option enables the framework from + which subsystems can access the energy models. + + The exact usage of the energy model is subsystem-dependent. + + If in doubt, say N. diff --git a/kernel/power/Makefile b/kernel/power/Makefile index a3f79f0eef36..e7e47d9be1e5 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -15,3 +15,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o + +obj-$(CONFIG_ENERGY_MODEL) += energy_model.o diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c new file mode 100644 index 000000000000..d9dc2c38764a --- /dev/null +++ b/kernel/power/energy_model.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Energy Model of CPUs + * + * Copyright (c) 2018, Arm ltd. + * Written by: Quentin Perret, Arm ltd. + */ + +#define pr_fmt(fmt) "energy_model: " fmt + +#include +#include +#include +#include +#include + +/* Mapping of each CPU to the performance domain to which it belongs. */ +static DEFINE_PER_CPU(struct em_perf_domain *, em_data); + +/* + * Mutex serializing the registrations of performance domains and letting + * callbacks defined by drivers sleep. + */ +static DEFINE_MUTEX(em_pd_mutex); + +static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, + struct em_data_callback *cb) +{ + unsigned long opp_eff, prev_opp_eff = ULONG_MAX; + unsigned long power, freq, prev_freq = 0; + int i, ret, cpu = cpumask_first(span); + struct em_cap_state *table; + struct em_perf_domain *pd; + u64 fmax; + + if (!cb->active_power) + return NULL; + + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); + if (!pd) + return NULL; + + table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); + if (!table) + goto free_pd; + + /* Build the list of capacity states for this performance domain */ + for (i = 0, freq = 0; i < nr_states; i++, freq++) { + /* + * active_power() is a driver callback which ceils 'freq' to + * lowest capacity state of 'cpu' above 'freq' and updates + * 'power' and 'freq' accordingly. + */ + ret = cb->active_power(&power, &freq, cpu); + if (ret) { + pr_err("pd%d: invalid cap. state: %d\n", cpu, ret); + goto free_cs_table; + } + + /* + * We expect the driver callback to increase the frequency for + * higher capacity states. + */ + if (freq <= prev_freq) { + pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq); + goto free_cs_table; + } + + /* + * The power returned by active_state() is expected to be + * positive, in milli-watts and to fit into 16 bits. + */ + if (!power || power > EM_CPU_MAX_POWER) { + pr_err("pd%d: invalid power: %lu\n", cpu, power); + goto free_cs_table; + } + + table[i].power = power; + table[i].frequency = prev_freq = freq; + + /* + * The hertz/watts efficiency ratio should decrease as the + * frequency grows on sane platforms. But this isn't always + * true in practice so warn the user if a higher OPP is more + * power efficient than a lower one. + */ + opp_eff = freq / power; + if (opp_eff >= prev_opp_eff) + pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n", + cpu, i, i - 1); + prev_opp_eff = opp_eff; + } + + /* Compute the cost of each capacity_state. */ + fmax = (u64) table[nr_states - 1].frequency; + for (i = 0; i < nr_states; i++) { + table[i].cost = div64_u64(fmax * table[i].power, + table[i].frequency); + } + + pd->table = table; + pd->nr_cap_states = nr_states; + cpumask_copy(to_cpumask(pd->cpus), span); + + return pd; + +free_cs_table: + kfree(table); +free_pd: + kfree(pd); + + return NULL; +} + +/** + * em_cpu_get() - Return the performance domain for a CPU + * @cpu : CPU to find the performance domain for + * + * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't + * exist. + */ +struct em_perf_domain *em_cpu_get(int cpu) +{ + return READ_ONCE(per_cpu(em_data, cpu)); +} +EXPORT_SYMBOL_GPL(em_cpu_get); + +/** + * em_register_perf_domain() - Register the Energy Model of a performance domain + * @span : Mask of CPUs in the performance domain + * @nr_states : Number of capacity states to register + * @cb : Callback functions providing the data of the Energy Model + * + * Create Energy Model tables for a performance domain using the callbacks + * defined in cb. + * + * If multiple clients register the same performance domain, all but the first + * registration will be ignored. + * + * Return 0 on success + */ +int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, + struct em_data_callback *cb) +{ + unsigned long cap, prev_cap = 0; + struct em_perf_domain *pd; + int cpu, ret = 0; + + if (!span || !nr_states || !cb) + return -EINVAL; + + /* + * Use a mutex to serialize the registration of performance domains and + * let the driver-defined callback functions sleep. + */ + mutex_lock(&em_pd_mutex); + + for_each_cpu(cpu, span) { + /* Make sure we don't register again an existing domain. */ + if (READ_ONCE(per_cpu(em_data, cpu))) { + ret = -EEXIST; + goto unlock; + } + + /* + * All CPUs of a domain must have the same micro-architecture + * since they all share the same table. + */ + cap = arch_scale_cpu_capacity(NULL, cpu); + if (prev_cap && prev_cap != cap) { + pr_err("CPUs of %*pbl must have the same capacity\n", + cpumask_pr_args(span)); + ret = -EINVAL; + goto unlock; + } + prev_cap = cap; + } + + /* Create the performance domain and add it to the Energy Model. */ + pd = em_create_pd(span, nr_states, cb); + if (!pd) { + ret = -EINVAL; + goto unlock; + } + + for_each_cpu(cpu, span) { + /* + * The per-cpu array can be read concurrently from em_cpu_get(). + * The barrier enforces the ordering needed to make sure readers + * can only access well formed em_perf_domain structs. + */ + smp_store_release(per_cpu_ptr(&em_data, cpu), pd); + } + + pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span)); +unlock: + mutex_unlock(&em_pd_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(em_register_perf_domain); -- cgit From 6aa140fa4508933a6ac6717d65a403eb904d6c02 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:18 +0000 Subject: sched/topology: Reference the Energy Model of CPUs when available The existing scheduling domain hierarchy is defined to map to the cache topology of the system. However, Energy Aware Scheduling (EAS) requires more knowledge about the platform, and specifically needs to know about the span of Performance Domains (PD), which do not always align with caches. To address this issue, use the Energy Model (EM) of the system to extend the scheduler topology code with a representation of the PDs, alongside the scheduling domains. More specifically, a linked list of PDs is attached to each root domain. When multiple root domains are in use, each list contains only the PDs covering the CPUs of its root domain. If a PD spans over CPUs of multiple different root domains, it will be duplicated in all lists. The lists are fully maintained by the scheduler from partition_sched_domains() in order to cope with hotplug and cpuset changes. As for scheduling domains, the list are protected by RCU to ensure safe concurrent updates. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-6-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 21 ++++++++ kernel/sched/topology.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 151 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2eafa228aebf..808a565187b1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -709,6 +710,12 @@ static inline bool sched_asym_prefer(int a, int b) return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); } +struct perf_domain { + struct em_perf_domain *em_pd; + struct perf_domain *next; + struct rcu_head rcu; +}; + /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -761,6 +768,12 @@ struct root_domain { struct cpupri cpupri; unsigned long max_cpu_capacity; + + /* + * NULL-terminated list of performance domains intersecting with the + * CPUs of the rd. Protected by RCU. + */ + struct perf_domain *pd; }; extern struct root_domain def_root_domain; @@ -2276,3 +2289,11 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned return util; } #endif + +#ifdef CONFIG_SMP +#ifdef CONFIG_ENERGY_MODEL +#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) +#else +#define perf_domain_span(pd) NULL +#endif +#endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 7364e0b427b7..169d25cafab5 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -201,6 +201,116 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } +#ifdef CONFIG_ENERGY_MODEL +static void free_pd(struct perf_domain *pd) +{ + struct perf_domain *tmp; + + while (pd) { + tmp = pd->next; + kfree(pd); + pd = tmp; + } +} + +static struct perf_domain *find_pd(struct perf_domain *pd, int cpu) +{ + while (pd) { + if (cpumask_test_cpu(cpu, perf_domain_span(pd))) + return pd; + pd = pd->next; + } + + return NULL; +} + +static struct perf_domain *pd_init(int cpu) +{ + struct em_perf_domain *obj = em_cpu_get(cpu); + struct perf_domain *pd; + + if (!obj) { + if (sched_debug()) + pr_info("%s: no EM found for CPU%d\n", __func__, cpu); + return NULL; + } + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return NULL; + pd->em_pd = obj; + + return pd; +} + +static void perf_domain_debug(const struct cpumask *cpu_map, + struct perf_domain *pd) +{ + if (!sched_debug() || !pd) + return; + + printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map)); + + while (pd) { + printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }", + cpumask_first(perf_domain_span(pd)), + cpumask_pr_args(perf_domain_span(pd)), + em_pd_nr_cap_states(pd->em_pd)); + pd = pd->next; + } + + printk(KERN_CONT "\n"); +} + +static void destroy_perf_domain_rcu(struct rcu_head *rp) +{ + struct perf_domain *pd; + + pd = container_of(rp, struct perf_domain, rcu); + free_pd(pd); +} + +static void build_perf_domains(const struct cpumask *cpu_map) +{ + struct perf_domain *pd = NULL, *tmp; + int cpu = cpumask_first(cpu_map); + struct root_domain *rd = cpu_rq(cpu)->rd; + int i; + + for_each_cpu(i, cpu_map) { + /* Skip already covered CPUs. */ + if (find_pd(pd, i)) + continue; + + /* Create the new pd and add it to the local list. */ + tmp = pd_init(i); + if (!tmp) + goto free; + tmp->next = pd; + pd = tmp; + } + + perf_domain_debug(cpu_map, pd); + + /* Attach the new list of performance domains to the root domain. */ + tmp = rd->pd; + rcu_assign_pointer(rd->pd, pd); + if (tmp) + call_rcu(&tmp->rcu, destroy_perf_domain_rcu); + + return; + +free: + free_pd(pd); + tmp = rd->pd; + rcu_assign_pointer(rd->pd, NULL); + if (tmp) + call_rcu(&tmp->rcu, destroy_perf_domain_rcu); +} +#else +static void free_pd(struct perf_domain *pd) { } +#endif /* CONFIG_ENERGY_MODEL */ + static void free_rootdomain(struct rcu_head *rcu) { struct root_domain *rd = container_of(rcu, struct root_domain, rcu); @@ -211,6 +321,7 @@ static void free_rootdomain(struct rcu_head *rcu) free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); + free_pd(rd->pd); kfree(rd); } @@ -1959,8 +2070,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], /* Destroy deleted domains: */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_cur[i], doms_new[j]) - && dattrs_equal(dattr_cur, i, dattr_new, j)) + if (cpumask_equal(doms_cur[i], doms_new[j]) && + dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* No match - a current sched domain not in new doms_new[] */ @@ -1980,8 +2091,8 @@ match1: /* Build new domains: */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_new[i], doms_cur[j]) - && dattrs_equal(dattr_new, i, dattr_cur, j)) + if (cpumask_equal(doms_new[i], doms_cur[j]) && + dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* No match - add a new doms_new */ @@ -1990,6 +2101,21 @@ match2: ; } +#ifdef CONFIG_ENERGY_MODEL + /* Build perf. domains: */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < n; j++) { + if (cpumask_equal(doms_new[i], doms_cur[j]) && + cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) + goto match3; + } + /* No match - add perf. domains for a new rd */ + build_perf_domains(doms_new[i]); +match3: + ; + } +#endif + /* Remember the new sched domains: */ if (doms_cur != &fallback_doms) free_sched_domains(doms_cur, ndoms_cur); -- cgit From 011b27bb5d3139e8b5fe9ceff1fc7f6dc3145071 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:19 +0000 Subject: sched/topology: Add lowest CPU asymmetry sched_domain level pointer Add another member to the family of per-cpu sched_domain shortcut pointers. This one, sd_asym_cpucapacity, points to the lowest level at which the SD_ASYM_CPUCAPACITY flag is set. While at it, rename the sd_asym shortcut to sd_asym_packing to avoid confusions. Generally speaking, the largest opportunity to save energy via scheduling comes from a smarter exploitation of heterogeneous platforms (i.e. big.LITTLE). Consequently, the sd_asym_cpucapacity shortcut will be used at first as the lowest domain where Energy-Aware Scheduling (EAS) should be applied. For example, it is possible to apply EAS within a socket on a multi-socket system, as long as each socket has an asymmetric topology. Energy-aware cross-sockets wake-up balancing will only happen when the system is over-utilized, or this_cpu and prev_cpu are in different sockets. Suggested-by: Morten Rasmussen Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-7-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 3 ++- kernel/sched/topology.c | 8 ++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fdc8356ea742..a31a6d325901 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9299,7 +9299,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym, cpu)); + sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { for_each_cpu(i, sched_domain_span(sd)) { if (i == cpu || diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 808a565187b1..75c403674706 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1303,7 +1303,8 @@ DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_asym); +DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 169d25cafab5..137ccfed9a43 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -508,7 +508,8 @@ DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); +DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) @@ -534,7 +535,10 @@ static void update_top_cache_domain(int cpu) rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); sd = highest_flag_domain(cpu, SD_ASYM_PACKING); - rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); + rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); + + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY); + rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); } /* -- cgit From b68a4c0dba3b1e1dda1ede49f3c2fc72d3b54567 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:20 +0000 Subject: sched/topology: Disable EAS on inappropriate platforms Energy Aware Scheduling (EAS) in its current form is most relevant on platforms with asymmetric CPU topologies (e.g. Arm big.LITTLE) since this is where there is a lot of potential for saving energy through scheduling. This is particularly true since the Energy Model only includes the active power costs of CPUs, hence not providing enough data to compare packing-vs-spreading strategies. As such, disable EAS on root domains where the SD_ASYM_CPUCAPACITY flag is not set. While at it, disable EAS on systems where the complexity of the Energy Model is too high since that could lead to unacceptable scheduling overhead. All in all, EAS can be used on a root domain if and only if: 1. an Energy Model is available; 2. the root domain has an asymmetric CPU capacity topology; 3. the complexity of the root domain's EM is low enough to keep scheduling overheads low. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-8-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 137ccfed9a43..6ddb804b2dec 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -270,12 +270,45 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) free_pd(pd); } +/* + * EAS can be used on a root domain if it meets all the following conditions: + * 1. an Energy Model (EM) is available; + * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. + * 3. the EM complexity is low enough to keep scheduling overheads low; + * + * The complexity of the Energy Model is defined as: + * + * C = nr_pd * (nr_cpus + nr_cs) + * + * with parameters defined as: + * - nr_pd: the number of performance domains + * - nr_cpus: the number of CPUs + * - nr_cs: the sum of the number of capacity states of all performance + * domains (for example, on a system with 2 performance domains, + * with 10 capacity states each, nr_cs = 2 * 10 = 20). + * + * It is generally not a good idea to use such a model in the wake-up path on + * very complex platforms because of the associated scheduling overheads. The + * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs + * with per-CPU DVFS and less than 8 capacity states each, for example. + */ +#define EM_MAX_COMPLEXITY 2048 + static void build_perf_domains(const struct cpumask *cpu_map) { + int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; - int i; + + /* EAS is enabled for asymmetric CPU capacity topologies. */ + if (!per_cpu(sd_asym_cpucapacity, cpu)) { + if (sched_debug()) { + pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", + cpumask_pr_args(cpu_map)); + } + goto free; + } for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ @@ -288,6 +321,20 @@ static void build_perf_domains(const struct cpumask *cpu_map) goto free; tmp->next = pd; pd = tmp; + + /* + * Count performance domains and capacity states for the + * complexity check. + */ + nr_pd++; + nr_cs += em_pd_nr_cap_states(pd->em_pd); + } + + /* Bail out if the Energy Model complexity is too high. */ + if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) { + WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", + cpumask_pr_args(cpu_map)); + goto free; } perf_domain_debug(cpu_map, pd); -- cgit From 531b5c9f5cd05ead53324f419b32685a22eebe8b Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:21 +0000 Subject: sched/topology: Make Energy Aware Scheduling depend on schedutil Energy Aware Scheduling (EAS) is designed with the assumption that frequencies of CPUs follow their utilization value. When using a CPUFreq governor other than schedutil, the chances of this assumption being true are small, if any. When schedutil is being used, EAS' predictions are at least consistent with the frequency requests. Although those requests have no guarantees to be honored by the hardware, they should at least guide DVFS in the right direction and provide some hope in regards to the EAS model being accurate. To make sure EAS is only used in a sane configuration, create a strong dependency on schedutil being used. Since having sugov compiled-in does not provide that guarantee, make CPUFreq call a scheduler function on governor changes hence letting it rebuild the scheduling domains, check the governors of the online CPUs, and enable/disable EAS accordingly. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-9-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- drivers/cpufreq/cpufreq.c | 1 + include/linux/cpufreq.h | 8 ++++++++ kernel/sched/cpufreq_schedutil.c | 37 +++++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 4 +--- kernel/sched/topology.c | 28 ++++++++++++++++++++++++---- 5 files changed, 69 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7aa3dcad2175..6f23ebb395f1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2277,6 +2277,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, ret = cpufreq_start_governor(policy); if (!ret) { pr_debug("cpufreq: governor change\n"); + sched_cpufreq_governor_change(policy, old_gov); return 0; } cpufreq_exit_governor(policy); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 882a9b9e34bc..c86d6d8bdfed 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -950,6 +950,14 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) } #endif +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + struct cpufreq_governor *old_gov); +#else +static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + struct cpufreq_governor *old_gov) { } +#endif + extern void arch_freq_prepare_all(void); extern unsigned int arch_freq_get_on_cpu(int cpu); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 90128be27712..c2e53d1a3143 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -624,7 +624,7 @@ static struct kobj_type sugov_tunables_ktype = { /********************** cpufreq governor interface *********************/ -static struct cpufreq_governor schedutil_gov; +struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) { @@ -883,7 +883,7 @@ static void sugov_limits(struct cpufreq_policy *policy) sg_policy->need_freq_update = true; } -static struct cpufreq_governor schedutil_gov = { +struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, .dynamic_switching = true, @@ -906,3 +906,36 @@ static int __init sugov_register(void) return cpufreq_register_governor(&schedutil_gov); } fs_initcall(sugov_register); + +#ifdef CONFIG_ENERGY_MODEL +extern bool sched_energy_update; +extern struct mutex sched_energy_mutex; + +static void rebuild_sd_workfn(struct work_struct *work) +{ + mutex_lock(&sched_energy_mutex); + sched_energy_update = true; + rebuild_sched_domains(); + sched_energy_update = false; + mutex_unlock(&sched_energy_mutex); +} +static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); + +/* + * EAS shouldn't be attempted without sugov, so rebuild the sched_domains + * on governor changes to make sure the scheduler knows about it. + */ +void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + struct cpufreq_governor *old_gov) +{ + if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) { + /* + * When called from the cpufreq_register_driver() path, the + * cpu_hotplug_lock is already held, so use a work item to + * avoid nested locking in rebuild_sched_domains(). + */ + schedule_work(&rebuild_sd_work); + } + +} +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 75c403674706..fd84900b0b21 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2291,10 +2291,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned } #endif -#ifdef CONFIG_SMP -#ifdef CONFIG_ENERGY_MODEL +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) #else #define perf_domain_span(pd) NULL #endif -#endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6ddb804b2dec..0a5a1d3a4eae 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -201,7 +201,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } -#ifdef CONFIG_ENERGY_MODEL +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +DEFINE_MUTEX(sched_energy_mutex); +bool sched_energy_update; + static void free_pd(struct perf_domain *pd) { struct perf_domain *tmp; @@ -275,6 +278,7 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) * 1. an Energy Model (EM) is available; * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. * 3. the EM complexity is low enough to keep scheduling overheads low; + * 4. schedutil is driving the frequency of all CPUs of the rd; * * The complexity of the Energy Model is defined as: * @@ -294,12 +298,15 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) */ #define EM_MAX_COMPLEXITY 2048 +extern struct cpufreq_governor schedutil_gov; static void build_perf_domains(const struct cpumask *cpu_map) { int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; + struct cpufreq_policy *policy; + struct cpufreq_governor *gov; /* EAS is enabled for asymmetric CPU capacity topologies. */ if (!per_cpu(sd_asym_cpucapacity, cpu)) { @@ -315,6 +322,19 @@ static void build_perf_domains(const struct cpumask *cpu_map) if (find_pd(pd, i)) continue; + /* Do not attempt EAS if schedutil is not being used. */ + policy = cpufreq_cpu_get(i); + if (!policy) + goto free; + gov = policy->governor; + cpufreq_cpu_put(policy); + if (gov != &schedutil_gov) { + if (rd->pd) + pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", + cpumask_pr_args(cpu_map)); + goto free; + } + /* Create the new pd and add it to the local list. */ tmp = pd_init(i); if (!tmp) @@ -356,7 +376,7 @@ free: } #else static void free_pd(struct perf_domain *pd) { } -#endif /* CONFIG_ENERGY_MODEL */ +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/ static void free_rootdomain(struct rcu_head *rcu) { @@ -2152,10 +2172,10 @@ match2: ; } -#ifdef CONFIG_ENERGY_MODEL +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) /* Build perf. domains: */ for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < n; j++) { + for (j = 0; j < n && !sched_energy_update; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) goto match3; -- cgit From 1f74de8798c93ce14801cc4e772603e51c841c33 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:22 +0000 Subject: sched/toplogy: Introduce the 'sched_energy_present' static key In order to make sure Energy Aware Scheduling (EAS) will not impact systems where no Energy Model is available, introduce a static key guarding the access to EAS code. Since EAS is enabled on a per-root-domain basis, the static key is enabled when at least one root domain meets all conditions for EAS. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-10-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 4 ++++ kernel/sched/topology.c | 28 ++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fd84900b0b21..2b3cf356e958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2296,3 +2296,7 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned #else #define perf_domain_span(pd) NULL #endif + +#ifdef CONFIG_SMP +extern struct static_key_false sched_energy_present; +#endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 0a5a1d3a4eae..3f35ba1d8fde 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -201,6 +201,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } +DEFINE_STATIC_KEY_FALSE(sched_energy_present); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; @@ -273,6 +274,19 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) free_pd(pd); } +static void sched_energy_set(bool has_eas) +{ + if (!has_eas && static_branch_unlikely(&sched_energy_present)) { + if (sched_debug()) + pr_info("%s: stopping EAS\n", __func__); + static_branch_disable_cpuslocked(&sched_energy_present); + } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) { + if (sched_debug()) + pr_info("%s: starting EAS\n", __func__); + static_branch_enable_cpuslocked(&sched_energy_present); + } +} + /* * EAS can be used on a root domain if it meets all the following conditions: * 1. an Energy Model (EM) is available; @@ -299,7 +313,7 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) #define EM_MAX_COMPLEXITY 2048 extern struct cpufreq_governor schedutil_gov; -static void build_perf_domains(const struct cpumask *cpu_map) +static bool build_perf_domains(const struct cpumask *cpu_map) { int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; @@ -365,7 +379,7 @@ static void build_perf_domains(const struct cpumask *cpu_map) if (tmp) call_rcu(&tmp->rcu, destroy_perf_domain_rcu); - return; + return !!pd; free: free_pd(pd); @@ -373,6 +387,8 @@ free: rcu_assign_pointer(rd->pd, NULL); if (tmp) call_rcu(&tmp->rcu, destroy_perf_domain_rcu); + + return false; } #else static void free_pd(struct perf_domain *pd) { } @@ -2114,6 +2130,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { + bool __maybe_unused has_eas = false; int i, j, n; int new_topology; @@ -2177,14 +2194,17 @@ match2: for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !sched_energy_update; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && - cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) + cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) { + has_eas = true; goto match3; + } } /* No match - add perf. domains for a new rd */ - build_perf_domains(doms_new[i]); + has_eas |= build_perf_domains(doms_new[i]); match3: ; } + sched_energy_set(has_eas); #endif /* Remember the new sched domains: */ -- cgit From 630246a06ae2a7a12d1fce85f1e5681032982791 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:24 +0000 Subject: sched/fair: Clean-up update_sg_lb_stats parameters In preparation for the introduction of a new root domain flag which can be set during load balance (the 'overutilized' flag), clean-up the set of parameters passed to update_sg_lb_stats(). More specifically, the 'local_group' and 'local_idx' parameters can be removed since they can easily be reconstructed from within the function. While at it, transform the 'overload' parameter into a flag stored in the 'sg_status' parameter hence facilitating the definition of new flags when needed. Suggested-by: Peter Zijlstra Suggested-by: Valentin Schneider Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-12-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 27 +++++++++++---------------- kernel/sched/sched.h | 3 +++ 2 files changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a31a6d325901..e04f29098ec7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7905,16 +7905,16 @@ static bool update_nohz_stats(struct rq *rq, bool force) * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. - * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. - * @overload: Indicate pullable load (e.g. >1 runnable task). + * @sg_status: Holds flag indicating the status of the sched_group */ static inline void update_sg_lb_stats(struct lb_env *env, - struct sched_group *group, int load_idx, - int local_group, struct sg_lb_stats *sgs, - bool *overload) + struct sched_group *group, + struct sg_lb_stats *sgs, + int *sg_status) { + int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + int load_idx = get_sd_load_idx(env->sd, env->idle); unsigned long load; int i, nr_running; @@ -7938,7 +7938,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, nr_running = rq->nr_running; if (nr_running > 1) - *overload = true; + *sg_status |= SG_OVERLOAD; #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; @@ -7954,7 +7954,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; - *overload = 1; + *sg_status |= SG_OVERLOAD; } } @@ -8099,17 +8099,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int load_idx; - bool overload = false; bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; + int sg_status = 0; #ifdef CONFIG_NO_HZ_COMMON if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) env->flags |= LBF_NOHZ_STATS; #endif - load_idx = get_sd_load_idx(env->sd, env->idle); - do { struct sg_lb_stats *sgs = &tmp_sgs; int local_group; @@ -8124,8 +8121,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + update_sg_lb_stats(env, sg, sgs, &sg_status); if (local_group) goto next_group; @@ -8175,8 +8171,7 @@ next_group: if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - if (READ_ONCE(env->dst_rq->rd->overload) != overload) - WRITE_ONCE(env->dst_rq->rd->overload, overload); + WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2b3cf356e958..d4d984846924 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -716,6 +716,9 @@ struct perf_domain { struct rcu_head rcu; }; +/* Scheduling group status flags */ +#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ + /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by -- cgit From 2802bf3cd936fe2c8033a696d375a4d9d3974de4 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 3 Dec 2018 09:56:25 +0000 Subject: sched/fair: Add over-utilization/tipping point indicator Energy-aware scheduling is only meant to be active while the system is _not_ over-utilized. That is, there are spare cycles available to shift tasks around based on their actual utilization to get a more energy-efficient task distribution without depriving any tasks. When above the tipping point task placement is done the traditional way based on load_avg, spreading the tasks across as many cpus as possible based on priority scaled load to preserve smp_nice. Below the tipping point we want to use util_avg instead. We need to define a criteria for when we make the switch. The util_avg for each cpu converges towards 100% regardless of how many additional tasks we may put on it. If we define over-utilized as: sum_{cpus}(rq.cfs.avg.util_avg) + margin > sum_{cpus}(rq.capacity) some individual cpus may be over-utilized running multiple tasks even when the above condition is false. That should be okay as long as we try to spread the tasks out to avoid per-cpu over-utilization as much as possible and if all tasks have the _same_ priority. If the latter isn't true, we have to consider priority to preserve smp_nice. For example, we could have n_cpus nice=-10 util_avg=55% tasks and n_cpus/2 nice=0 util_avg=60% tasks. Balancing based on util_avg we are likely to end up with nice=-10 tasks sharing cpus and nice=0 tasks getting their own as we 1.5*n_cpus tasks in total and 55%+55% is less over-utilized than 55%+60% for those cpus that have to be shared. The system utilization is only 85% of the system capacity, but we are breaking smp_nice. To be sure not to break smp_nice, we have defined over-utilization conservatively as when any cpu in the system is fully utilized at its highest frequency instead: cpu_rq(any).cfs.avg.util_avg + margin > cpu_rq(any).capacity IOW, as soon as one cpu is (nearly) 100% utilized, we switch to load_avg to factor in priority to preserve smp_nice. With this definition, we can skip periodic load-balance as no cpu has an always-running task when the system is not over-utilized. All tasks will be periodic and we can balance them at wake-up. This conservative condition does however mean that some scenarios that could benefit from energy-aware decisions even if one cpu is fully utilized would not get those benefits. For systems where some cpus might have reduced capacity on some cpus (RT-pressure and/or big.LITTLE), we want periodic load-balance checks as soon a just a single cpu is fully utilized as it might one of those with reduced capacity and in that case we want to migrate it. [ peterz: Added a comment explaining why new tasks are not accounted during overutilization detection. ] Signed-off-by: Morten Rasmussen Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-13-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 4 ++++ 2 files changed, 61 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e04f29098ec7..767e7675774b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5082,6 +5082,24 @@ static inline void hrtick_update(struct rq *rq) } #endif +#ifdef CONFIG_SMP +static inline unsigned long cpu_util(int cpu); +static unsigned long capacity_of(int cpu); + +static inline bool cpu_overutilized(int cpu) +{ + return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); +} + +static inline void update_overutilized_status(struct rq *rq) +{ + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) + WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); +} +#else +static inline void update_overutilized_status(struct rq *rq) { } +#endif + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5139,8 +5157,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_group(se); } - if (!se) + if (!se) { add_nr_running(rq, 1); + /* + * Since new tasks are assigned an initial util_avg equal to + * half of the spare capacity of their CPU, tiny tasks have the + * ability to cross the overutilized threshold, which will + * result in the load balancer ruining all the task placement + * done by EAS. As a way to mitigate that effect, do not account + * for the first enqueue operation of new tasks during the + * overutilized flag detection. + * + * A better way of solving this problem would be to wait for + * the PELT signals of tasks to converge before taking them + * into account, but that is not straightforward to implement, + * and the following generally works well enough in practice. + */ + if (flags & ENQUEUE_WAKEUP) + update_overutilized_status(rq); + + } hrtick_update(rq); } @@ -7940,6 +7976,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (nr_running > 1) *sg_status |= SG_OVERLOAD; + if (cpu_overutilized(i)) + *sg_status |= SG_OVERUTILIZED; + #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -8170,8 +8209,15 @@ next_group: env->fbq_type = fbq_classify_group(&sds->busiest_stat); if (!env->sd->parent) { + struct root_domain *rd = env->dst_rq->rd; + /* update overload indicator if we are at root domain */ - WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); + WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD); + + /* Update over-utilization (tipping point, U >= 0) indicator */ + WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); + } else if (sg_status & SG_OVERUTILIZED) { + WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); } } @@ -8398,6 +8444,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * this level. */ update_sd_lb_stats(env, &sds); + + if (static_branch_unlikely(&sched_energy_present)) { + struct root_domain *rd = env->dst_rq->rd; + + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) + goto out_balanced; + } + local = &sds.local_stat; busiest = &sds.busiest_stat; @@ -9798,6 +9852,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); update_misfit_status(curr, rq); + update_overutilized_status(task_rq(curr)); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d4d984846924..0ba08924e017 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -718,6 +718,7 @@ struct perf_domain { /* Scheduling group status flags */ #define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ +#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ /* * We add the notion of a root-domain which will be used to define per-domain @@ -741,6 +742,9 @@ struct root_domain { */ int overload; + /* Indicate one or more cpus over-utilized (tipping point) */ + int overutilized; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). -- cgit From 390031e4c309c94ecc07a558187eb5185200df83 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:26 +0000 Subject: sched/fair: Introduce an energy estimation helper function In preparation for the definition of an energy-aware wakeup path, introduce a helper function to estimate the consequence on system energy when a specific task wakes-up on a specific CPU. compute_energy() estimates the capacity state to be reached by all performance domains and estimates the consumption of each online CPU according to its Energy Model and its percentage of busy time. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-14-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 767e7675774b..b3c94584d947 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6377,6 +6377,82 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) return !task_fits_capacity(p, min_cap); } +/* + * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) + * to @dst_cpu. + */ +static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) +{ + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); + + /* + * If @p migrates from @cpu to another, remove its contribution. Or, + * if @p migrates from another CPU to @cpu, add its contribution. In + * the other cases, @cpu is not impacted by the migration, so the + * util_avg should already be correct. + */ + if (task_cpu(p) == cpu && dst_cpu != cpu) + sub_positive(&util, task_util(p)); + else if (task_cpu(p) != cpu && dst_cpu == cpu) + util += task_util(p); + + if (sched_feat(UTIL_EST)) { + util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + + /* + * During wake-up, the task isn't enqueued yet and doesn't + * appear in the cfs_rq->avg.util_est.enqueued of any rq, + * so just add it (if needed) to "simulate" what will be + * cpu_util() after the task has been enqueued. + */ + if (dst_cpu == cpu) + util_est += _task_util_est(p); + + util = max(util, util_est); + } + + return min(util, capacity_orig_of(cpu)); +} + +/* + * compute_energy(): Estimates the energy that would be consumed if @p was + * migrated to @dst_cpu. compute_energy() predicts what will be the utilization + * landscape of the * CPUs after the task migration, and uses the Energy Model + * to compute what would be the energy if we decided to actually migrate that + * task. + */ +static long +compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) +{ + long util, max_util, sum_util, energy = 0; + int cpu; + + for (; pd; pd = pd->next) { + max_util = sum_util = 0; + /* + * The capacity state of CPUs of the current rd can be driven by + * CPUs of another rd if they belong to the same performance + * domain. So, account for the utilization of these CPUs too + * by masking pd with cpu_online_mask instead of the rd span. + * + * If an entire performance domain is outside of the current rd, + * it will not appear in its pd list and will not be accounted + * by compute_energy(). + */ + for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { + util = cpu_util_next(cpu, p, dst_cpu); + util = schedutil_energy_util(cpu, util); + max_util = max(util, max_util); + sum_util += util; + } + + energy += em_pd_energy(pd->em_pd, max_util, sum_util); + } + + return energy; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, -- cgit From 732cd75b8c920d3727e69957b14faa7c2d7c3b75 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 3 Dec 2018 09:56:27 +0000 Subject: sched/fair: Select an energy-efficient CPU on task wake-up If an Energy Model (EM) is available and if the system isn't overutilized, re-route waking tasks into an energy-aware placement algorithm. The selection of an energy-efficient CPU for a task is achieved by estimating the impact on system-level active energy resulting from the placement of the task on the CPU with the highest spare capacity in each performance domain. This strategy spreads tasks in a performance domain and avoids overly aggressive task packing. The best CPU energy-wise is then selected if it saves a large enough amount of energy with respect to prev_cpu. Although it has already shown significant benefits on some existing targets, this approach cannot scale to platforms with numerous CPUs. This is an attempt to do something useful as writing a fast heuristic that performs reasonably well on a broad spectrum of architectures isn't an easy task. As such, the scope of usability of the energy-aware wake-up path is restricted to systems with the SD_ASYM_CPUCAPACITY flag set, and where the EM isn't too complex. Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adharmap@codeaurora.org Cc: chris.redpath@arm.com Cc: currojerez@riseup.net Cc: dietmar.eggemann@arm.com Cc: edubezval@gmail.com Cc: gregkh@linuxfoundation.org Cc: javi.merino@kernel.org Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: pkondeti@codeaurora.org Cc: rjw@rjwysocki.net Cc: skannan@codeaurora.org Cc: smuckle@google.com Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: https://lkml.kernel.org/r/20181203095628.11858-15-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 141 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b3c94584d947..ca469646ebe1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6453,6 +6453,137 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) return energy; } +/* + * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the + * waking task. find_energy_efficient_cpu() looks for the CPU with maximum + * spare capacity in each performance domain and uses it as a potential + * candidate to execute the task. Then, it uses the Energy Model to figure + * out which of the CPU candidates is the most energy-efficient. + * + * The rationale for this heuristic is as follows. In a performance domain, + * all the most energy efficient CPU candidates (according to the Energy + * Model) are those for which we'll request a low frequency. When there are + * several CPUs for which the frequency request will be the same, we don't + * have enough data to break the tie between them, because the Energy Model + * only includes active power costs. With this model, if we assume that + * frequency requests follow utilization (e.g. using schedutil), the CPU with + * the maximum spare capacity in a performance domain is guaranteed to be among + * the best candidates of the performance domain. + * + * In practice, it could be preferable from an energy standpoint to pack + * small tasks on a CPU in order to let other CPUs go in deeper idle states, + * but that could also hurt our chances to go cluster idle, and we have no + * ways to tell with the current Energy Model if this is actually a good + * idea or not. So, find_energy_efficient_cpu() basically favors + * cluster-packing, and spreading inside a cluster. That should at least be + * a good thing for latency, and this is consistent with the idea that most + * of the energy savings of EAS come from the asymmetry of the system, and + * not so much from breaking the tie between identical CPUs. That's also the + * reason why EAS is enabled in the topology code only for systems where + * SD_ASYM_CPUCAPACITY is set. + * + * NOTE: Forkees are not accepted in the energy-aware wake-up path because + * they don't have any useful utilization data yet and it's not possible to + * forecast their impact on energy consumption. Consequently, they will be + * placed by find_idlest_cpu() on the least loaded CPU, which might turn out + * to be energy-inefficient in some use-cases. The alternative would be to + * bias new tasks towards specific types of CPUs first, or to try to infer + * their util_avg from the parent task, but those heuristics could hurt + * other use-cases too. So, until someone finds a better way to solve this, + * let's keep things simple by re-using the existing slow path. + */ + +static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) +{ + unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int cpu, best_energy_cpu = prev_cpu; + struct perf_domain *head, *pd; + unsigned long cpu_cap, util; + struct sched_domain *sd; + + rcu_read_lock(); + pd = rcu_dereference(rd->pd); + if (!pd || READ_ONCE(rd->overutilized)) + goto fail; + head = pd; + + /* + * Energy-aware wake-up happens on the lowest sched_domain starting + * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. + */ + sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); + while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) + sd = sd->parent; + if (!sd) + goto fail; + + sync_entity_load_avg(&p->se); + if (!task_util_est(p)) + goto unlock; + + for (; pd; pd = pd->next) { + unsigned long cur_energy, spare_cap, max_spare_cap = 0; + int max_spare_cap_cpu = -1; + + for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + continue; + + /* Skip CPUs that will be overutilized. */ + util = cpu_util_next(cpu, p, cpu); + cpu_cap = capacity_of(cpu); + if (cpu_cap * 1024 < util * capacity_margin) + continue; + + /* Always use prev_cpu as a candidate. */ + if (cpu == prev_cpu) { + prev_energy = compute_energy(p, prev_cpu, head); + best_energy = min(best_energy, prev_energy); + continue; + } + + /* + * Find the CPU with the maximum spare capacity in + * the performance domain + */ + spare_cap = cpu_cap - util; + if (spare_cap > max_spare_cap) { + max_spare_cap = spare_cap; + max_spare_cap_cpu = cpu; + } + } + + /* Evaluate the energy impact of using this CPU. */ + if (max_spare_cap_cpu >= 0) { + cur_energy = compute_energy(p, max_spare_cap_cpu, head); + if (cur_energy < best_energy) { + best_energy = cur_energy; + best_energy_cpu = max_spare_cap_cpu; + } + } + } +unlock: + rcu_read_unlock(); + + /* + * Pick the best CPU if prev_cpu cannot be used, or if it saves at + * least 6% of the energy used by prev_cpu. + */ + if (prev_energy == ULONG_MAX) + return best_energy_cpu; + + if ((prev_energy - best_energy) > (prev_energy >> 4)) + return best_energy_cpu; + + return prev_cpu; + +fail: + rcu_read_unlock(); + + return -1; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -6476,8 +6607,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, &p->cpus_allowed); + + if (static_branch_unlikely(&sched_energy_present)) { + new_cpu = find_energy_efficient_cpu(p, prev_cpu); + if (new_cpu >= 0) + return new_cpu; + new_cpu = prev_cpu; + } + + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && + cpumask_test_cpu(cpu, &p->cpus_allowed); } rcu_read_lock(); -- cgit From db5113911abaa7eb20cf115d4339959c1aecea95 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Sun, 9 Dec 2018 11:24:11 -0700 Subject: seccomp: hoist struct seccomp_data recalculation higher In the next patch, we're going to use the sd pointer passed to __seccomp_filter() as the data to pass to userspace. Except that in some cases (__seccomp_filter(SECCOMP_RET_TRACE), emulate_vsyscall(), every time seccomp is inovked on power, etc.) the sd pointer will be NULL in order to force seccomp to recompute the register data. Previously this recomputation happened one level lower, in seccomp_run_filters(); this patch just moves it up a level higher to __seccomp_filter(). Thanks Oleg for spotting this. Signed-off-by: Tycho Andersen CC: Kees Cook CC: Andy Lutomirski CC: Oleg Nesterov CC: Eric W. Biederman CC: "Serge E. Hallyn" Acked-by: Serge Hallyn CC: Christian Brauner CC: Tyler Hicks CC: Akihiro Suda Signed-off-by: Kees Cook --- kernel/seccomp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f2ae2324c232..96afc32e041d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -188,7 +188,6 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) static u32 seccomp_run_filters(const struct seccomp_data *sd, struct seccomp_filter **match) { - struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ struct seccomp_filter *f = @@ -198,11 +197,6 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, if (WARN_ON(f == NULL)) return SECCOMP_RET_KILL_PROCESS; - if (!sd) { - populate_seccomp_data(&sd_local); - sd = &sd_local; - } - /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). @@ -658,6 +652,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, u32 filter_ret, action; struct seccomp_filter *match = NULL; int data; + struct seccomp_data sd_local; /* * Make sure that any changes to mode from another thread have @@ -665,6 +660,11 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ rmb(); + if (!sd) { + populate_seccomp_data(&sd_local); + sd = &sd_local; + } + filter_ret = seccomp_run_filters(sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; -- cgit From a5662e4d81c4d4b08140c625d0f3c50b15786252 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Sun, 9 Dec 2018 11:24:12 -0700 Subject: seccomp: switch system call argument type to void * The const qualifier causes problems for any code that wants to write to the third argument of the seccomp syscall, as we will do in a future patch in this series. The third argument to the seccomp syscall is documented as void *, so rather than just dropping the const, let's switch everything to use void * as well. I believe this is safe because of 1. the documentation above, 2. there's no real type information exported about syscalls anywhere besides the man pages. Signed-off-by: Tycho Andersen CC: Kees Cook CC: Andy Lutomirski CC: Oleg Nesterov CC: Eric W. Biederman CC: "Serge E. Hallyn" Acked-by: Serge Hallyn CC: Christian Brauner CC: Tyler Hicks CC: Akihiro Suda Signed-off-by: Kees Cook --- include/linux/seccomp.h | 2 +- include/linux/syscalls.h | 2 +- kernel/seccomp.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index e5320f6c8654..b5103c019cf4 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -43,7 +43,7 @@ extern void secure_computing_strict(int this_syscall); #endif extern long prctl_get_seccomp(void); -extern long prctl_set_seccomp(unsigned long, char __user *); +extern long prctl_set_seccomp(unsigned long, void __user *); static inline int seccomp_mode(struct seccomp *s) { diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 2ac3d13a915b..a60694fb0f58 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -879,7 +879,7 @@ asmlinkage long sys_renameat2(int olddfd, const char __user *oldname, int newdfd, const char __user *newname, unsigned int flags); asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, - const char __user *uargs); + void __user *uargs); asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 96afc32e041d..393e029f778a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -924,7 +924,7 @@ static long seccomp_get_action_avail(const char __user *uaction) /* Common entry point for both prctl and syscall. */ static long do_seccomp(unsigned int op, unsigned int flags, - const char __user *uargs) + void __user *uargs) { switch (op) { case SECCOMP_SET_MODE_STRICT: @@ -944,7 +944,7 @@ static long do_seccomp(unsigned int op, unsigned int flags, } SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, - const char __user *, uargs) + void __user *, uargs) { return do_seccomp(op, flags, uargs); } @@ -956,10 +956,10 @@ SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, * * Returns 0 on success or -EINVAL on failure. */ -long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter) { unsigned int op; - char __user *uargs; + void __user *uargs; switch (seccomp_mode) { case SECCOMP_MODE_STRICT: -- cgit From 6a21cc50f0c7f87dae5259f6cfefe024412313f6 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Sun, 9 Dec 2018 11:24:13 -0700 Subject: seccomp: add a return code to trap to userspace This patch introduces a means for syscalls matched in seccomp to notify some other task that a particular filter has been triggered. The motivation for this is primarily for use with containers. For example, if a container does an init_module(), we obviously don't want to load this untrusted code, which may be compiled for the wrong version of the kernel anyway. Instead, we could parse the module image, figure out which module the container is trying to load and load it on the host. As another example, containers cannot mount() in general since various filesystems assume a trusted image. However, if an orchestrator knows that e.g. a particular block device has not been exposed to a container for writing, it want to allow the container to mount that block device (that is, handle the mount for it). This patch adds functionality that is already possible via at least two other means that I know about, both of which involve ptrace(): first, one could ptrace attach, and then iterate through syscalls via PTRACE_SYSCALL. Unfortunately this is slow, so a faster version would be to install a filter that does SECCOMP_RET_TRACE, which triggers a PTRACE_EVENT_SECCOMP. Since ptrace allows only one tracer, if the container runtime is that tracer, users inside the container (or outside) trying to debug it will not be able to use ptrace, which is annoying. It also means that older distributions based on Upstart cannot boot inside containers using ptrace, since upstart itself uses ptrace to monitor services while starting. The actual implementation of this is fairly small, although getting the synchronization right was/is slightly complex. Finally, it's worth noting that the classic seccomp TOCTOU of reading memory data from the task still applies here, but can be avoided with careful design of the userspace handler: if the userspace handler reads all of the task memory that is necessary before applying its security policy, the tracee's subsequent memory edits will not be read by the tracer. Signed-off-by: Tycho Andersen CC: Kees Cook CC: Andy Lutomirski CC: Oleg Nesterov CC: Eric W. Biederman CC: "Serge E. Hallyn" Acked-by: Serge Hallyn CC: Christian Brauner CC: Tyler Hicks CC: Akihiro Suda Signed-off-by: Kees Cook --- Documentation/ioctl/ioctl-number.txt | 1 + Documentation/userspace-api/seccomp_filter.rst | 84 +++++ include/linux/seccomp.h | 7 +- include/uapi/linux/seccomp.h | 40 ++- kernel/seccomp.c | 448 ++++++++++++++++++++++++- tools/testing/selftests/seccomp/seccomp_bpf.c | 447 +++++++++++++++++++++++- 6 files changed, 1017 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index af6f6ba1fe80..c9558146ac58 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -79,6 +79,7 @@ Code Seq#(hex) Include File Comments 0x1b all InfiniBand Subsystem 0x20 all drivers/cdrom/cm206.h 0x22 all scsi/sg.h +'!' 00-1F uapi/linux/seccomp.h '#' 00-3F IEEE 1394 Subsystem Block for the entire subsystem '$' 00-0F linux/perf_counter.h, linux/perf_event.h '%' 00-0F include/uapi/linux/stm.h diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index 82a468bc7560..b1b846d8a094 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -122,6 +122,11 @@ In precedence order, they are: Results in the lower 16-bits of the return value being passed to userland as the errno without executing the system call. +``SECCOMP_RET_USER_NOTIF``: + Results in a ``struct seccomp_notif`` message sent on the userspace + notification fd, if it is attached, or ``-ENOSYS`` if it is not. See below + on discussion of how to handle user notifications. + ``SECCOMP_RET_TRACE``: When returned, this value will cause the kernel to attempt to notify a ``ptrace()``-based tracer prior to executing the system @@ -183,6 +188,85 @@ The ``samples/seccomp/`` directory contains both an x86-specific example and a more generic example of a higher level macro interface for BPF program generation. +Userspace Notification +====================== + +The ``SECCOMP_RET_USER_NOTIF`` return code lets seccomp filters pass a +particular syscall to userspace to be handled. This may be useful for +applications like container managers, which wish to intercept particular +syscalls (``mount()``, ``finit_module()``, etc.) and change their behavior. + +To acquire a notification FD, use the ``SECCOMP_FILTER_FLAG_NEW_LISTENER`` +argument to the ``seccomp()`` syscall: + +.. code-block:: c + + fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); + +which (on success) will return a listener fd for the filter, which can then be +passed around via ``SCM_RIGHTS`` or similar. Note that filter fds correspond to +a particular filter, and not a particular task. So if this task then forks, +notifications from both tasks will appear on the same filter fd. Reads and +writes to/from a filter fd are also synchronized, so a filter fd can safely +have many readers. + +The interface for a seccomp notification fd consists of two structures: + +.. code-block:: c + + struct seccomp_notif_sizes { + __u16 seccomp_notif; + __u16 seccomp_notif_resp; + __u16 seccomp_data; + }; + + struct seccomp_notif { + __u64 id; + __u32 pid; + __u32 flags; + struct seccomp_data data; + }; + + struct seccomp_notif_resp { + __u64 id; + __s64 val; + __s32 error; + __u32 flags; + }; + +The ``struct seccomp_notif_sizes`` structure can be used to determine the size +of the various structures used in seccomp notifications. The size of ``struct +seccomp_data`` may change in the future, so code should use: + +.. code-block:: c + + struct seccomp_notif_sizes sizes; + seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes); + +to determine the size of the various structures to allocate. See +samples/seccomp/user-trap.c for an example. + +Users can read via ``ioctl(SECCOMP_IOCTL_NOTIF_RECV)`` (or ``poll()``) on a +seccomp notification fd to receive a ``struct seccomp_notif``, which contains +five members: the input length of the structure, a unique-per-filter ``id``, +the ``pid`` of the task which triggered this request (which may be 0 if the +task is in a pid ns not visible from the listener's pid namespace), a ``flags`` +member which for now only has ``SECCOMP_NOTIF_FLAG_SIGNALED``, representing +whether or not the notification is a result of a non-fatal signal, and the +``data`` passed to seccomp. Userspace can then make a decision based on this +information about what to do, and ``ioctl(SECCOMP_IOCTL_NOTIF_SEND)`` a +response, indicating what should be returned to userspace. The ``id`` member of +``struct seccomp_notif_resp`` should be the same ``id`` as in ``struct +seccomp_notif``. + +It is worth noting that ``struct seccomp_data`` contains the values of register +arguments to the syscall, but does not contain pointers to memory. The task's +memory is accessible to suitably privileged traces via ``ptrace()`` or +``/proc/pid/mem``. However, care should be taken to avoid the TOCTOU mentioned +above in this document: all arguments being read from the tracee's memory +should be read into the tracer's memory before any policy decisions are made. +This allows for an atomic decision on syscall arguments. + Sysctls ======= diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index b5103c019cf4..84868d37b35d 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -4,9 +4,10 @@ #include -#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ - SECCOMP_FILTER_FLAG_LOG | \ - SECCOMP_FILTER_FLAG_SPEC_ALLOW) +#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ + SECCOMP_FILTER_FLAG_LOG | \ + SECCOMP_FILTER_FLAG_SPEC_ALLOW | \ + SECCOMP_FILTER_FLAG_NEW_LISTENER) #ifdef CONFIG_SECCOMP diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 9efc0e73d50b..90734aa5aa36 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -15,11 +15,13 @@ #define SECCOMP_SET_MODE_STRICT 0 #define SECCOMP_SET_MODE_FILTER 1 #define SECCOMP_GET_ACTION_AVAIL 2 +#define SECCOMP_GET_NOTIF_SIZES 3 /* Valid flags for SECCOMP_SET_MODE_FILTER */ -#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0) -#define SECCOMP_FILTER_FLAG_LOG (1UL << 1) -#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2) +#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0) +#define SECCOMP_FILTER_FLAG_LOG (1UL << 1) +#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2) +#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3) /* * All BPF programs must return a 32-bit value. @@ -35,6 +37,7 @@ #define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD #define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ +#define SECCOMP_RET_USER_NOTIF 0x7fc00000U /* notifies userspace */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ #define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ @@ -60,4 +63,35 @@ struct seccomp_data { __u64 args[6]; }; +struct seccomp_notif_sizes { + __u16 seccomp_notif; + __u16 seccomp_notif_resp; + __u16 seccomp_data; +}; + +struct seccomp_notif { + __u64 id; + __u32 pid; + __u32 flags; + struct seccomp_data data; +}; + +struct seccomp_notif_resp { + __u64 id; + __s64 val; + __s32 error; + __u32 flags; +}; + +#define SECCOMP_IOC_MAGIC '!' +#define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr) +#define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type) +#define SECCOMP_IOW(nr, type) _IOW(SECCOMP_IOC_MAGIC, nr, type) +#define SECCOMP_IOWR(nr, type) _IOWR(SECCOMP_IOC_MAGIC, nr, type) + +/* Flags for seccomp notification fd ioctl. */ +#define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) +#define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ + struct seccomp_notif_resp) +#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 393e029f778a..15b6be97fc09 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -33,12 +33,74 @@ #endif #ifdef CONFIG_SECCOMP_FILTER +#include #include #include #include #include #include #include +#include + +enum notify_state { + SECCOMP_NOTIFY_INIT, + SECCOMP_NOTIFY_SENT, + SECCOMP_NOTIFY_REPLIED, +}; + +struct seccomp_knotif { + /* The struct pid of the task whose filter triggered the notification */ + struct task_struct *task; + + /* The "cookie" for this request; this is unique for this filter. */ + u64 id; + + /* + * The seccomp data. This pointer is valid the entire time this + * notification is active, since it comes from __seccomp_filter which + * eclipses the entire lifecycle here. + */ + const struct seccomp_data *data; + + /* + * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a + * struct seccomp_knotif is created and starts out in INIT. Once the + * handler reads the notification off of an FD, it transitions to SENT. + * If a signal is received the state transitions back to INIT and + * another message is sent. When the userspace handler replies, state + * transitions to REPLIED. + */ + enum notify_state state; + + /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */ + int error; + long val; + + /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ + struct completion ready; + + struct list_head list; +}; + +/** + * struct notification - container for seccomp userspace notifications. Since + * most seccomp filters will not have notification listeners attached and this + * structure is fairly large, we store the notification-specific stuff in a + * separate structure. + * + * @request: A semaphore that users of this notification can wait on for + * changes. Actual reads and writes are still controlled with + * filter->notify_lock. + * @next_id: The id of the next request. + * @notifications: A list of struct seccomp_knotif elements. + * @wqh: A wait queue for poll. + */ +struct notification { + struct semaphore request; + u64 next_id; + struct list_head notifications; + wait_queue_head_t wqh; +}; /** * struct seccomp_filter - container for seccomp BPF programs @@ -50,6 +112,8 @@ * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate + * @notif: the struct that holds all notification related information + * @notify_lock: A lock for all notification-related accesses. * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -66,6 +130,8 @@ struct seccomp_filter { bool log; struct seccomp_filter *prev; struct bpf_prog *prog; + struct notification *notif; + struct mutex notify_lock; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -386,6 +452,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) if (!sfilter) return ERR_PTR(-ENOMEM); + mutex_init(&sfilter->notify_lock); ret = bpf_prog_create_from_user(&sfilter->prog, fprog, seccomp_check_filter, save_orig); if (ret < 0) { @@ -479,7 +546,6 @@ static long seccomp_attach_filter(unsigned int flags, static void __get_seccomp_filter(struct seccomp_filter *filter) { - /* Reference count is bounded by the number of total processes. */ refcount_inc(&filter->usage); } @@ -550,11 +616,13 @@ static void seccomp_send_sigsys(int syscall, int reason) #define SECCOMP_LOG_TRACE (1 << 4) #define SECCOMP_LOG_LOG (1 << 5) #define SECCOMP_LOG_ALLOW (1 << 6) +#define SECCOMP_LOG_USER_NOTIF (1 << 7) static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS | SECCOMP_LOG_KILL_THREAD | SECCOMP_LOG_TRAP | SECCOMP_LOG_ERRNO | + SECCOMP_LOG_USER_NOTIF | SECCOMP_LOG_TRACE | SECCOMP_LOG_LOG; @@ -575,6 +643,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, case SECCOMP_RET_TRACE: log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; break; + case SECCOMP_RET_USER_NOTIF: + log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF; + break; case SECCOMP_RET_LOG: log = seccomp_actions_logged & SECCOMP_LOG_LOG; break; @@ -646,6 +717,68 @@ void secure_computing_strict(int this_syscall) #else #ifdef CONFIG_SECCOMP_FILTER +static u64 seccomp_next_notify_id(struct seccomp_filter *filter) +{ + /* + * Note: overflow is ok here, the id just needs to be unique per + * filter. + */ + lockdep_assert_held(&filter->notify_lock); + return filter->notif->next_id++; +} + +static void seccomp_do_user_notification(int this_syscall, + struct seccomp_filter *match, + const struct seccomp_data *sd) +{ + int err; + long ret = 0; + struct seccomp_knotif n = {}; + + mutex_lock(&match->notify_lock); + err = -ENOSYS; + if (!match->notif) + goto out; + + n.task = current; + n.state = SECCOMP_NOTIFY_INIT; + n.data = sd; + n.id = seccomp_next_notify_id(match); + init_completion(&n.ready); + list_add(&n.list, &match->notif->notifications); + + up(&match->notif->request); + wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM); + mutex_unlock(&match->notify_lock); + + /* + * This is where we wait for a reply from userspace. + */ + err = wait_for_completion_interruptible(&n.ready); + mutex_lock(&match->notify_lock); + if (err == 0) { + ret = n.val; + err = n.error; + } + + /* + * Note that it's possible the listener died in between the time when + * we were notified of a respons (or a signal) and when we were able to + * re-acquire the lock, so only delete from the list if the + * notification actually exists. + * + * Also note that this test is only valid because there's no way to + * *reattach* to a notifier right now. If one is added, we'll need to + * keep track of the notif itself and make sure they match here. + */ + if (match->notif) + list_del(&n.list); +out: + mutex_unlock(&match->notify_lock); + syscall_set_return_value(current, task_pt_regs(current), + err, ret); +} + static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) { @@ -728,6 +861,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; + case SECCOMP_RET_USER_NOTIF: + seccomp_do_user_notification(this_syscall, match, sd); + goto skip; + case SECCOMP_RET_LOG: seccomp_log(this_syscall, 0, action, true); return 0; @@ -834,6 +971,263 @@ out: } #ifdef CONFIG_SECCOMP_FILTER +static int seccomp_notify_release(struct inode *inode, struct file *file) +{ + struct seccomp_filter *filter = file->private_data; + struct seccomp_knotif *knotif; + + mutex_lock(&filter->notify_lock); + + /* + * If this file is being closed because e.g. the task who owned it + * died, let's wake everyone up who was waiting on us. + */ + list_for_each_entry(knotif, &filter->notif->notifications, list) { + if (knotif->state == SECCOMP_NOTIFY_REPLIED) + continue; + + knotif->state = SECCOMP_NOTIFY_REPLIED; + knotif->error = -ENOSYS; + knotif->val = 0; + + complete(&knotif->ready); + } + + kfree(filter->notif); + filter->notif = NULL; + mutex_unlock(&filter->notify_lock); + __put_seccomp_filter(filter); + return 0; +} + +static long seccomp_notify_recv(struct seccomp_filter *filter, + void __user *buf) +{ + struct seccomp_knotif *knotif = NULL, *cur; + struct seccomp_notif unotif; + ssize_t ret; + + memset(&unotif, 0, sizeof(unotif)); + + ret = down_interruptible(&filter->notif->request); + if (ret < 0) + return ret; + + mutex_lock(&filter->notify_lock); + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->state == SECCOMP_NOTIFY_INIT) { + knotif = cur; + break; + } + } + + /* + * If we didn't find a notification, it could be that the task was + * interrupted by a fatal signal between the time we were woken and + * when we were able to acquire the rw lock. + */ + if (!knotif) { + ret = -ENOENT; + goto out; + } + + unotif.id = knotif->id; + unotif.pid = task_pid_vnr(knotif->task); + unotif.data = *(knotif->data); + + knotif->state = SECCOMP_NOTIFY_SENT; + wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM); + ret = 0; +out: + mutex_unlock(&filter->notify_lock); + + if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) { + ret = -EFAULT; + + /* + * Userspace screwed up. To make sure that we keep this + * notification alive, let's reset it back to INIT. It + * may have died when we released the lock, so we need to make + * sure it's still around. + */ + knotif = NULL; + mutex_lock(&filter->notify_lock); + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->id == unotif.id) { + knotif = cur; + break; + } + } + + if (knotif) { + knotif->state = SECCOMP_NOTIFY_INIT; + up(&filter->notif->request); + } + mutex_unlock(&filter->notify_lock); + } + + return ret; +} + +static long seccomp_notify_send(struct seccomp_filter *filter, + void __user *buf) +{ + struct seccomp_notif_resp resp = {}; + struct seccomp_knotif *knotif = NULL, *cur; + long ret; + + if (copy_from_user(&resp, buf, sizeof(resp))) + return -EFAULT; + + if (resp.flags) + return -EINVAL; + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + return ret; + + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->id == resp.id) { + knotif = cur; + break; + } + } + + if (!knotif) { + ret = -ENOENT; + goto out; + } + + /* Allow exactly one reply. */ + if (knotif->state != SECCOMP_NOTIFY_SENT) { + ret = -EINPROGRESS; + goto out; + } + + ret = 0; + knotif->state = SECCOMP_NOTIFY_REPLIED; + knotif->error = resp.error; + knotif->val = resp.val; + complete(&knotif->ready); +out: + mutex_unlock(&filter->notify_lock); + return ret; +} + +static long seccomp_notify_id_valid(struct seccomp_filter *filter, + void __user *buf) +{ + struct seccomp_knotif *knotif = NULL; + u64 id; + long ret; + + if (copy_from_user(&id, buf, sizeof(id))) + return -EFAULT; + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + return ret; + + ret = -ENOENT; + list_for_each_entry(knotif, &filter->notif->notifications, list) { + if (knotif->id == id) { + if (knotif->state == SECCOMP_NOTIFY_SENT) + ret = 0; + goto out; + } + } + +out: + mutex_unlock(&filter->notify_lock); + return ret; +} + +static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct seccomp_filter *filter = file->private_data; + void __user *buf = (void __user *)arg; + + switch (cmd) { + case SECCOMP_IOCTL_NOTIF_RECV: + return seccomp_notify_recv(filter, buf); + case SECCOMP_IOCTL_NOTIF_SEND: + return seccomp_notify_send(filter, buf); + case SECCOMP_IOCTL_NOTIF_ID_VALID: + return seccomp_notify_id_valid(filter, buf); + default: + return -EINVAL; + } +} + +static __poll_t seccomp_notify_poll(struct file *file, + struct poll_table_struct *poll_tab) +{ + struct seccomp_filter *filter = file->private_data; + __poll_t ret = 0; + struct seccomp_knotif *cur; + + poll_wait(file, &filter->notif->wqh, poll_tab); + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + return EPOLLERR; + + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->state == SECCOMP_NOTIFY_INIT) + ret |= EPOLLIN | EPOLLRDNORM; + if (cur->state == SECCOMP_NOTIFY_SENT) + ret |= EPOLLOUT | EPOLLWRNORM; + if ((ret & EPOLLIN) && (ret & EPOLLOUT)) + break; + } + + mutex_unlock(&filter->notify_lock); + + return ret; +} + +static const struct file_operations seccomp_notify_ops = { + .poll = seccomp_notify_poll, + .release = seccomp_notify_release, + .unlocked_ioctl = seccomp_notify_ioctl, +}; + +static struct file *init_listener(struct seccomp_filter *filter) +{ + struct file *ret = ERR_PTR(-EBUSY); + struct seccomp_filter *cur; + + for (cur = current->seccomp.filter; cur; cur = cur->prev) { + if (cur->notif) + goto out; + } + + ret = ERR_PTR(-ENOMEM); + filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL); + if (!filter->notif) + goto out; + + sema_init(&filter->notif->request, 0); + filter->notif->next_id = get_random_u64(); + INIT_LIST_HEAD(&filter->notif->notifications); + init_waitqueue_head(&filter->notif->wqh); + + ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops, + filter, O_RDWR); + if (IS_ERR(ret)) + goto out_notif; + + /* The file has a reference to it now */ + __get_seccomp_filter(filter); + +out_notif: + if (IS_ERR(ret)) + kfree(filter->notif); +out: + return ret; +} + /** * seccomp_set_mode_filter: internal function for setting seccomp filter * @flags: flags to change filter behavior @@ -853,6 +1247,8 @@ static long seccomp_set_mode_filter(unsigned int flags, const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; struct seccomp_filter *prepared = NULL; long ret = -EINVAL; + int listener = -1; + struct file *listener_f = NULL; /* Validate flags. */ if (flags & ~SECCOMP_FILTER_FLAG_MASK) @@ -863,13 +1259,28 @@ static long seccomp_set_mode_filter(unsigned int flags, if (IS_ERR(prepared)) return PTR_ERR(prepared); + if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { + listener = get_unused_fd_flags(O_CLOEXEC); + if (listener < 0) { + ret = listener; + goto out_free; + } + + listener_f = init_listener(prepared); + if (IS_ERR(listener_f)) { + put_unused_fd(listener); + ret = PTR_ERR(listener_f); + goto out_free; + } + } + /* * Make sure we cannot change seccomp or nnp state via TSYNC * while another thread is in the middle of calling exec. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC && mutex_lock_killable(¤t->signal->cred_guard_mutex)) - goto out_free; + goto out_put_fd; spin_lock_irq(¤t->sighand->siglock); @@ -887,6 +1298,16 @@ out: spin_unlock_irq(¤t->sighand->siglock); if (flags & SECCOMP_FILTER_FLAG_TSYNC) mutex_unlock(¤t->signal->cred_guard_mutex); +out_put_fd: + if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { + if (ret < 0) { + fput(listener_f); + put_unused_fd(listener); + } else { + fd_install(listener, listener_f); + ret = listener; + } + } out_free: seccomp_filter_free(prepared); return ret; @@ -911,6 +1332,7 @@ static long seccomp_get_action_avail(const char __user *uaction) case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: + case SECCOMP_RET_USER_NOTIF: case SECCOMP_RET_TRACE: case SECCOMP_RET_LOG: case SECCOMP_RET_ALLOW: @@ -922,6 +1344,20 @@ static long seccomp_get_action_avail(const char __user *uaction) return 0; } +static long seccomp_get_notif_sizes(void __user *usizes) +{ + struct seccomp_notif_sizes sizes = { + .seccomp_notif = sizeof(struct seccomp_notif), + .seccomp_notif_resp = sizeof(struct seccomp_notif_resp), + .seccomp_data = sizeof(struct seccomp_data), + }; + + if (copy_to_user(usizes, &sizes, sizeof(sizes))) + return -EFAULT; + + return 0; +} + /* Common entry point for both prctl and syscall. */ static long do_seccomp(unsigned int op, unsigned int flags, void __user *uargs) @@ -938,6 +1374,11 @@ static long do_seccomp(unsigned int op, unsigned int flags, return -EINVAL; return seccomp_get_action_avail(uargs); + case SECCOMP_GET_NOTIF_SIZES: + if (flags != 0) + return -EINVAL; + + return seccomp_get_notif_sizes(uargs); default: return -EINVAL; } @@ -1111,6 +1552,7 @@ long seccomp_get_metadata(struct task_struct *task, #define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" +#define SECCOMP_RET_USER_NOTIF_NAME "user_notif" #define SECCOMP_RET_TRACE_NAME "trace" #define SECCOMP_RET_LOG_NAME "log" #define SECCOMP_RET_ALLOW_NAME "allow" @@ -1120,6 +1562,7 @@ static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_THREAD_NAME " " SECCOMP_RET_TRAP_NAME " " SECCOMP_RET_ERRNO_NAME " " + SECCOMP_RET_USER_NOTIF_NAME " " SECCOMP_RET_TRACE_NAME " " SECCOMP_RET_LOG_NAME " " SECCOMP_RET_ALLOW_NAME; @@ -1134,6 +1577,7 @@ static const struct seccomp_log_name seccomp_log_names[] = { { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, + { SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME }, { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME }, { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index e1473234968d..5c9768a1b8cd 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -5,6 +5,7 @@ * Test code for seccomp bpf. */ +#define _GNU_SOURCE #include /* @@ -40,10 +41,12 @@ #include #include #include +#include +#include -#define _GNU_SOURCE #include #include +#include #include "../kselftest_harness.h" @@ -133,6 +136,10 @@ struct seccomp_data { #define SECCOMP_GET_ACTION_AVAIL 2 #endif +#ifndef SECCOMP_GET_NOTIF_SIZES +#define SECCOMP_GET_NOTIF_SIZES 3 +#endif + #ifndef SECCOMP_FILTER_FLAG_TSYNC #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0) #endif @@ -154,6 +161,44 @@ struct seccomp_metadata { }; #endif +#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER +#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3) + +#define SECCOMP_RET_USER_NOTIF 0x7fc00000U + +#define SECCOMP_IOC_MAGIC '!' +#define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr) +#define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type) +#define SECCOMP_IOW(nr, type) _IOW(SECCOMP_IOC_MAGIC, nr, type) +#define SECCOMP_IOWR(nr, type) _IOWR(SECCOMP_IOC_MAGIC, nr, type) + +/* Flags for seccomp notification fd ioctl. */ +#define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) +#define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ + struct seccomp_notif_resp) +#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) + +struct seccomp_notif { + __u64 id; + __u32 pid; + __u32 flags; + struct seccomp_data data; +}; + +struct seccomp_notif_resp { + __u64 id; + __s64 val; + __s32 error; + __u32 flags; +}; + +struct seccomp_notif_sizes { + __u16 seccomp_notif; + __u16 seccomp_notif_resp; + __u16 seccomp_data; +}; +#endif + #ifndef seccomp int seccomp(unsigned int op, unsigned int flags, void *args) { @@ -2077,7 +2122,8 @@ TEST(detect_seccomp_filter_flags) { unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_LOG, - SECCOMP_FILTER_FLAG_SPEC_ALLOW }; + SECCOMP_FILTER_FLAG_SPEC_ALLOW, + SECCOMP_FILTER_FLAG_NEW_LISTENER }; unsigned int flag, all_flags; int i; long ret; @@ -2933,6 +2979,403 @@ skip: ASSERT_EQ(0, kill(pid, SIGKILL)); } +static int user_trap_syscall(int nr, unsigned int flags) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog); +} + +#define USER_NOTIF_MAGIC 116983961184613L +TEST(user_notification_basic) +{ + pid_t pid; + long ret; + int status, listener; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + struct pollfd pollfd; + + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + pid = fork(); + ASSERT_GE(pid, 0); + + /* Check that we get -ENOSYS with no listener attached */ + if (pid == 0) { + if (user_trap_syscall(__NR_getpid, 0) < 0) + exit(1); + ret = syscall(__NR_getpid); + exit(ret >= 0 || errno != ENOSYS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + /* Add some no-op filters so for grins. */ + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + + /* Check that the basic notification machinery works */ + listener = user_trap_syscall(__NR_getpid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + EXPECT_GE(listener, 0); + + /* Installing a second listener in the chain should EBUSY */ + EXPECT_EQ(user_trap_syscall(__NR_getpid, + SECCOMP_FILTER_FLAG_NEW_LISTENER), + -1); + EXPECT_EQ(errno, EBUSY); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ret = syscall(__NR_getpid); + exit(ret != USER_NOTIF_MAGIC); + } + + pollfd.fd = listener; + pollfd.events = POLLIN | POLLOUT; + + EXPECT_GT(poll(&pollfd, 1, -1), 0); + EXPECT_EQ(pollfd.revents, POLLIN); + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + + pollfd.fd = listener; + pollfd.events = POLLIN | POLLOUT; + + EXPECT_GT(poll(&pollfd, 1, -1), 0); + EXPECT_EQ(pollfd.revents, POLLOUT); + + EXPECT_EQ(req.data.nr, __NR_getpid); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + + /* check that we make sure flags == 0 */ + resp.flags = 1; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1); + EXPECT_EQ(errno, EINVAL); + + resp.flags = 0; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST(user_notification_kill_in_middle) +{ + pid_t pid; + long ret; + int listener; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + listener = user_trap_syscall(__NR_getpid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + EXPECT_GE(listener, 0); + + /* + * Check that nothing bad happens when we kill the task in the middle + * of a syscall. + */ + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ret = syscall(__NR_getpid); + exit(ret != USER_NOTIF_MAGIC); + } + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0); + + EXPECT_EQ(kill(pid, SIGKILL), 0); + EXPECT_EQ(waitpid(pid, NULL, 0), pid); + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1); + + resp.id = req.id; + ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp); + EXPECT_EQ(ret, -1); + EXPECT_EQ(errno, ENOENT); +} + +static int handled = -1; + +static void signal_handler(int signal) +{ + if (write(handled, "c", 1) != 1) + perror("write from signal"); +} + +TEST(user_notification_signal) +{ + pid_t pid; + long ret; + int status, listener, sk_pair[2]; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + char c; + + ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0); + + listener = user_trap_syscall(__NR_gettid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + EXPECT_GE(listener, 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(sk_pair[0]); + handled = sk_pair[1]; + if (signal(SIGUSR1, signal_handler) == SIG_ERR) { + perror("signal"); + exit(1); + } + /* + * ERESTARTSYS behavior is a bit hard to test, because we need + * to rely on a signal that has not yet been handled. Let's at + * least check that the error code gets propagated through, and + * hope that it doesn't break when there is actually a signal :) + */ + ret = syscall(__NR_gettid); + exit(!(ret == -1 && errno == 512)); + } + + close(sk_pair[1]); + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + + EXPECT_EQ(kill(pid, SIGUSR1), 0); + + /* + * Make sure the signal really is delivered, which means we're not + * stuck in the user notification code any more and the notification + * should be dead. + */ + EXPECT_EQ(read(sk_pair[0], &c, 1), 1); + + resp.id = req.id; + resp.error = -EPERM; + resp.val = 0; + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1); + EXPECT_EQ(errno, ENOENT); + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + + resp.id = req.id; + resp.error = -512; /* -ERESTARTSYS */ + resp.val = 0; + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST(user_notification_closed_listener) +{ + pid_t pid; + long ret; + int status, listener; + + listener = user_trap_syscall(__NR_getpid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + EXPECT_GE(listener, 0); + + /* + * Check that we get an ENOSYS when the listener is closed. + */ + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) { + close(listener); + ret = syscall(__NR_getpid); + exit(ret != -1 && errno != ENOSYS); + } + + close(listener); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +/* + * Check that a pid in a child namespace still shows up as valid in ours. + */ +TEST(user_notification_child_pid_ns) +{ + pid_t pid; + int status, listener; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + ASSERT_EQ(unshare(CLONE_NEWPID), 0); + + listener = user_trap_syscall(__NR_getpid, SECCOMP_FILTER_FLAG_NEW_LISTENER); + ASSERT_GE(listener, 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) + exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC); + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + EXPECT_EQ(req.pid, pid); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + close(listener); +} + +/* + * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e. + * invalid. + */ +TEST(user_notification_sibling_pid_ns) +{ + pid_t pid, pid2; + int status, listener; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + listener = user_trap_syscall(__NR_getpid, SECCOMP_FILTER_FLAG_NEW_LISTENER); + ASSERT_GE(listener, 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ASSERT_EQ(unshare(CLONE_NEWPID), 0); + + pid2 = fork(); + ASSERT_GE(pid2, 0); + + if (pid2 == 0) + exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC); + + EXPECT_EQ(waitpid(pid2, &status, 0), pid2); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + exit(WEXITSTATUS(status)); + } + + /* Create the sibling ns, and sibling in it. */ + EXPECT_EQ(unshare(CLONE_NEWPID), 0); + EXPECT_EQ(errno, 0); + + pid2 = fork(); + EXPECT_GE(pid2, 0); + + if (pid2 == 0) { + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + /* + * The pid should be 0, i.e. the task is in some namespace that + * we can't "see". + */ + ASSERT_EQ(req.pid, 0); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + exit(0); + } + + close(listener); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + EXPECT_EQ(waitpid(pid2, &status, 0), pid2); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST(user_notification_fault_recv) +{ + pid_t pid; + int status, listener; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + listener = user_trap_syscall(__NR_getpid, SECCOMP_FILTER_FLAG_NEW_LISTENER); + ASSERT_GE(listener, 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) + exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC); + + /* Do a bad recv() */ + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1); + EXPECT_EQ(errno, EFAULT); + + /* We should still be able to receive this notification, though. */ + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + EXPECT_EQ(req.pid, pid); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST(seccomp_get_notif_sizes) +{ + struct seccomp_notif_sizes sizes; + + EXPECT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0); + EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif)); + EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp)); +} + /* * TODO: * - add microbenchmarks -- cgit From 5b20c6fd6a60e182243da31c47f2ebff5b0e3d57 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 11 Dec 2018 11:37:44 -0500 Subject: timekeeping: Convert to DEFINE_SHOW_ATTRIBUTE Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code. Signed-off-by: Yangtao Li Signed-off-by: Thomas Gleixner Cc: john.stultz@linaro.org Cc: sboyd@kernel.org Link: https://lkml.kernel.org/r/20181211163744.22133-1-tiny.windzz@gmail.com --- kernel/time/timekeeping_debug.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f811882cfd13..86489950d690 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -19,7 +19,7 @@ static unsigned int sleep_time_bin[NUM_BINS] = {0}; -static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +static int tk_debug_sleep_time_show(struct seq_file *s, void *data) { unsigned int bin; seq_puts(s, " time (secs) count\n"); @@ -33,18 +33,7 @@ static int tk_debug_show_sleep_time(struct seq_file *s, void *data) } return 0; } - -static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) -{ - return single_open(file, tk_debug_show_sleep_time, NULL); -} - -static const struct file_operations tk_debug_sleep_time_fops = { - .open = tk_debug_sleep_time_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(tk_debug_sleep_time); static int __init tk_debug_sleep_time_init(void) { -- cgit From fdadd04931c2d7cd294dc5b2b342863f94be53a3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 11 Dec 2018 12:14:12 +0100 Subject: bpf: fix bpf_jit_limit knob for PAGE_SIZE >= 64K Michael and Sandipan report: Commit ede95a63b5 introduced a bpf_jit_limit tuneable to limit BPF JIT allocations. At compile time it defaults to PAGE_SIZE * 40000, and is adjusted again at init time if MODULES_VADDR is defined. For ppc64 kernels, MODULES_VADDR isn't defined, so we're stuck with the compile-time default at boot-time, which is 0x9c400000 when using 64K page size. This overflows the signed 32-bit bpf_jit_limit value: root@ubuntu:/tmp# cat /proc/sys/net/core/bpf_jit_limit -1673527296 and can cause various unexpected failures throughout the network stack. In one case `strace dhclient eth0` reported: setsockopt(5, SOL_SOCKET, SO_ATTACH_FILTER, {len=11, filter=0x105dd27f8}, 16) = -1 ENOTSUPP (Unknown error 524) and similar failures can be seen with tools like tcpdump. This doesn't always reproduce however, and I'm not sure why. The more consistent failure I've seen is an Ubuntu 18.04 KVM guest booted on a POWER9 host would time out on systemd/netplan configuring a virtio-net NIC with no noticeable errors in the logs. Given this and also given that in near future some architectures like arm64 will have a custom area for BPF JIT image allocations we should get rid of the BPF_JIT_LIMIT_DEFAULT fallback / default entirely. For 4.21, we have an overridable bpf_jit_alloc_exec(), bpf_jit_free_exec() so therefore add another overridable bpf_jit_alloc_exec_limit() helper function which returns the possible size of the memory area for deriving the default heuristic in bpf_jit_charge_init(). Like bpf_jit_alloc_exec() and bpf_jit_free_exec(), the new bpf_jit_alloc_exec_limit() assumes that module_alloc() is the default JIT memory provider, and therefore in case archs implement their custom module_alloc() we use MODULES_{END,_VADDR} for limits and otherwise for vmalloc_exec() cases like on ppc64 we use VMALLOC_{END,_START}. Additionally, for archs supporting large page sizes, we should change the sysctl to be handled as long to not run into sysctl restrictions in future. Fixes: ede95a63b5e8 ("bpf: add bpf_jit_limit knob to restrict unpriv allocations") Reported-by: Sandipan Das Reported-by: Michael Roth Signed-off-by: Daniel Borkmann Tested-by: Michael Roth Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- kernel/bpf/core.c | 21 +++++++++++++++------ net/core/sysctl_net_core.c | 20 +++++++++++++++++--- 3 files changed, 33 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 795ff0b869bb..a8b9d90a8042 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -861,7 +861,7 @@ bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; -extern int bpf_jit_limit; +extern long bpf_jit_limit; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b1a3545d0ec8..b2890c268cb3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -365,13 +365,11 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) } #ifdef CONFIG_BPF_JIT -# define BPF_JIT_LIMIT_DEFAULT (PAGE_SIZE * 40000) - /* All BPF JIT sysctl knobs here. */ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); int bpf_jit_harden __read_mostly; int bpf_jit_kallsyms __read_mostly; -int bpf_jit_limit __read_mostly = BPF_JIT_LIMIT_DEFAULT; +long bpf_jit_limit __read_mostly; static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, @@ -580,16 +578,27 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static atomic_long_t bpf_jit_current; +/* Can be overridden by an arch's JIT compiler if it has a custom, + * dedicated BPF backend memory area, or if neither of the two + * below apply. + */ +u64 __weak bpf_jit_alloc_exec_limit(void) +{ #if defined(MODULES_VADDR) + return MODULES_END - MODULES_VADDR; +#else + return VMALLOC_END - VMALLOC_START; +#endif +} + static int __init bpf_jit_charge_init(void) { /* Only used as heuristic here to derive limit. */ - bpf_jit_limit = min_t(u64, round_up((MODULES_END - MODULES_VADDR) >> 2, - PAGE_SIZE), INT_MAX); + bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2, + PAGE_SIZE), LONG_MAX); return 0; } pure_initcall(bpf_jit_charge_init); -#endif static int bpf_jit_charge_modmem(u32 pages) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 37b4667128a3..d67ec17f2cc8 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -28,6 +28,8 @@ static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; +static long long_one __maybe_unused = 1; +static long long_max __maybe_unused = LONG_MAX; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -289,6 +291,17 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } + +static int +proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} #endif static struct ctl_table net_core_table[] = { @@ -398,10 +411,11 @@ static struct ctl_table net_core_table[] = { { .procname = "bpf_jit_limit", .data = &bpf_jit_limit, - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0600, - .proc_handler = proc_dointvec_minmax_bpf_restricted, - .extra1 = &one, + .proc_handler = proc_dolongvec_minmax_bpf_restricted, + .extra1 = &long_one, + .extra2 = &long_max, }, #endif { -- cgit From 07c17732bd687567802aaa5fa5c101c2776565d1 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 11 Dec 2018 18:49:05 +0900 Subject: printk: Remove print_prefix() calls with NULL buffer. We can save lines/size by removing print_prefix() with buf == NULL. This patch makes no functional change. Link: http://lkml.kernel.org/r/1544521745-11925-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp To: Steven Rostedt Cc: linux-kernel@vger.kernel.org Signed-off-by: Tetsuo Handa Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c7d217764db3..91db332ccf4d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1228,13 +1228,15 @@ static inline void boot_delay_msec(int level) static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); +static size_t print_syslog(unsigned int level, char *buf) +{ + return sprintf(buf, "<%u>", level); +} + static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); - if (!buf) - return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts); - return sprintf(buf, "[%5lu.%06lu] ", (unsigned long)ts, rem_nsec / 1000); } @@ -1243,24 +1245,11 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, bool time, char *buf) { size_t len = 0; - unsigned int prefix = (msg->facility << 3) | msg->level; - - if (syslog) { - if (buf) { - len += sprintf(buf, "<%u>", prefix); - } else { - len += 3; - if (prefix > 999) - len += 3; - else if (prefix > 99) - len += 2; - else if (prefix > 9) - len++; - } - } + if (syslog) + len = print_syslog((msg->facility << 3) | msg->level, buf); if (time) - len += print_time(msg->ts_nsec, buf ? buf + len : NULL); + len += print_time(msg->ts_nsec, buf + len); return len; } @@ -1270,6 +1259,8 @@ static size_t msg_print_text(const struct printk_log *msg, bool syslog, const char *text = log_text(msg); size_t text_size = msg->text_len; size_t len = 0; + char prefix[PREFIX_MAX]; + const size_t prefix_len = print_prefix(msg, syslog, time, prefix); do { const char *next = memchr(text, '\n', text_size); @@ -1284,19 +1275,17 @@ static size_t msg_print_text(const struct printk_log *msg, bool syslog, } if (buf) { - if (print_prefix(msg, syslog, time, NULL) + - text_len + 1 >= size - len) + if (prefix_len + text_len + 1 >= size - len) break; - len += print_prefix(msg, syslog, time, buf + len); + memcpy(buf + len, prefix, prefix_len); + len += prefix_len; memcpy(buf + len, text, text_len); len += text_len; buf[len++] = '\n'; } else { /* SYSLOG_ACTION_* buffer size only calculation */ - len += print_prefix(msg, syslog, time, NULL); - len += text_len; - len++; + len += prefix_len + text_len + 1; } text = next; -- cgit From 943a10f85265d4c14753d445003b6f5d8e3d959a Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 11 Dec 2018 11:20:48 -0500 Subject: PM / sleep: convert to DEFINE_SHOW_ATTRIBUTE Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code. Signed-off-by: Yangtao Li Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 35b50823d83b..98e76cad128b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -318,23 +318,12 @@ static int suspend_stats_show(struct seq_file *s, void *unused) return 0; } - -static int suspend_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, suspend_stats_show, NULL); -} - -static const struct file_operations suspend_stats_operations = { - .open = suspend_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(suspend_stats); static int __init pm_debugfs_init(void) { debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, - NULL, NULL, &suspend_stats_operations); + NULL, NULL, &suspend_stats_fops); return 0; } -- cgit From 1b2b234b1318afb3775d4c6624fd5a96558f19df Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 10 Dec 2018 15:43:00 -0800 Subject: bpf: pass struct btf pointer to the map_check_btf() callback If key_type or value_type are of non-trivial data types (e.g. structure or typedef), it's not possible to check them without the additional information, which can't be obtained without a pointer to the btf structure. So, let's pass btf pointer to the map_check_btf() callbacks. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ kernel/bpf/arraymap.c | 1 + kernel/bpf/lpm_trie.c | 1 + kernel/bpf/syscall.c | 3 ++- 4 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0c992b86eb2c..e734f163bd0b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,6 +23,7 @@ struct bpf_prog; struct bpf_map; struct sock; struct seq_file; +struct btf; struct btf_type; /* map is generic key/value storage optionally accesible by eBPF programs */ @@ -52,6 +53,7 @@ struct bpf_map_ops { void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); int (*map_check_btf)(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); }; @@ -126,6 +128,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map) } int map_check_no_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 24583da9ffd1..25632a75d630 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -382,6 +382,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, } static int array_map_check_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index bfd4882e1106..abf1002080df 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -728,6 +728,7 @@ free_stack: } static int trie_check_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5745c7837621..70fb11106fc2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -456,6 +456,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) } int map_check_no_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { @@ -478,7 +479,7 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf, return -EINVAL; if (map->ops->map_check_btf) - ret = map->ops->map_check_btf(map, key_type, value_type); + ret = map->ops->map_check_btf(map, btf, key_type, value_type); return ret; } -- cgit From 9a1126b63190e2541dd5d643f4bfeb5a7f493729 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 10 Dec 2018 15:43:01 -0800 Subject: bpf: add bpffs pretty print for cgroup local storage maps Implement bpffs pretty printing for cgroup local storage maps (both shared and per-cpu). Output example (captured for tools/testing/selftests/bpf/netcnt_prog.c): Shared: $ cat /sys/fs/bpf/map_2 # WARNING!! The output is for debug purpose only # WARNING!! The output format will change {4294968594,1}: {9999,1039896} Per-cpu: $ cat /sys/fs/bpf/map_1 # WARNING!! The output is for debug purpose only # WARNING!! The output format will change {4294968594,1}: { cpu0: {0,0,0,0,0} cpu1: {0,0,0,0,0} cpu2: {1,104,0,0,0} cpu3: {0,0,0,0,0} } Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + kernel/bpf/btf.c | 22 +++++++++++ kernel/bpf/local_storage.c | 93 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 115 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index b98405a56383..a4cf075b89eb 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -47,6 +47,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); bool btf_name_offset_valid(const struct btf *btf, u32 offset); +bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index bf34933cc413..1545ddfb6fa5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -514,6 +514,28 @@ static bool btf_type_int_is_regular(const struct btf_type *t) return true; } +/* + * Check that given type is a regular int and has the expected size. + */ +bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size) +{ + u8 nr_bits, nr_bytes; + u32 int_data; + + if (!btf_type_is_int(t)) + return false; + + int_data = btf_type_int(t); + nr_bits = BTF_INT_BITS(int_data); + nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + if (BITS_PER_BYTE_MASKED(nr_bits) || + BTF_INT_OFFSET(int_data) || + nr_bytes != expected_size) + return false; + + return true; +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index b65017dead44..5eca03da0989 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -1,11 +1,13 @@ //SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include #include #include +#include DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); @@ -308,6 +310,94 @@ static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } +static int cgroup_storage_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + const struct btf_type *t; + struct btf_member *m; + u32 id, size; + + /* Key is expected to be of struct bpf_cgroup_storage_key type, + * which is: + * struct bpf_cgroup_storage_key { + * __u64 cgroup_inode_id; + * __u32 attach_type; + * }; + */ + + /* + * Key_type must be a structure with two fields. + */ + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || + BTF_INFO_VLEN(key_type->info) != 2) + return -EINVAL; + + /* + * The first field must be a 64 bit integer at 0 offset. + */ + m = (struct btf_member *)(key_type + 1); + if (m->offset) + return -EINVAL; + id = m->type; + t = btf_type_id_size(btf, &id, NULL); + size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id); + if (!t || !btf_type_is_reg_int(t, size)) + return -EINVAL; + + /* + * The second field must be a 32 bit integer at 64 bit offset. + */ + m++; + if (m->offset != offsetof(struct bpf_cgroup_storage_key, attach_type) * + BITS_PER_BYTE) + return -EINVAL; + id = m->type; + t = btf_type_id_size(btf, &id, NULL); + size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type); + if (!t || !btf_type_is_reg_int(t, size)) + return -EINVAL; + + return 0; +} + +static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, + struct seq_file *m) +{ + enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + int cpu; + + rcu_read_lock(); + storage = cgroup_storage_lookup(map_to_storage(map), key, false); + if (!storage) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + stype = cgroup_storage_type(map); + if (stype == BPF_CGROUP_STORAGE_SHARED) { + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, + &READ_ONCE(storage->buf)->data[0], m); + seq_puts(m, "\n"); + } else { + seq_puts(m, ": {\n"); + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(storage->percpu_buf, cpu), + m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + } + rcu_read_unlock(); +} + const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, @@ -315,7 +405,8 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_lookup_elem = cgroup_storage_lookup_elem, .map_update_elem = cgroup_storage_update_elem, .map_delete_elem = cgroup_storage_delete_elem, - .map_check_btf = map_check_no_btf, + .map_check_btf = cgroup_storage_check_btf, + .map_seq_show_elem = cgroup_storage_seq_show_elem, }; int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) -- cgit From 06459901d55ee2f690b8e1fe084fb03061d617cf Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 9 Nov 2018 18:21:32 +0100 Subject: irq/irq_sim: Store multiple interrupt offsets in a bitmap Two threads can try to fire the irq_sim with different offsets and will end up fighting for the irq_work asignment. Thomas Gleixner suggested a solution based on a bitfield where we set a bit for every offset associated with an interrupt that should be fired and then iterate over all set bits in the interrupt handler. This is a slightly modified solution using a bitmap so that we don't impose a limit on the number of interrupts one can allocate with irq_sim. Suggested-by: Thomas Gleixner Signed-off-by: Bartosz Golaszewski Signed-off-by: Marc Zyngier --- include/linux/irq_sim.h | 2 +- kernel/irq/irq_sim.c | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq_sim.h b/include/linux/irq_sim.h index 630a57e55db6..4500d453a63e 100644 --- a/include/linux/irq_sim.h +++ b/include/linux/irq_sim.h @@ -16,7 +16,7 @@ struct irq_sim_work_ctx { struct irq_work work; - int irq; + unsigned long *pending; }; struct irq_sim_irq_ctx { diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index dd20d0d528d4..98a20e1594ce 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -34,9 +34,20 @@ static struct irq_chip irq_sim_irqchip = { static void irq_sim_handle_irq(struct irq_work *work) { struct irq_sim_work_ctx *work_ctx; + unsigned int offset = 0; + struct irq_sim *sim; + int irqnum; work_ctx = container_of(work, struct irq_sim_work_ctx, work); - handle_simple_irq(irq_to_desc(work_ctx->irq)); + sim = container_of(work_ctx, struct irq_sim, work_ctx); + + while (!bitmap_empty(work_ctx->pending, sim->irq_count)) { + offset = find_next_bit(work_ctx->pending, + sim->irq_count, offset); + clear_bit(offset, work_ctx->pending); + irqnum = irq_sim_irqnum(sim, offset); + handle_simple_irq(irq_to_desc(irqnum)); + } } /** @@ -63,6 +74,13 @@ int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) return sim->irq_base; } + sim->work_ctx.pending = bitmap_zalloc(num_irqs, GFP_KERNEL); + if (!sim->work_ctx.pending) { + kfree(sim->irqs); + irq_free_descs(sim->irq_base, num_irqs); + return -ENOMEM; + } + for (i = 0; i < num_irqs; i++) { sim->irqs[i].irqnum = sim->irq_base + i; sim->irqs[i].enabled = false; @@ -89,6 +107,7 @@ EXPORT_SYMBOL_GPL(irq_sim_init); void irq_sim_fini(struct irq_sim *sim) { irq_work_sync(&sim->work_ctx.work); + bitmap_free(sim->work_ctx.pending); irq_free_descs(sim->irq_base, sim->irq_count); kfree(sim->irqs); } @@ -143,7 +162,7 @@ EXPORT_SYMBOL_GPL(devm_irq_sim_init); void irq_sim_fire(struct irq_sim *sim, unsigned int offset) { if (sim->irqs[offset].enabled) { - sim->work_ctx.irq = irq_sim_irqnum(sim, offset); + set_bit(offset, sim->work_ctx.pending); irq_work_queue(&sim->work_ctx.work); } } -- cgit From 9e794163a69c103633fefb10a3879408d4e4e2c8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 12 Dec 2018 10:18:21 -0800 Subject: bpf: Remove bpf_dump_raw_ok() check for func_info and line_info The func_info and line_info have the bpf insn offset but they do not contain kernel address. They will still be useful for the userspace tool to annotate the xlated insn. This patch removes the bpf_dump_raw_ok() guard for the func_info and line_info during bpf_prog_get_info_by_fd(). The guard stays for jited_line_info which contains the kernel address. Although this bpf_dump_raw_ok() guard behavior has started since the earlier func_info patch series, I marked the Fixes tag to the latest line_info patch series which contains both func_info and line_info and this patch is fixing for both of them. Fixes: c454a46b5efd ("bpf: Add bpf_line_info support") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 70fb11106fc2..b7c585838c72 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2272,33 +2272,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; if (info.nr_func_info && ulen) { - if (bpf_dump_raw_ok()) { - char __user *user_finfo; + char __user *user_finfo; - user_finfo = u64_to_user_ptr(info.func_info); - ulen = min_t(u32, info.nr_func_info, ulen); - if (copy_to_user(user_finfo, prog->aux->func_info, - info.func_info_rec_size * ulen)) - return -EFAULT; - } else { - info.func_info = 0; - } + user_finfo = u64_to_user_ptr(info.func_info); + ulen = min_t(u32, info.nr_func_info, ulen); + if (copy_to_user(user_finfo, prog->aux->func_info, + info.func_info_rec_size * ulen)) + return -EFAULT; } ulen = info.nr_line_info; info.nr_line_info = prog->aux->nr_linfo; if (info.nr_line_info && ulen) { - if (bpf_dump_raw_ok()) { - __u8 __user *user_linfo; + __u8 __user *user_linfo; - user_linfo = u64_to_user_ptr(info.line_info); - ulen = min_t(u32, info.nr_line_info, ulen); - if (copy_to_user(user_linfo, prog->aux->linfo, - info.line_info_rec_size * ulen)) - return -EFAULT; - } else { - info.line_info = 0; - } + user_linfo = u64_to_user_ptr(info.line_info); + ulen = min_t(u32, info.nr_line_info, ulen); + if (copy_to_user(user_linfo, prog->aux->linfo, + info.line_info_rec_size * ulen)) + return -EFAULT; } ulen = info.nr_jited_line_info; -- cgit From c872bdb38febb4c31ece3599c52cf1f833b89f4e Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 12 Dec 2018 09:37:46 -0800 Subject: bpf: include sub program tags in bpf_prog_info Changes v2 -> v3: 1. remove check for bpf_dump_raw_ok(). Changes v1 -> v2: 1. Fix error path as Martin suggested. This patch adds nr_prog_tags and prog_tags to bpf_prog_info. This is a reliable way for user space to get tags of all sub programs. Before this patch, user space need to find sub program tags via kallsyms. This feature will be used in BPF introspection, where user space queries information about BPF programs via sys_bpf. Signed-off-by: Song Liu Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa582cd5bfcf..e7d57e89f25f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2717,6 +2717,8 @@ struct bpf_prog_info { __u32 nr_jited_line_info; __u32 line_info_rec_size; __u32 jited_line_info_rec_size; + __u32 nr_prog_tags; + __aligned_u64 prog_tags; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b7c585838c72..7f1410d6fbe9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2315,6 +2315,28 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } } + ulen = info.nr_prog_tags; + info.nr_prog_tags = prog->aux->func_cnt ? : 1; + if (ulen) { + __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; + u32 i; + + user_prog_tags = u64_to_user_ptr(info.prog_tags); + ulen = min_t(u32, info.nr_prog_tags, ulen); + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + if (copy_to_user(user_prog_tags[i], + prog->aux->func[i]->tag, + BPF_TAG_SIZE)) + return -EFAULT; + } + } else { + if (copy_to_user(user_prog_tags[0], + prog->tag, BPF_TAG_SIZE)) + return -EFAULT; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit From ba830885656414101b2f8ca88786524d4bb5e8c1 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Fri, 7 Dec 2018 18:39:28 +0000 Subject: arm64: add prctl control for resetting ptrauth keys Add an arm64-specific prctl to allow a thread to reinitialize its pointer authentication keys to random values. This can be useful when exec() is not used for starting new processes, to ensure that different processes still have different keys. Signed-off-by: Kristina Martsenko Signed-off-by: Will Deacon --- arch/arm64/include/asm/pointer_auth.h | 3 +++ arch/arm64/include/asm/processor.h | 4 +++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/pointer_auth.c | 47 +++++++++++++++++++++++++++++++++++ include/uapi/linux/prctl.h | 8 ++++++ kernel/sys.c | 8 ++++++ 6 files changed, 71 insertions(+) create mode 100644 arch/arm64/kernel/pointer_auth.c (limited to 'kernel') diff --git a/arch/arm64/include/asm/pointer_auth.h b/arch/arm64/include/asm/pointer_auth.h index 5ccf49b4dac3..80eb03afd677 100644 --- a/arch/arm64/include/asm/pointer_auth.h +++ b/arch/arm64/include/asm/pointer_auth.h @@ -63,6 +63,8 @@ static inline void ptrauth_keys_switch(struct ptrauth_keys *keys) __ptrauth_key_install(APGA, keys->apga); } +extern int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg); + /* * The EL0 pointer bits used by a pointer authentication code. * This is dependent on TBI0 being enabled, or bits 63:56 would also apply. @@ -86,6 +88,7 @@ do { \ ptrauth_keys_switch(&(tsk)->thread_info.keys_user) #else /* CONFIG_ARM64_PTR_AUTH */ +#define ptrauth_prctl_reset_keys(tsk, arg) (-EINVAL) #define ptrauth_strip_insn_pac(lr) (lr) #define ptrauth_thread_init_user(tsk) #define ptrauth_thread_switch(tsk) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index f4b8e09aff56..142c708cb429 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -289,6 +290,9 @@ extern void __init minsigstksz_setup(void); #define SVE_SET_VL(arg) sve_set_current_vl(arg) #define SVE_GET_VL() sve_get_current_vl() +/* PR_PAC_RESET_KEYS prctl */ +#define PAC_RESET_KEYS(tsk, arg) ptrauth_prctl_reset_keys(tsk, arg) + /* * For CONFIG_GCC_PLUGIN_STACKLEAK * diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 583334ce1c2c..df08d735b21d 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -58,6 +58,7 @@ arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o arm64-obj-$(CONFIG_CRASH_CORE) += crash_core.o arm64-obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o arm64-obj-$(CONFIG_ARM64_SSBD) += ssbd.o +arm64-obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-y += $(arm64-obj-y) vdso/ probes/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/pointer_auth.c b/arch/arm64/kernel/pointer_auth.c new file mode 100644 index 000000000000..b9f6f5f3409a --- /dev/null +++ b/arch/arm64/kernel/pointer_auth.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg) +{ + struct ptrauth_keys *keys = &tsk->thread_info.keys_user; + unsigned long addr_key_mask = PR_PAC_APIAKEY | PR_PAC_APIBKEY | + PR_PAC_APDAKEY | PR_PAC_APDBKEY; + unsigned long key_mask = addr_key_mask | PR_PAC_APGAKEY; + + if (!system_supports_address_auth() && !system_supports_generic_auth()) + return -EINVAL; + + if (!arg) { + ptrauth_keys_init(keys); + ptrauth_keys_switch(keys); + return 0; + } + + if (arg & ~key_mask) + return -EINVAL; + + if (((arg & addr_key_mask) && !system_supports_address_auth()) || + ((arg & PR_PAC_APGAKEY) && !system_supports_generic_auth())) + return -EINVAL; + + if (arg & PR_PAC_APIAKEY) + get_random_bytes(&keys->apia, sizeof(keys->apia)); + if (arg & PR_PAC_APIBKEY) + get_random_bytes(&keys->apib, sizeof(keys->apib)); + if (arg & PR_PAC_APDAKEY) + get_random_bytes(&keys->apda, sizeof(keys->apda)); + if (arg & PR_PAC_APDBKEY) + get_random_bytes(&keys->apdb, sizeof(keys->apdb)); + if (arg & PR_PAC_APGAKEY) + get_random_bytes(&keys->apga, sizeof(keys->apga)); + + ptrauth_keys_switch(keys); + + return 0; +} diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index c0d7ea0bf5b6..0f535a501391 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -219,4 +219,12 @@ struct prctl_mm_map { # define PR_SPEC_DISABLE (1UL << 2) # define PR_SPEC_FORCE_DISABLE (1UL << 3) +/* Reset arm64 pointer authentication keys */ +#define PR_PAC_RESET_KEYS 54 +# define PR_PAC_APIAKEY (1UL << 0) +# define PR_PAC_APIBKEY (1UL << 1) +# define PR_PAC_APDAKEY (1UL << 2) +# define PR_PAC_APDBKEY (1UL << 3) +# define PR_PAC_APGAKEY (1UL << 4) + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 123bd73046ec..64b5a230f38d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -121,6 +121,9 @@ #ifndef SVE_GET_VL # define SVE_GET_VL() (-EINVAL) #endif +#ifndef PAC_RESET_KEYS +# define PAC_RESET_KEYS(a, b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2476,6 +2479,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = arch_prctl_spec_ctrl_set(me, arg2, arg3); break; + case PR_PAC_RESET_KEYS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = PAC_RESET_KEYS(me, arg2); + break; default: error = -EINVAL; break; -- cgit From 7640ead939247e91e84b7ec6ec001f30193cc7df Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 12 Dec 2018 16:29:07 -0800 Subject: bpf: verifier: make sure callees don't prune with caller differences Currently for liveness and state pruning the register parentage chains don't include states of the callee. This makes some sense as the callee can't access those registers. However, this means that READs done after the callee returns will not propagate into the states of the callee. Callee will then perform pruning disregarding differences in caller state. Example: 0: (85) call bpf_user_rnd_u32 1: (b7) r8 = 0 2: (55) if r0 != 0x0 goto pc+1 3: (b7) r8 = 1 4: (bf) r1 = r8 5: (85) call pc+4 6: (15) if r8 == 0x1 goto pc+1 7: (05) *(u64 *)(r9 - 8) = r3 8: (b7) r0 = 0 9: (95) exit 10: (15) if r1 == 0x0 goto pc+0 11: (95) exit Here we acquire unknown state with call to get_random() [1]. Then we store this random state in r8 (either 0 or 1) [1 - 3], and make a call on line 5. Callee does nothing but a trivial conditional jump (to create a pruning point). Upon return caller checks the state of r8 and either performs an unsafe read or not. Verifier will first explore the path with r8 == 1, creating a pruning point at [11]. The parentage chain for r8 will include only callers states so once verifier reaches [6] it will mark liveness only on states in the caller, and not [11]. Now when verifier walks the paths with r8 == 0 it will reach [11] and since REG_LIVE_READ on r8 was not propagated there it will prune the walk entirely (stop walking the entire program, not just the callee). Since [6] was never walked with r8 == 0, [7] will be considered dead and replaced with "goto -1" causing hang at runtime. This patch weaves the callee's explored states onto the callers parentage chain. Rough parentage for r8 would have looked like this before: [0] [1] [2] [3] [4] [5] [10] [11] [6] [7] | | ,---|----. | | | sl0: sl0: / sl0: \ sl0: sl0: sl0: fr0: r8 <-- fr0: r8<+--fr0: r8 `fr0: r8 ,fr0: r8<-fr0: r8 \ fr1: r8 <- fr1: r8 / \__________________/ after: [0] [1] [2] [3] [4] [5] [10] [11] [6] [7] | | | | | | sl0: sl0: sl0: sl0: sl0: sl0: fr0: r8 <-- fr0: r8 <- fr0: r8 <- fr0: r8 <-fr0: r8<-fr0: r8 fr1: r8 <- fr1: r8 Now the mark from instruction 6 will travel through callees states. Note that we don't have to connect r0 because its overwritten by callees state on return and r1 - r5 because those are not alive any more once a call is made. v2: - don't connect the callees registers twice (Alexei: suggestion & code) - add more details to the comment (Ed & Alexei) v1: don't unnecessarily link caller saved regs (Jiong) Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Reported-by: David Beckett Signed-off-by: Jakub Kicinski Reviewed-by: Jiong Wang Reviewed-by: Edward Cree Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 ++++++++++--- tools/testing/selftests/bpf/test_verifier.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fc760d00a38c..51ba84d4d34a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5102,9 +5102,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; - /* connect new state to parentage chain */ - for (i = 0; i < BPF_REG_FP; i++) - cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i]; + /* connect new state to parentage chain. Current frame needs all + * registers connected. Only r6 - r9 of the callers are alive (pushed + * to the stack implicitly by JITs) so in callers' frames connect just + * r6 - r9 as an optimization. Callers will have r1 - r5 connected to + * the state of the call instruction (with WRITTEN set), and r0 comes + * from callee with its full parentage chain, anyway. + */ + for (j = 0; j <= cur->curframe; j++) + for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) + cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 37583267bdbc..f8eac4a544f4 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -13915,6 +13915,34 @@ static struct bpf_test tests[] = { .result_unpriv = REJECT, .result = ACCEPT, }, + { + "calls: cross frame pruning", + .insns = { + /* r8 = !!random(); + * call pruner() + * if (r8) + * do something bad; + */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_prandom_u32), + BPF_MOV64_IMM(BPF_REG_8, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_8, 1), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 1, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, + .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .result_unpriv = REJECT, + .errstr = "!read_ok", + .result = REJECT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit From 20b105feda8d42360bd690b8fdf6b2f00ba4a993 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 4 Dec 2018 08:04:31 -0800 Subject: dma-mapping: remove a pointless memset in dma_atomic_pool_init We already zero the memory after allocating it from the pool that this function fills, and having the memset here in this form means we can't support CMA highmem allocations. Signed-off-by: Christoph Hellwig Reported-by: Russell King - ARM Linux --- kernel/dma/remap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 8a44317cfc1a..18cc09fc27b9 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -121,7 +121,6 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot) if (!page) goto out; - memset(page_address(page), 0, atomic_pool_size); arch_dma_prep_coherent(page, atomic_pool_size); atomic_pool = gen_pool_create(PAGE_SHIFT, -1); -- cgit From 8d59b5f2a44611d7327a2a14b36090d692186f60 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Dec 2018 14:58:59 +0100 Subject: dma-mapping: simplify the dma_sync_single_range_for_{cpu,device} implementation We can just call the regular calls after adding offset the the address instead of reimplementing them. Signed-off-by: Christoph Hellwig Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Tony Luck --- include/linux/dma-debug.h | 27 --------------------------- include/linux/dma-mapping.h | 34 ++++++++++------------------------ kernel/dma/debug.c | 42 ------------------------------------------ 3 files changed, 10 insertions(+), 93 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 46e6131a72b6..2ad5c363d7d5 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -70,17 +70,6 @@ extern void debug_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, int direction); -extern void debug_dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, - int direction); - -extern void debug_dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, int direction); - extern void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, int direction); @@ -167,22 +156,6 @@ static inline void debug_dma_sync_single_for_device(struct device *dev, { } -static inline void debug_dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, - int direction) -{ -} - -static inline void debug_dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, - int direction) -{ -} - static inline void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, int direction) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 7799c2b27849..8916499d2805 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -360,6 +360,13 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, debug_dma_sync_single_for_cpu(dev, addr, size, dir); } +static inline void dma_sync_single_range_for_cpu(struct device *dev, + dma_addr_t addr, unsigned long offset, size_t size, + enum dma_data_direction dir) +{ + return dma_sync_single_for_cpu(dev, addr + offset, size, dir); +} + static inline void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) @@ -372,32 +379,11 @@ static inline void dma_sync_single_for_device(struct device *dev, debug_dma_sync_single_for_device(dev, addr, size, dir); } -static inline void dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t addr, - unsigned long offset, - size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_cpu) - ops->sync_single_for_cpu(dev, addr + offset, size, dir); - debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); -} - static inline void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t addr, - unsigned long offset, - size_t size, - enum dma_data_direction dir) + dma_addr_t addr, unsigned long offset, size_t size, + enum dma_data_direction dir) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_device) - ops->sync_single_for_device(dev, addr + offset, size, dir); - debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir); + return dma_sync_single_for_device(dev, addr + offset, size, dir); } static inline void diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 20ab0f6c1b70..164706da2a73 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1633,48 +1633,6 @@ void debug_dma_sync_single_for_device(struct device *dev, } EXPORT_SYMBOL(debug_dma_sync_single_for_device); -void debug_dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, size_t size, - int direction) -{ - struct dma_debug_entry ref; - - if (unlikely(dma_debug_disabled())) - return; - - ref.type = dma_debug_single; - ref.dev = dev; - ref.dev_addr = dma_handle; - ref.size = offset + size; - ref.direction = direction; - ref.sg_call_ents = 0; - - check_sync(dev, &ref, true); -} -EXPORT_SYMBOL(debug_dma_sync_single_range_for_cpu); - -void debug_dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, int direction) -{ - struct dma_debug_entry ref; - - if (unlikely(dma_debug_disabled())) - return; - - ref.type = dma_debug_single; - ref.dev = dev; - ref.dev_addr = dma_handle; - ref.size = offset + size; - ref.direction = direction; - ref.sg_call_ents = 0; - - check_sync(dev, &ref, false); -} -EXPORT_SYMBOL(debug_dma_sync_single_range_for_device); - void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, int direction) { -- cgit From 05887cb610a54bf568de7f0bc07c4a64e45ac6f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Dec 2018 12:25:54 -0800 Subject: dma-mapping: move dma_get_required_mask to kernel/dma dma_get_required_mask should really be with the rest of the DMA mapping implementation instead of in drivers/base as a lone outlier. Signed-off-by: Christoph Hellwig Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Tony Luck --- drivers/base/platform.c | 31 ------------------------------- kernel/dma/mapping.c | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 41b91af95afb..eae841935a45 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -1179,37 +1179,6 @@ int __init platform_bus_init(void) return error; } -#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK -static u64 dma_default_get_required_mask(struct device *dev) -{ - u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT); - u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT)); - u64 mask; - - if (!high_totalram) { - /* convert to mask just covering totalram */ - low_totalram = (1 << (fls(low_totalram) - 1)); - low_totalram += low_totalram - 1; - mask = low_totalram; - } else { - high_totalram = (1 << (fls(high_totalram) - 1)); - high_totalram += high_totalram - 1; - mask = (((u64)high_totalram) << 32) + 0xffffffff; - } - return mask; -} - -u64 dma_get_required_mask(struct device *dev) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - if (ops->get_required_mask) - return ops->get_required_mask(dev); - return dma_default_get_required_mask(dev); -} -EXPORT_SYMBOL_GPL(dma_get_required_mask); -#endif - static __initdata LIST_HEAD(early_platform_driver_list); static __initdata LIST_HEAD(early_platform_device_list); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index dfbc3deb95cd..dfe29d18dba1 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -5,7 +5,7 @@ * Copyright (c) 2006 SUSE Linux Products GmbH * Copyright (c) 2006 Tejun Heo */ - +#include /* for max_pfn */ #include #include #include @@ -262,3 +262,35 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, #endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ } EXPORT_SYMBOL(dma_common_mmap); + +#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK +static u64 dma_default_get_required_mask(struct device *dev) +{ + u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT); + u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT)); + u64 mask; + + if (!high_totalram) { + /* convert to mask just covering totalram */ + low_totalram = (1 << (fls(low_totalram) - 1)); + low_totalram += low_totalram - 1; + mask = low_totalram; + } else { + high_totalram = (1 << (fls(high_totalram) - 1)); + high_totalram += high_totalram - 1; + mask = (((u64)high_totalram) << 32) + 0xffffffff; + } + return mask; +} + +u64 dma_get_required_mask(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops->get_required_mask) + return ops->get_required_mask(dev); + return dma_default_get_required_mask(dev); +} +EXPORT_SYMBOL_GPL(dma_get_required_mask); +#endif + -- cgit From 7249c1a52df9967cd23550f3dc24fb6ca43cdc6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Dec 2018 12:43:30 -0800 Subject: dma-mapping: move various slow path functions out of line There is no need to have all setup and coherent allocation / freeing routines inline. Move them out of line to keep the implemeation nicely encapsulated and save some kernel text size. Signed-off-by: Christoph Hellwig Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Tony Luck --- arch/powerpc/include/asm/dma-mapping.h | 1 - include/linux/dma-mapping.h | 150 +++------------------------------ kernel/dma/mapping.c | 140 +++++++++++++++++++++++++++++- 3 files changed, 151 insertions(+), 140 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 8fa394520af6..5201f2b7838c 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -108,7 +108,6 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off) } #define HAVE_ARCH_DMA_SET_MASK 1 -extern int dma_set_mask(struct device *dev, u64 dma_mask); extern u64 __dma_get_required_mask(struct device *dev); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 3b431cc58794..0bbce52606c2 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -440,107 +440,24 @@ bool dma_in_atomic_pool(void *start, size_t size); void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags); bool dma_free_from_pool(void *start, size_t size); -/** - * dma_mmap_attrs - map a coherent DMA allocation into user space - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @vma: vm_area_struct describing requested user mapping - * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs - * @handle: device-view address returned from dma_alloc_attrs - * @size: size of memory originally requested in dma_alloc_attrs - * @attrs: attributes of mapping properties requested in dma_alloc_attrs - * - * Map a coherent DMA buffer previously allocated by dma_alloc_attrs - * into user space. The coherent DMA buffer must not be freed by the - * driver until the user space mapping has been released. - */ -static inline int -dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, - dma_addr_t dma_addr, size_t size, unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - BUG_ON(!ops); - if (ops->mmap) - return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); - return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); -} - +int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); #define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, 0) int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); -static inline int -dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, - dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - BUG_ON(!ops); - if (ops->get_sgtable) - return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, - attrs); - return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, - attrs); -} - +int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, 0) -#ifndef arch_dma_alloc_attrs -#define arch_dma_alloc_attrs(dev) (true) -#endif - -static inline void *dma_alloc_attrs(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - void *cpu_addr; - - BUG_ON(!ops); - WARN_ON_ONCE(dev && !dev->coherent_dma_mask); - - if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) - return cpu_addr; - - /* let the implementation decide on the zone to allocate from: */ - flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - - if (!arch_dma_alloc_attrs(&dev)) - return NULL; - if (!ops->alloc) - return NULL; - - cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); - debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); - return cpu_addr; -} - -static inline void dma_free_attrs(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!ops); - - if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr)) - return; - /* - * On non-coherent platforms which implement DMA-coherent buffers via - * non-cacheable remaps, ops->free() may call vunmap(). Thus getting - * this far in IRQ context is a) at risk of a BUG_ON() or trying to - * sleep on some machines, and b) an indication that the driver is - * probably misusing the coherent API anyway. - */ - WARN_ON(irqs_disabled()); - - if (!ops->free || !cpu_addr) - return; - - debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); - ops->free(dev, size, cpu_addr, dma_handle, attrs); -} +void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t flag, unsigned long attrs); +void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_handle, unsigned long attrs); static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) @@ -565,35 +482,9 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) return 0; } -static inline void dma_check_mask(struct device *dev, u64 mask) -{ - if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1))) - dev_warn(dev, "SME is active, device will require DMA bounce buffers\n"); -} - -static inline int dma_supported(struct device *dev, u64 mask) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - if (!ops) - return 0; - if (!ops->dma_supported) - return 1; - return ops->dma_supported(dev, mask); -} - -#ifndef HAVE_ARCH_DMA_SET_MASK -static inline int dma_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - dma_check_mask(dev, mask); - - *dev->dma_mask = mask; - return 0; -} -#endif +int dma_supported(struct device *dev, u64 mask); +int dma_set_mask(struct device *dev, u64 mask); +int dma_set_coherent_mask(struct device *dev, u64 mask); static inline u64 dma_get_mask(struct device *dev) { @@ -602,21 +493,6 @@ static inline u64 dma_get_mask(struct device *dev) return DMA_BIT_MASK(32); } -#ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK -int dma_set_coherent_mask(struct device *dev, u64 mask); -#else -static inline int dma_set_coherent_mask(struct device *dev, u64 mask) -{ - if (!dma_supported(dev, mask)) - return -EIO; - - dma_check_mask(dev, mask); - - dev->coherent_dma_mask = mask; - return 0; -} -#endif - /* * Set both the DMA mask and the coherent DMA mask to the same thing. * Note that we don't check the return value from dma_set_coherent_mask() diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index dfe29d18dba1..176ae3e08916 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -223,7 +223,20 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); return ret; } -EXPORT_SYMBOL(dma_common_get_sgtable); + +int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + BUG_ON(!ops); + if (ops->get_sgtable) + return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, + attrs); + return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, + attrs); +} +EXPORT_SYMBOL(dma_get_sgtable_attrs); /* * Create userspace mapping for the DMA-coherent memory. @@ -261,7 +274,31 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, return -ENXIO; #endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ } -EXPORT_SYMBOL(dma_common_mmap); + +/** + * dma_mmap_attrs - map a coherent DMA allocation into user space + * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices + * @vma: vm_area_struct describing requested user mapping + * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs + * @dma_addr: device-view address returned from dma_alloc_attrs + * @size: size of memory originally requested in dma_alloc_attrs + * @attrs: attributes of mapping properties requested in dma_alloc_attrs + * + * Map a coherent DMA buffer previously allocated by dma_alloc_attrs into user + * space. The coherent DMA buffer must not be freed by the driver until the + * user space mapping has been released. + */ +int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + BUG_ON(!ops); + if (ops->mmap) + return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); + return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); +} +EXPORT_SYMBOL(dma_mmap_attrs); #ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK static u64 dma_default_get_required_mask(struct device *dev) @@ -294,3 +331,102 @@ u64 dma_get_required_mask(struct device *dev) EXPORT_SYMBOL_GPL(dma_get_required_mask); #endif +#ifndef arch_dma_alloc_attrs +#define arch_dma_alloc_attrs(dev) (true) +#endif + +void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t flag, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + void *cpu_addr; + + BUG_ON(!ops); + WARN_ON_ONCE(dev && !dev->coherent_dma_mask); + + if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) + return cpu_addr; + + /* let the implementation decide on the zone to allocate from: */ + flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); + + if (!arch_dma_alloc_attrs(&dev)) + return NULL; + if (!ops->alloc) + return NULL; + + cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); + debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); + return cpu_addr; +} +EXPORT_SYMBOL(dma_alloc_attrs); + +void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_handle, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!ops); + + if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr)) + return; + /* + * On non-coherent platforms which implement DMA-coherent buffers via + * non-cacheable remaps, ops->free() may call vunmap(). Thus getting + * this far in IRQ context is a) at risk of a BUG_ON() or trying to + * sleep on some machines, and b) an indication that the driver is + * probably misusing the coherent API anyway. + */ + WARN_ON(irqs_disabled()); + + if (!ops->free || !cpu_addr) + return; + + debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); + ops->free(dev, size, cpu_addr, dma_handle, attrs); +} +EXPORT_SYMBOL(dma_free_attrs); + +static inline void dma_check_mask(struct device *dev, u64 mask) +{ + if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1))) + dev_warn(dev, "SME is active, device will require DMA bounce buffers\n"); +} + +int dma_supported(struct device *dev, u64 mask) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (!ops) + return 0; + if (!ops->dma_supported) + return 1; + return ops->dma_supported(dev, mask); +} +EXPORT_SYMBOL(dma_supported); + +#ifndef HAVE_ARCH_DMA_SET_MASK +int dma_set_mask(struct device *dev, u64 mask) +{ + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + + dma_check_mask(dev, mask); + *dev->dma_mask = mask; + return 0; +} +EXPORT_SYMBOL(dma_set_mask); +#endif + +#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK +int dma_set_coherent_mask(struct device *dev, u64 mask) +{ + if (!dma_supported(dev, mask)) + return -EIO; + + dma_check_mask(dev, mask); + dev->coherent_dma_mask = mask; + return 0; +} +EXPORT_SYMBOL(dma_set_coherent_mask); +#endif -- cgit From 8ddbe5943c0b1259b5ddb6dc1729863433fc256c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Dec 2018 12:47:50 -0800 Subject: dma-mapping: move dma_cache_sync out of line This isn't exactly a slow path routine, but it is not super critical either, and moving it out of line will help to keep the include chain clean for the following DMA indirection bypass work. Signed-off-by: Christoph Hellwig Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Tony Luck --- include/linux/dma-mapping.h | 12 ++---------- kernel/dma/mapping.c | 11 +++++++++++ 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 0bbce52606c2..0f0078490df4 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -411,16 +411,8 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, #define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0) #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0) -static inline void -dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->cache_sync) - ops->cache_sync(dev, vaddr, size, dir); -} +void dma_cache_sync(struct device *dev, void *vaddr, size_t size, + enum dma_data_direction dir); extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 176ae3e08916..0b18cfbdde95 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -430,3 +430,14 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_coherent_mask); #endif + +void dma_cache_sync(struct device *dev, void *vaddr, size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->cache_sync) + ops->cache_sync(dev, vaddr, size, dir); +} +EXPORT_SYMBOL(dma_cache_sync); -- cgit From 3731c3d4774e38b9d91c01943e1e6a243c1776be Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Dec 2018 12:50:26 -0800 Subject: dma-mapping: always build the direct mapping code All architectures except for sparc64 use the dma-direct code in some form, and even for sparc64 we had the discussion of a direct mapping mode a while ago. In preparation for directly calling the direct mapping code don't bother having it optionally but always build the code in. This is a minor hardship for some powerpc and arm configs that don't pull it in yet (although they should in a relase ot two), and sparc64 which currently doesn't need it at all, but it will reduce the ifdef mess we'd otherwise need significantly. Signed-off-by: Christoph Hellwig Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Tony Luck --- arch/alpha/Kconfig | 1 - arch/arc/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/c6x/Kconfig | 1 - arch/csky/Kconfig | 1 - arch/h8300/Kconfig | 1 - arch/hexagon/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/nds32/Kconfig | 1 - arch/nios2/Kconfig | 1 - arch/openrisc/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/unicore32/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/xtensa/Kconfig | 1 - kernel/dma/Kconfig | 7 ------- kernel/dma/Makefile | 3 +-- 24 files changed, 1 insertion(+), 31 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index a7e748a46c18..5da6ff54b3e7 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -203,7 +203,6 @@ config ALPHA_EIGER config ALPHA_JENSEN bool "Jensen" depends on BROKEN - select DMA_DIRECT_OPS help DEC PC 150 AXP (aka Jensen): This is a very old Digital system - one of the first-generation Alpha systems. A number of these systems diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index fd48d698da29..7deaabeb531a 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -17,7 +17,6 @@ config ARC select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS select COMMON_CLK - select DMA_DIRECT_OPS select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC) select GENERIC_CLOCKEVENTS select GENERIC_FIND_FIRST_BIT diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a858ee791ef0..586fc30b23bd 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -30,7 +30,6 @@ config ARM select CLONE_BACKWARDS select CPU_PM if (SUSPEND || CPU_IDLE) select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS - select DMA_DIRECT_OPS if !MMU select DMA_REMAP if MMU select EDAC_SUPPORT select EDAC_ATOMIC_SCRUB diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 06cf0ef24367..2092080240b0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -80,7 +80,6 @@ config ARM64 select CPU_PM if (SUSPEND || CPU_IDLE) select CRC32 select DCACHE_WORD_ACCESS - select DMA_DIRECT_OPS select DMA_DIRECT_REMAP select EDAC_SUPPORT select FRAME_POINTER diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 84420109113d..456e154674d1 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -9,7 +9,6 @@ config C6X select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE select CLKDEV_LOOKUP - select DMA_DIRECT_OPS select GENERIC_ATOMIC64 select GENERIC_IRQ_SHOW select HAVE_ARCH_TRACEHOOK diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index ea74f3a9eeaf..37bed8aadf95 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -7,7 +7,6 @@ config CSKY select COMMON_CLK select CLKSRC_MMIO select CLKSRC_OF - select DMA_DIRECT_OPS select DMA_DIRECT_REMAP select IRQ_DOMAIN select HANDLE_DOMAIN_IRQ diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index d19c6b16cd5d..6472a0685470 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -22,7 +22,6 @@ config H8300 select HAVE_ARCH_KGDB select HAVE_ARCH_HASH select CPU_NO_EFFICIENT_FFS - select DMA_DIRECT_OPS config CPU_BIG_ENDIAN def_bool y diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 2b688af379e6..d71036c598de 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -31,7 +31,6 @@ config HEXAGON select GENERIC_CLOCKEVENTS_BROADCAST select MODULES_USE_ELF_RELA select GENERIC_CPU_DEVICES - select DMA_DIRECT_OPS ---help--- Qualcomm Hexagon is a processor architecture designed for high performance and low power across a wide variety of applications. diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 1bc9f1ba759a..8a5868e9a3a0 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -26,7 +26,6 @@ config M68K select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 select OLD_SIGACTION - select DMA_DIRECT_OPS if HAS_DMA select ARCH_DISCARD_MEMBLOCK config CPU_BIG_ENDIAN diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index effed2efd306..eda9e2315ef5 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -12,7 +12,6 @@ config MICROBLAZE select TIMER_OF select CLONE_BACKWARDS3 select COMMON_CLK - select DMA_DIRECT_OPS select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS select GENERIC_CPU_DEVICES diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 8272ea4c7264..2993aa9842c0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -16,7 +16,6 @@ config MIPS select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS select CPU_PM if CPU_IDLE - select DMA_DIRECT_OPS select GENERIC_ATOMIC64 if !64BIT select GENERIC_CLOCKEVENTS select GENERIC_CMOS_UPDATE diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index 7a04adacb2f0..1af6bbae7220 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -11,7 +11,6 @@ config NDS32 select CLKSRC_MMIO select CLONE_BACKWARDS select COMMON_CLK - select DMA_DIRECT_OPS select GENERIC_ATOMIC64 select GENERIC_CPU_DEVICES select GENERIC_CLOCKEVENTS diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 7e95506e957a..f6c4b0f49997 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -4,7 +4,6 @@ config NIOS2 select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_NO_SWAP - select DMA_DIRECT_OPS select TIMER_OF select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 285f7d05c8ed..d0feebad5a8f 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -7,7 +7,6 @@ config OPENRISC def_bool y select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select DMA_DIRECT_OPS select OF select OF_EARLY_FLATTREE select IRQ_DOMAIN diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 428ee50fc3db..6e1b71da0e71 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -185,7 +185,6 @@ config PA11 depends on PA7000 || PA7100LC || PA7200 || PA7300LC select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select DMA_DIRECT_OPS select DMA_NONCOHERENT_CACHE_SYNC config PREFETCH diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 55da93f4e818..51d89c4b1dca 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -19,7 +19,6 @@ config RISCV select ARCH_WANT_FRAME_POINTERS select CLONE_BACKWARDS select COMMON_CLK - select DMA_DIRECT_OPS select GENERIC_CLOCKEVENTS select GENERIC_CPU_DEVICES select GENERIC_IRQ_SHOW diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 5624e8607054..21d271d04ca6 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -139,7 +139,6 @@ config S390 select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS - select DMA_DIRECT_OPS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index f82a4da7adf3..10fd4e9c454b 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -7,7 +7,6 @@ config SUPERH select ARCH_NO_COHERENT_DMA_MMAP if !MMU select HAVE_PATA_PLATFORM select CLKDEV_LOOKUP - select DMA_DIRECT_OPS select HAVE_IDE if HAS_IOPORT_MAP select HAVE_MEMBLOCK_NODE_MAP select ARCH_DISCARD_MEMBLOCK diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 8853b6ceae17..f5bb9ded1d18 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -48,7 +48,6 @@ config SPARC config SPARC32 def_bool !64BIT select ARCH_HAS_SYNC_DMA_FOR_CPU - select DMA_DIRECT_OPS select GENERIC_ATOMIC64 select CLZ_TAB select HAVE_UID16 diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index a4c05159dca5..2681027d7bff 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -4,7 +4,6 @@ config UNICORE32 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO - select DMA_DIRECT_OPS select HAVE_GENERIC_DMA_COHERENT select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index adc845b66f01..c14d4a35be13 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -89,7 +89,6 @@ config X86 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG select DCACHE_WORD_ACCESS - select DMA_DIRECT_OPS select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT select GENERIC_CLOCKEVENTS diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 75488b606edc..36338e7564a3 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -9,7 +9,6 @@ config XTENSA select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS select COMMON_CLK - select DMA_DIRECT_OPS select DMA_REMAP if MMU select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 41c3b1df70eb..ca88b867e7fe 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -35,13 +35,8 @@ config ARCH_HAS_DMA_COHERENT_TO_PFN config ARCH_HAS_DMA_MMAP_PGPROT bool -config DMA_DIRECT_OPS - bool - depends on HAS_DMA - config DMA_NONCOHERENT_CACHE_SYNC bool - depends on DMA_DIRECT_OPS config DMA_VIRT_OPS bool @@ -49,7 +44,6 @@ config DMA_VIRT_OPS config SWIOTLB bool - select DMA_DIRECT_OPS select NEED_DMA_MAP_STATE config DMA_REMAP @@ -58,5 +52,4 @@ config DMA_REMAP config DMA_DIRECT_REMAP bool - depends on DMA_DIRECT_OPS select DMA_REMAP diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index f4feeceb8020..a626f643cd63 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -1,9 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_HAS_DMA) += mapping.o +obj-$(CONFIG_HAS_DMA) += mapping.o direct.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o -obj-$(CONFIG_DMA_DIRECT_OPS) += direct.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o -- cgit From 90ac706e98fcb24fb0b0a259558987f33cc2f0f6 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 6 Dec 2018 13:14:44 -0800 Subject: dma-mapping: factor out dummy DMA ops The dummy DMA ops are currently used by arm64 for any device which has an invalid ACPI description and is thus barred from using DMA due to not knowing whether is is cache-coherent or not. Factor these out into general dma-mapping code so that they can be referenced from other common code paths. In the process, we can prune all the optional callbacks which just do the same thing as the default behaviour, and fill in .map_resource for completeness. Signed-off-by: Robin Murphy [hch: moved to a separate source file] Reviewed-by: Rafael J. Wysocki Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Tony Luck Signed-off-by: Christoph Hellwig --- arch/arm64/include/asm/dma-mapping.h | 4 +- arch/arm64/mm/dma-mapping.c | 86 ------------------------------------ include/linux/dma-mapping.h | 1 + kernel/dma/Makefile | 2 +- kernel/dma/dummy.c | 39 ++++++++++++++++ 5 files changed, 42 insertions(+), 90 deletions(-) create mode 100644 kernel/dma/dummy.c (limited to 'kernel') diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index c41f3fb1446c..273e778f7de2 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -24,15 +24,13 @@ #include #include -extern const struct dma_map_ops dummy_dma_ops; - static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { /* * We expect no ISA devices, and all other DMA masters are expected to * have someone call arch_setup_dma_ops at device creation time. */ - return &dummy_dma_ops; + return &dma_dummy_ops; } void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 4c0f498069e8..6ff6ec8806c1 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -89,92 +89,6 @@ static int __swiotlb_mmap_pfn(struct vm_area_struct *vma, } #endif /* CONFIG_IOMMU_DMA */ -/******************************************** - * The following APIs are for dummy DMA ops * - ********************************************/ - -static void *__dummy_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, - unsigned long attrs) -{ - return NULL; -} - -static void __dummy_free(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ -} - -static int __dummy_mmap(struct device *dev, - struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ - return -ENXIO; -} - -static dma_addr_t __dummy_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - return DMA_MAPPING_ERROR; -} - -static void __dummy_unmap_page(struct device *dev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ -} - -static int __dummy_map_sg(struct device *dev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - unsigned long attrs) -{ - return 0; -} - -static void __dummy_unmap_sg(struct device *dev, - struct scatterlist *sgl, int nelems, - enum dma_data_direction dir, - unsigned long attrs) -{ -} - -static void __dummy_sync_single(struct device *dev, - dma_addr_t dev_addr, size_t size, - enum dma_data_direction dir) -{ -} - -static void __dummy_sync_sg(struct device *dev, - struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ -} - -static int __dummy_dma_supported(struct device *hwdev, u64 mask) -{ - return 0; -} - -const struct dma_map_ops dummy_dma_ops = { - .alloc = __dummy_alloc, - .free = __dummy_free, - .mmap = __dummy_mmap, - .map_page = __dummy_map_page, - .unmap_page = __dummy_unmap_page, - .map_sg = __dummy_map_sg, - .unmap_sg = __dummy_unmap_sg, - .sync_single_for_cpu = __dummy_sync_single, - .sync_single_for_device = __dummy_sync_single, - .sync_sg_for_cpu = __dummy_sync_sg, - .sync_sg_for_device = __dummy_sync_sg, - .dma_supported = __dummy_dma_supported, -}; -EXPORT_SYMBOL(dummy_dma_ops); - static int __init arm64_dma_init(void) { WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(), diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 0f0078490df4..269ee27fc3d9 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -136,6 +136,7 @@ struct dma_map_ops { extern const struct dma_map_ops dma_direct_ops; extern const struct dma_map_ops dma_virt_ops; +extern const struct dma_map_ops dma_dummy_ops; #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index a626f643cd63..72ff6e46aa86 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_HAS_DMA) += mapping.o direct.o +obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c new file mode 100644 index 000000000000..05607642c888 --- /dev/null +++ b/kernel/dma/dummy.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Dummy DMA ops that always fail. + */ +#include + +static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + return -ENXIO; +} + +static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + return DMA_MAPPING_ERROR; +} + +static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + unsigned long attrs) +{ + return 0; +} + +static int dma_dummy_supported(struct device *hwdev, u64 mask) +{ + return 0; +} + +const struct dma_map_ops dma_dummy_ops = { + .mmap = dma_dummy_mmap, + .map_page = dma_dummy_map_page, + .map_sg = dma_dummy_map_sg, + .dma_supported = dma_dummy_supported, +}; +EXPORT_SYMBOL(dma_dummy_ops); -- cgit From b907e20508d02462a50c2841da0a5e3883fdab39 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Dec 2018 11:42:52 +0100 Subject: swiotlb: remove SWIOTLB_MAP_ERROR We can use DMA_MAPPING_ERROR instead, which already maps to the same value. Signed-off-by: Christoph Hellwig Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Tony Luck --- drivers/xen/swiotlb-xen.c | 4 ++-- include/linux/swiotlb.h | 3 --- kernel/dma/swiotlb.c | 4 ++-- 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 6dc969d5ea2f..833e80b46eb2 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -403,7 +403,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) + if (map == DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; dev_addr = xen_phys_to_bus(map); @@ -572,7 +572,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, sg_phys(sg), sg->length, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) { + if (map == DMA_MAPPING_ERROR) { dev_warn(hwdev, "swiotlb buffer is full\n"); /* Don't panic here, we expect map_sg users to do proper error handling. */ diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index a387b59640a4..14aec0b70dd9 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -46,9 +46,6 @@ enum dma_sync_target { SYNC_FOR_DEVICE = 1, }; -/* define the last possible byte of physical address space as a mapping error */ -#define SWIOTLB_MAP_ERROR (~(phys_addr_t)0x0) - extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, phys_addr_t phys, size_t size, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index ff1ce81bb623..19ba8e473d71 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -526,7 +526,7 @@ not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); - return SWIOTLB_MAP_ERROR; + return DMA_MAPPING_ERROR; found: spin_unlock_irqrestore(&io_tlb_lock, flags); @@ -637,7 +637,7 @@ static dma_addr_t swiotlb_bounce_page(struct device *dev, phys_addr_t *phys, /* Oh well, have to allocate and map a bounce buffer. */ *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), *phys, size, dir, attrs); - if (*phys == SWIOTLB_MAP_ERROR) + if (*phys == DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ -- cgit From 68c608345cc569bcfa1c1b2add4c00c343ecf933 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Dec 2018 07:06:04 -0800 Subject: swiotlb: remove dma_mark_clean Instead of providing a special dma_mark_clean hook just for ia64, switch ia64 to use the normal arch_sync_dma_for_cpu hooks instead. This means that we now also set the PG_arch_1 bit for pages in the swiotlb buffer, which isn't stricly needed as we will never execute code out of the swiotlb buffer, but otherwise harmless. Signed-off-by: Christoph Hellwig Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Tony Luck --- arch/ia64/Kconfig | 3 ++- arch/ia64/kernel/dma-mapping.c | 20 +++++++++++++++++++- arch/ia64/mm/init.c | 19 ++++++++----------- drivers/xen/swiotlb-xen.c | 20 +------------------- include/linux/dma-direct.h | 8 -------- kernel/dma/swiotlb.c | 18 +----------------- 6 files changed, 31 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index d6f203658994..c587e3316c38 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -28,7 +28,8 @@ config IA64 select HAVE_ARCH_TRACEHOOK select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING - select ARCH_HAS_DMA_MARK_CLEAN + select ARCH_HAS_DMA_COHERENT_TO_PFN + select ARCH_HAS_SYNC_DMA_FOR_CPU select VIRT_TO_BUS select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c index 7a471d8d67d4..36dd6aa6d759 100644 --- a/arch/ia64/kernel/dma-mapping.c +++ b/arch/ia64/kernel/dma-mapping.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include @@ -16,6 +16,24 @@ const struct dma_map_ops *dma_get_ops(struct device *dev) EXPORT_SYMBOL(dma_get_ops); #ifdef CONFIG_SWIOTLB +void *arch_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +{ + return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); +} + +void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_addr, unsigned long attrs) +{ + dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); +} + +long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, + dma_addr_t dma_addr) +{ + return page_to_pfn(virt_to_page(cpu_addr)); +} + void __init swiotlb_dma_init(void) { dma_ops = &swiotlb_dma_ops; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d5e12ff1d73c..0cf43bb13d6e 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -71,18 +72,14 @@ __ia64_sync_icache_dcache (pte_t pte) * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to * flush them when they get mapped into an executable vm-area. */ -void -dma_mark_clean(void *addr, size_t size) +void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir) { - unsigned long pg_addr, end; - - pg_addr = PAGE_ALIGN((unsigned long) addr); - end = (unsigned long) addr + size; - while (pg_addr + PAGE_SIZE <= end) { - struct page *page = virt_to_page(pg_addr); - set_bit(PG_arch_1, &page->flags); - pg_addr += PAGE_SIZE; - } + unsigned long pfn = PHYS_PFN(paddr); + + do { + set_bit(PG_arch_1, &pfn_to_page(pfn)->flags); + } while (++pfn <= PHYS_PFN(paddr + size - 1)); } inline void diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 833e80b46eb2..989cf872b98c 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -441,21 +441,8 @@ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, xen_dma_unmap_page(hwdev, dev_addr, size, dir, attrs); /* NOTE: We use dev_addr here, not paddr! */ - if (is_xen_swiotlb_buffer(dev_addr)) { + if (is_xen_swiotlb_buffer(dev_addr)) swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); - return; - } - - if (dir != DMA_FROM_DEVICE) - return; - - /* - * phys_to_virt doesn't work with hihgmem page but we could - * call dma_mark_clean() with hihgmem page here. However, we - * are fine since dma_mark_clean() is null on POWERPC. We can - * make dma_mark_clean() take a physical address if necessary. - */ - dma_mark_clean(phys_to_virt(paddr), size); } static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, @@ -493,11 +480,6 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, if (target == SYNC_FOR_DEVICE) xen_dma_sync_single_for_device(hwdev, dev_addr, size, dir); - - if (dir != DMA_FROM_DEVICE) - return; - - dma_mark_clean(phys_to_virt(paddr), size); } void diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 6e5a47ae7d64..1aa73f4907ae 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -48,14 +48,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) return __sme_clr(__dma_to_phys(dev, daddr)); } -#ifdef CONFIG_ARCH_HAS_DMA_MARK_CLEAN -void dma_mark_clean(void *addr, size_t size); -#else -static inline void dma_mark_clean(void *addr, size_t size) -{ -} -#endif /* CONFIG_ARCH_HAS_DMA_MARK_CLEAN */ - u64 dma_direct_get_required_mask(struct device *dev); void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 19ba8e473d71..2e126bac5d7d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -706,21 +706,8 @@ void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) arch_sync_dma_for_cpu(hwdev, paddr, size, dir); - if (is_swiotlb_buffer(paddr)) { + if (is_swiotlb_buffer(paddr)) swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); - return; - } - - if (dir != DMA_FROM_DEVICE) - return; - - /* - * phys_to_virt doesn't work with hihgmem page but we could - * call dma_mark_clean() with hihgmem page here. However, we - * are fine since dma_mark_clean() is null on POWERPC. We can - * make dma_mark_clean() take a physical address if necessary. - */ - dma_mark_clean(phys_to_virt(paddr), size); } /* @@ -750,9 +737,6 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_DEVICE) arch_sync_dma_for_device(hwdev, paddr, size, dir); - - if (!is_swiotlb_buffer(paddr) && dir == DMA_FROM_DEVICE) - dma_mark_clean(phys_to_virt(paddr), size); } void -- cgit From 58dfd4ac022037c6a562e92fc6d2a778819b2162 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Dec 2018 07:43:05 +0100 Subject: dma-direct: improve addressability error reporting Only report report a DMA addressability report once to avoid spewing the kernel log with repeated message. Also provide a stack trace to make it easy to find the actual caller that caused the problem. Last but not least move the actual check into the fast path and only leave the error reporting in a helper. Signed-off-by: Christoph Hellwig Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Tony Luck --- kernel/dma/direct.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 308f88a750c8..edb24f94ea1e 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -30,27 +30,16 @@ static inline bool force_dma_unencrypted(void) return sev_active(); } -static bool -check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, - const char *caller) +static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) { - if (unlikely(dev && !dma_capable(dev, dma_addr, size))) { - if (!dev->dma_mask) { - dev_err(dev, - "%s: call on device without dma_mask\n", - caller); - return false; - } - - if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { - dev_err(dev, - "%s: overflow %pad+%zu of device mask %llx bus mask %llx\n", - caller, &dma_addr, size, - *dev->dma_mask, dev->bus_dma_mask); - } - return false; + if (!dev->dma_mask) { + dev_err_once(dev, "DMA map on device without dma_mask\n"); + } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { + dev_err_once(dev, + "overflow %pad+%zu of DMA mask %llx bus mask %llx\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask); } - return true; + WARN_ON_ONCE(1); } static inline dma_addr_t phys_to_dma_direct(struct device *dev, @@ -288,8 +277,10 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (!check_addr(dev, dma_addr, size, __func__)) + if (unlikely(dev && !dma_capable(dev, dma_addr, size))) { + report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; + } if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_device(dev, dma_addr, size, dir); @@ -306,8 +297,11 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, BUG_ON(!sg_page(sg)); sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg)); - if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__)) + if (unlikely(dev && !dma_capable(dev, sg_dma_address(sg), + sg->length))) { + report_addr(dev, sg_dma_address(sg), sg->length); return 0; + } sg_dma_len(sg) = sg->length; } -- cgit From 17ac524719f3fc88c1a90528f4789e4b4f618512 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Dec 2018 11:14:09 +0100 Subject: dma-direct: use dma_direct_map_page to implement dma_direct_map_sg No need to duplicate the mapping logic. Signed-off-by: Christoph Hellwig Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Tony Luck --- kernel/dma/direct.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index edb24f94ea1e..d45306473c90 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -217,6 +217,7 @@ static void dma_direct_sync_single_for_device(struct device *dev, arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir); } +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) static void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -229,6 +230,7 @@ static void dma_direct_sync_sg_for_device(struct device *dev, for_each_sg(sgl, sg, nents, i) arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); } +#endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) @@ -294,19 +296,13 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, struct scatterlist *sg; for_each_sg(sgl, sg, nents, i) { - BUG_ON(!sg_page(sg)); - - sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg)); - if (unlikely(dev && !dma_capable(dev, sg_dma_address(sg), - sg->length))) { - report_addr(dev, sg_dma_address(sg), sg->length); + sg->dma_address = dma_direct_map_page(dev, sg_page(sg), + sg->offset, sg->length, dir, attrs); + if (sg->dma_address == DMA_MAPPING_ERROR) return 0; - } sg_dma_len(sg) = sg->length; } - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_direct_sync_sg_for_device(dev, sgl, nents, dir); return nents; } -- cgit From 55897af63091ebc2c3f239c6a6666f748113ac50 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Dec 2018 11:43:54 +0100 Subject: dma-direct: merge swiotlb_dma_ops into the dma_direct code While the dma-direct code is (relatively) clean and simple we actually have to use the swiotlb ops for the mapping on many architectures due to devices with addressing limits. Instead of keeping two implementations around this commit allows the dma-direct implementation to call the swiotlb bounce buffering functions and thus share the guts of the mapping implementation. This also simplified the dma-mapping setup on a few architectures where we don't have to differenciate which implementation to use. Signed-off-by: Christoph Hellwig Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Tony Luck --- arch/arm64/mm/dma-mapping.c | 2 +- arch/ia64/hp/common/hwsw_iommu.c | 2 +- arch/ia64/hp/common/sba_iommu.c | 6 +- arch/ia64/kernel/dma-mapping.c | 2 +- arch/mips/include/asm/dma-mapping.h | 2 - arch/powerpc/kernel/dma-swiotlb.c | 16 +-- arch/riscv/include/asm/dma-mapping.h | 15 --- arch/x86/kernel/pci-swiotlb.c | 4 +- arch/x86/mm/mem_encrypt.c | 7 -- arch/x86/pci/sta2x11-fixup.c | 1 - include/linux/dma-direct.h | 12 ++ include/linux/swiotlb.h | 74 +++++------ kernel/dma/direct.c | 113 ++++++++++++----- kernel/dma/swiotlb.c | 232 ++--------------------------------- 14 files changed, 150 insertions(+), 338 deletions(-) delete mode 100644 arch/riscv/include/asm/dma-mapping.h (limited to 'kernel') diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 6ff6ec8806c1..ab1e417204d0 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -463,7 +463,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) { if (!dev->dma_ops) - dev->dma_ops = &swiotlb_dma_ops; + dev->dma_ops = &dma_direct_ops; dev->dma_coherent = coherent; __iommu_setup_dma_ops(dev, dma_base, size, iommu); diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c index 58969039bed2..f40ca499b246 100644 --- a/arch/ia64/hp/common/hwsw_iommu.c +++ b/arch/ia64/hp/common/hwsw_iommu.c @@ -38,7 +38,7 @@ static inline int use_swiotlb(struct device *dev) const struct dma_map_ops *hwsw_dma_get_ops(struct device *dev) { if (use_swiotlb(dev)) - return &swiotlb_dma_ops; + return &dma_direct_ops; return &sba_dma_ops; } EXPORT_SYMBOL(hwsw_dma_get_ops); diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 0d21c0b5b23d..5ee74820a0f6 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -2065,8 +2065,6 @@ static int __init acpi_sba_ioc_init_acpi(void) /* This has to run before acpi_scan_init(). */ arch_initcall(acpi_sba_ioc_init_acpi); -extern const struct dma_map_ops swiotlb_dma_ops; - static int __init sba_init(void) { @@ -2080,7 +2078,7 @@ sba_init(void) * a successful kdump kernel boot is to use the swiotlb. */ if (is_kdump_kernel()) { - dma_ops = &swiotlb_dma_ops; + dma_ops = &dma_direct_ops; if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0) panic("Unable to initialize software I/O TLB:" " Try machvec=dig boot option"); @@ -2102,7 +2100,7 @@ sba_init(void) * If we didn't find something sba_iommu can claim, we * need to setup the swiotlb and switch to the dig machvec. */ - dma_ops = &swiotlb_dma_ops; + dma_ops = &dma_direct_ops; if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0) panic("Unable to find SBA IOMMU or initialize " "software I/O TLB: Try machvec=dig boot option"); diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c index 36dd6aa6d759..80cd3e1ea95a 100644 --- a/arch/ia64/kernel/dma-mapping.c +++ b/arch/ia64/kernel/dma-mapping.c @@ -36,7 +36,7 @@ long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, void __init swiotlb_dma_init(void) { - dma_ops = &swiotlb_dma_ops; + dma_ops = &dma_direct_ops; swiotlb_init(1); } #endif diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h index b4c477eb46ce..69f914667f3e 100644 --- a/arch/mips/include/asm/dma-mapping.h +++ b/arch/mips/include/asm/dma-mapping.h @@ -10,8 +10,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { #if defined(CONFIG_MACH_JAZZ) return &jazz_dma_ops; -#elif defined(CONFIG_SWIOTLB) - return &swiotlb_dma_ops; #else return &dma_direct_ops; #endif diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 3d8df2cf8be9..430a7d0aa2cb 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -50,15 +50,15 @@ const struct dma_map_ops powerpc_swiotlb_dma_ops = { .alloc = __dma_nommu_alloc_coherent, .free = __dma_nommu_free_coherent, .mmap = dma_nommu_mmap_coherent, - .map_sg = swiotlb_map_sg_attrs, - .unmap_sg = swiotlb_unmap_sg_attrs, + .map_sg = dma_direct_map_sg, + .unmap_sg = dma_direct_unmap_sg, .dma_supported = swiotlb_dma_supported, - .map_page = swiotlb_map_page, - .unmap_page = swiotlb_unmap_page, - .sync_single_for_cpu = swiotlb_sync_single_for_cpu, - .sync_single_for_device = swiotlb_sync_single_for_device, - .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = swiotlb_sync_sg_for_device, + .map_page = dma_direct_map_page, + .unmap_page = dma_direct_unmap_page, + .sync_single_for_cpu = dma_direct_sync_single_for_cpu, + .sync_single_for_device = dma_direct_sync_single_for_device, + .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, + .sync_sg_for_device = dma_direct_sync_sg_for_device, .get_required_mask = swiotlb_powerpc_get_required, }; diff --git a/arch/riscv/include/asm/dma-mapping.h b/arch/riscv/include/asm/dma-mapping.h deleted file mode 100644 index 8facc1c8fa05..000000000000 --- a/arch/riscv/include/asm/dma-mapping.h +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef _RISCV_ASM_DMA_MAPPING_H -#define _RISCV_ASM_DMA_MAPPING_H 1 - -#ifdef CONFIG_SWIOTLB -#include -static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) -{ - return &swiotlb_dma_ops; -} -#else -#include -#endif /* CONFIG_SWIOTLB */ - -#endif /* _RISCV_ASM_DMA_MAPPING_H */ diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index bd08b9e1c9e2..5f5302028a9a 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -62,10 +62,8 @@ IOMMU_INIT(pci_swiotlb_detect_4gb, void __init pci_swiotlb_init(void) { - if (swiotlb) { + if (swiotlb) swiotlb_init(0); - dma_ops = &swiotlb_dma_ops; - } } void __init pci_swiotlb_late_init(void) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 006f373f54ab..385afa2b9e17 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -380,13 +380,6 @@ void __init mem_encrypt_init(void) /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ swiotlb_update_mem_attributes(); - /* - * With SEV, DMA operations cannot use encryption, we need to use - * SWIOTLB to bounce buffer DMA operation. - */ - if (sev_active()) - dma_ops = &swiotlb_dma_ops; - /* * With SEV, we need to unroll the rep string I/O instructions. */ diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 7a5bafb76d77..3cdafea55ab6 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -168,7 +168,6 @@ static void sta2x11_setup_pdev(struct pci_dev *pdev) return; pci_set_consistent_dma_mask(pdev, STA2X11_AMBA_SIZE - 1); pci_set_dma_mask(pdev, STA2X11_AMBA_SIZE - 1); - pdev->dev.dma_ops = &swiotlb_dma_ops; pdev->dev.archdata.is_sta2x11 = true; /* We must enable all devices as master, for audio DMA to work */ diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 1aa73f4907ae..3b0a3ea3876d 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -63,7 +63,19 @@ void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs); int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir); +void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); +void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir); +void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); int dma_direct_supported(struct device *dev, u64 mask); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 14aec0b70dd9..7c007ed7505f 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -16,8 +16,6 @@ enum swiotlb_force { SWIOTLB_NO_FORCE, /* swiotlb=noforce */ }; -extern enum swiotlb_force swiotlb_force; - /* * Maximum allowable number of contiguous slabs to map, * must be a power of 2. What is the appropriate value ? @@ -62,56 +60,44 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev, size_t size, enum dma_data_direction dir, enum dma_sync_target target); -/* Accessory functions. */ - -extern dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs); -extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs); - -extern int -swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir, - unsigned long attrs); - -extern void -swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - unsigned long attrs); - -extern void -swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir); - -extern void -swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir); - -extern void -swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir); - -extern void -swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir); - extern int swiotlb_dma_supported(struct device *hwdev, u64 mask); #ifdef CONFIG_SWIOTLB -extern void __init swiotlb_exit(void); +extern enum swiotlb_force swiotlb_force; +extern phys_addr_t io_tlb_start, io_tlb_end; + +static inline bool is_swiotlb_buffer(phys_addr_t paddr) +{ + return paddr >= io_tlb_start && paddr < io_tlb_end; +} + +bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, + size_t size, enum dma_data_direction dir, unsigned long attrs); +void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); #else -static inline void swiotlb_exit(void) { } -static inline unsigned int swiotlb_max_segment(void) { return 0; } -#endif +#define swiotlb_force SWIOTLB_NO_FORCE +static inline bool is_swiotlb_buffer(phys_addr_t paddr) +{ + return false; +} +static inline bool swiotlb_map(struct device *dev, phys_addr_t *phys, + dma_addr_t *dma_addr, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + return false; +} +static inline void swiotlb_exit(void) +{ +} +static inline unsigned int swiotlb_max_segment(void) +{ + return 0; +} +#endif /* CONFIG_SWIOTLB */ extern void swiotlb_print_info(void); extern void swiotlb_set_max_segment(unsigned int); -extern const struct dma_map_ops swiotlb_dma_ops; - #endif /* __LINUX_SWIOTLB_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index d45306473c90..85d8286a0ba2 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -13,6 +13,7 @@ #include #include #include +#include /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but @@ -209,69 +210,110 @@ void dma_direct_free(struct device *dev, size_t size, dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); } -static void dma_direct_sync_single_for_device(struct device *dev, +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { - if (dev_is_dma_coherent(dev)) - return; - arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir); + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); + + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_device(dev, paddr, size, dir); } -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) -static void dma_direct_sync_sg_for_device(struct device *dev, +void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { struct scatterlist *sg; int i; - if (dev_is_dma_coherent(dev)) - return; + for_each_sg(sgl, sg, nents, i) { + if (unlikely(is_swiotlb_buffer(sg_phys(sg)))) + swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, + dir, SYNC_FOR_DEVICE); - for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, + dir); + } } #endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) -static void dma_direct_sync_single_for_cpu(struct device *dev, + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { - if (dev_is_dma_coherent(dev)) - return; - arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); - arch_sync_dma_for_cpu_all(dev); + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (!dev_is_dma_coherent(dev)) { + arch_sync_dma_for_cpu(dev, paddr, size, dir); + arch_sync_dma_for_cpu_all(dev); + } + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); } -static void dma_direct_sync_sg_for_cpu(struct device *dev, +void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { struct scatterlist *sg; int i; - if (dev_is_dma_coherent(dev)) - return; + for_each_sg(sgl, sg, nents, i) { + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); + + if (unlikely(is_swiotlb_buffer(sg_phys(sg)))) + swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, dir, + SYNC_FOR_CPU); + } - for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); - arch_sync_dma_for_cpu_all(dev); + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_cpu_all(dev); } -static void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, +void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { + phys_addr_t phys = dma_to_phys(dev, addr); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); + + if (unlikely(is_swiotlb_buffer(phys))) + swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } -static void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) + dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir, + attrs); +} +#else +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_direct_sync_sg_for_cpu(dev, sgl, nents, dir); } #endif +static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, + size_t size) +{ + return swiotlb_force != SWIOTLB_FORCE && + (!dev || dma_capable(dev, dma_addr, size)); +} + dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -279,13 +321,14 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (unlikely(dev && !dma_capable(dev, dma_addr, size))) { + if (unlikely(!dma_direct_possible(dev, dma_addr, size)) && + !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) { report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_direct_sync_single_for_device(dev, dma_addr, size, dir); + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(dev, phys, size, dir); return dma_addr; } @@ -299,11 +342,15 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, sg->dma_address = dma_direct_map_page(dev, sg_page(sg), sg->offset, sg->length, dir, attrs); if (sg->dma_address == DMA_MAPPING_ERROR) - return 0; + goto out_unmap; sg_dma_len(sg) = sg->length; } return nents; + +out_unmap: + dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); + return 0; } /* @@ -331,12 +378,14 @@ const struct dma_map_ops dma_direct_ops = { .free = dma_direct_free, .map_page = dma_direct_map_page, .map_sg = dma_direct_map_sg, -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ + defined(CONFIG_SWIOTLB) .sync_single_for_device = dma_direct_sync_single_for_device, .sync_sg_for_device = dma_direct_sync_sg_for_device, #endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_SWIOTLB) .sync_single_for_cpu = dma_direct_sync_single_for_cpu, .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, .unmap_page = dma_direct_unmap_page, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 2e126bac5d7d..d6361776dc5c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -65,7 +64,7 @@ enum swiotlb_force swiotlb_force; * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. */ -static phys_addr_t io_tlb_start, io_tlb_end; +phys_addr_t io_tlb_start, io_tlb_end; /* * The number of IO TLB blocks (in groups of 64) between io_tlb_start and @@ -383,11 +382,6 @@ void __init swiotlb_exit(void) max_segment = 0; } -static int is_swiotlb_buffer(phys_addr_t paddr) -{ - return paddr >= io_tlb_start && paddr < io_tlb_end; -} - /* * Bounce: copy the swiotlb buffer back to the original dma location */ @@ -623,221 +617,36 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, } } -static dma_addr_t swiotlb_bounce_page(struct device *dev, phys_addr_t *phys, +/* + * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing + * to the device copy the data into it as well. + */ +bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t dma_addr; + trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force); if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { dev_warn_ratelimited(dev, "Cannot do DMA to address %pa\n", phys); - return DMA_MAPPING_ERROR; + return false; } /* Oh well, have to allocate and map a bounce buffer. */ *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), *phys, size, dir, attrs); if (*phys == DMA_MAPPING_ERROR) - return DMA_MAPPING_ERROR; + return false; /* Ensure that the address returned is DMA'ble */ - dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, dma_addr, size))) { + *dma_addr = __phys_to_dma(dev, *phys); + if (unlikely(!dma_capable(dev, *dma_addr, size))) { swiotlb_tbl_unmap_single(dev, *phys, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return DMA_MAPPING_ERROR; - } - - return dma_addr; -} - -/* - * Map a single buffer of the indicated size for DMA in streaming mode. The - * physical address to use is returned. - * - * Once the device is given the dma address, the device owns this memory until - * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed. - */ -dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dev_addr = phys_to_dma(dev, phys); - - BUG_ON(dir == DMA_NONE); - /* - * If the address happens to be in the device's DMA window, - * we can safely return the device addr and not worry about bounce - * buffering it. - */ - if (!dma_capable(dev, dev_addr, size) || - swiotlb_force == SWIOTLB_FORCE) { - trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - dev_addr = swiotlb_bounce_page(dev, &phys, size, dir, attrs); + return false; } - if (!dev_is_dma_coherent(dev) && - (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0 && - dev_addr != DMA_MAPPING_ERROR) - arch_sync_dma_for_device(dev, phys, size, dir); - - return dev_addr; -} - -/* - * Unmap a single streaming mode DMA translation. The dma_addr and size must - * match what was provided for in a previous swiotlb_map_page call. All - * other usages are undefined. - * - * After this call, reads by the cpu to the buffer are guaranteed to see - * whatever the device wrote there. - */ -void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); - - BUG_ON(dir == DMA_NONE); - - if (!dev_is_dma_coherent(hwdev) && - (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) - arch_sync_dma_for_cpu(hwdev, paddr, size, dir); - - if (is_swiotlb_buffer(paddr)) - swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); -} - -/* - * Make physical memory consistent for a single streaming mode DMA translation - * after a transfer. - * - * If you perform a swiotlb_map_page() but wish to interrogate the buffer - * using the cpu, yet do not wish to teardown the dma mapping, you must - * call this function before doing so. At the next point you give the dma - * address back to the card, you must first perform a - * swiotlb_dma_sync_for_device, and then the device again owns the buffer - */ -static void -swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) -{ - phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); - - BUG_ON(dir == DMA_NONE); - - if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_CPU) - arch_sync_dma_for_cpu(hwdev, paddr, size, dir); - - if (is_swiotlb_buffer(paddr)) - swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); - - if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_DEVICE) - arch_sync_dma_for_device(hwdev, paddr, size, dir); -} - -void -swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) -{ - swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); -} - -void -swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) -{ - swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); -} - -/* - * Map a set of buffers described by scatterlist in streaming mode for DMA. - * This is the scatter-gather version of the above swiotlb_map_page - * interface. Here the scatter gather list elements are each tagged with the - * appropriate dma address and length. They are obtained via - * sg_dma_{address,length}(SG). - * - * Device ownership issues as mentioned above for swiotlb_map_page are the - * same here. - */ -int -swiotlb_map_sg_attrs(struct device *dev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir, unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nelems, i) { - sg->dma_address = swiotlb_map_page(dev, sg_page(sg), sg->offset, - sg->length, dir, attrs); - if (sg->dma_address == DMA_MAPPING_ERROR) - goto out_error; - sg_dma_len(sg) = sg->length; - } - - return nelems; - -out_error: - swiotlb_unmap_sg_attrs(dev, sgl, i, dir, - attrs | DMA_ATTR_SKIP_CPU_SYNC); - sg_dma_len(sgl) = 0; - return 0; -} - -/* - * Unmap a set of streaming mode DMA translations. Again, cpu read rules - * concerning calls here are the same as for swiotlb_unmap_page() above. - */ -void -swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - BUG_ON(dir == DMA_NONE); - - for_each_sg(sgl, sg, nelems, i) - swiotlb_unmap_page(hwdev, sg->dma_address, sg_dma_len(sg), dir, - attrs); -} - -/* - * Make physical memory consistent for a set of streaming mode DMA translations - * after a transfer. - * - * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules - * and usage. - */ -static void -swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - enum dma_sync_target target) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nelems, i) - swiotlb_sync_single(hwdev, sg->dma_address, - sg_dma_len(sg), dir, target); -} - -void -swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); -} - -void -swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); + return true; } /* @@ -851,18 +660,3 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask) { return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; } - -const struct dma_map_ops swiotlb_dma_ops = { - .alloc = dma_direct_alloc, - .free = dma_direct_free, - .sync_single_for_cpu = swiotlb_sync_single_for_cpu, - .sync_single_for_device = swiotlb_sync_single_for_device, - .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = swiotlb_sync_sg_for_device, - .map_sg = swiotlb_map_sg_attrs, - .unmap_sg = swiotlb_unmap_sg_attrs, - .map_page = swiotlb_map_page, - .unmap_page = swiotlb_unmap_page, - .dma_supported = dma_direct_supported, -}; -EXPORT_SYMBOL(swiotlb_dma_ops); -- cgit From 356da6d0cde3323236977fce54c1f9612a742036 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Dec 2018 13:39:32 -0800 Subject: dma-mapping: bypass indirect calls for dma-direct Avoid expensive indirect calls in the fast path DMA mapping operations by directly calling the dma_direct_* ops if we are using the directly mapped DMA operations. Signed-off-by: Christoph Hellwig Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Tony Luck --- arch/alpha/include/asm/dma-mapping.h | 2 +- arch/arc/mm/cache.c | 2 +- arch/arm/include/asm/dma-mapping.h | 2 +- arch/arm/mm/dma-mapping-nommu.c | 14 +---- arch/arm64/mm/dma-mapping.c | 3 - arch/ia64/hp/common/hwsw_iommu.c | 2 +- arch/ia64/hp/common/sba_iommu.c | 4 +- arch/ia64/kernel/dma-mapping.c | 1 - arch/mips/include/asm/dma-mapping.h | 2 +- arch/parisc/kernel/setup.c | 4 -- arch/sparc/include/asm/dma-mapping.h | 4 +- arch/x86/kernel/pci-dma.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 2 +- drivers/iommu/amd_iommu.c | 13 +--- include/asm-generic/dma-mapping.h | 2 +- include/linux/dma-direct.h | 17 ------ include/linux/dma-mapping.h | 111 ++++++++++++++++++++++++++++++----- include/linux/dma-noncoherent.h | 5 +- kernel/dma/direct.c | 37 +++--------- kernel/dma/mapping.c | 40 ++++++++----- 20 files changed, 150 insertions(+), 119 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index 8beeafd4f68e..0ee6a5c99b16 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -7,7 +7,7 @@ extern const struct dma_map_ops alpha_pci_ops; static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { #ifdef CONFIG_ALPHA_JENSEN - return &dma_direct_ops; + return NULL; #else return &alpha_pci_ops; #endif diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index f2701c13a66b..e188bb3ede53 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -1280,7 +1280,7 @@ void __init arc_cache_init_master(void) /* * In case of IOC (say IOC+SLC case), pointers above could still be set * but end up not being relevant as the first function in chain is not - * called at all for @dma_direct_ops + * called at all for devices using coherent DMA. * arch_sync_dma_for_cpu() -> dma_cache_*() -> __dma_cache_*() */ } diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index 965b7c846ecb..31d3b96f0f4b 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -18,7 +18,7 @@ extern const struct dma_map_ops arm_coherent_dma_ops; static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { - return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : &dma_direct_ops; + return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : NULL; } #ifdef __arch_page_to_dma diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c index 712416ecd8e6..f304b10e23a4 100644 --- a/arch/arm/mm/dma-mapping-nommu.c +++ b/arch/arm/mm/dma-mapping-nommu.c @@ -22,7 +22,7 @@ #include "dma.h" /* - * dma_direct_ops is used if + * The generic direct mapping code is used if * - MMU/MPU is off * - cpu is v7m w/o cache support * - device is coherent @@ -209,16 +209,9 @@ const struct dma_map_ops arm_nommu_dma_ops = { }; EXPORT_SYMBOL(arm_nommu_dma_ops); -static const struct dma_map_ops *arm_nommu_get_dma_map_ops(bool coherent) -{ - return coherent ? &dma_direct_ops : &arm_nommu_dma_ops; -} - void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) { - const struct dma_map_ops *dma_ops; - if (IS_ENABLED(CONFIG_CPU_V7M)) { /* * Cache support for v7m is optional, so can be treated as @@ -234,7 +227,6 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, dev->archdata.dma_coherent = (get_cr() & CR_M) ? coherent : true; } - dma_ops = arm_nommu_get_dma_map_ops(dev->archdata.dma_coherent); - - set_dma_ops(dev, dma_ops); + if (!dev->archdata.dma_coherent) + set_dma_ops(dev, &arm_nommu_dma_ops); } diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index ab1e417204d0..95eda81e3f2d 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -462,9 +462,6 @@ static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) { - if (!dev->dma_ops) - dev->dma_ops = &dma_direct_ops; - dev->dma_coherent = coherent; __iommu_setup_dma_ops(dev, dma_base, size, iommu); diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c index f40ca499b246..8840ed97712f 100644 --- a/arch/ia64/hp/common/hwsw_iommu.c +++ b/arch/ia64/hp/common/hwsw_iommu.c @@ -38,7 +38,7 @@ static inline int use_swiotlb(struct device *dev) const struct dma_map_ops *hwsw_dma_get_ops(struct device *dev) { if (use_swiotlb(dev)) - return &dma_direct_ops; + return NULL; return &sba_dma_ops; } EXPORT_SYMBOL(hwsw_dma_get_ops); diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 5ee74820a0f6..5a361e51cb1e 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -2078,7 +2078,7 @@ sba_init(void) * a successful kdump kernel boot is to use the swiotlb. */ if (is_kdump_kernel()) { - dma_ops = &dma_direct_ops; + dma_ops = NULL; if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0) panic("Unable to initialize software I/O TLB:" " Try machvec=dig boot option"); @@ -2100,7 +2100,7 @@ sba_init(void) * If we didn't find something sba_iommu can claim, we * need to setup the swiotlb and switch to the dig machvec. */ - dma_ops = &dma_direct_ops; + dma_ops = NULL; if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0) panic("Unable to find SBA IOMMU or initialize " "software I/O TLB: Try machvec=dig boot option"); diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c index 80cd3e1ea95a..ad7d9963de34 100644 --- a/arch/ia64/kernel/dma-mapping.c +++ b/arch/ia64/kernel/dma-mapping.c @@ -36,7 +36,6 @@ long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, void __init swiotlb_dma_init(void) { - dma_ops = &dma_direct_ops; swiotlb_init(1); } #endif diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h index 69f914667f3e..20dfaad3a55d 100644 --- a/arch/mips/include/asm/dma-mapping.h +++ b/arch/mips/include/asm/dma-mapping.h @@ -11,7 +11,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) #if defined(CONFIG_MACH_JAZZ) return &jazz_dma_ops; #else - return &dma_direct_ops; + return NULL; #endif } diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index cd227f1cf629..54818cd78bd0 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -99,10 +99,6 @@ void __init dma_ops_init(void) case pcxl2: pa7300lc_init(); - case pcxl: /* falls through */ - case pcxs: - case pcxt: - hppa_dma_ops = &dma_direct_ops; break; default: break; diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h index 55a44f08a9a4..ed32845bd2d2 100644 --- a/arch/sparc/include/asm/dma-mapping.h +++ b/arch/sparc/include/asm/dma-mapping.h @@ -12,11 +12,11 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { #ifdef CONFIG_SPARC_LEON if (sparc_cpu_model == sparc_leon) - return &dma_direct_ops; + return NULL; #endif #if defined(CONFIG_SPARC32) && defined(CONFIG_PCI) if (bus == &pci_bus_type) - return &dma_direct_ops; + return NULL; #endif return dma_ops; } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f4562fcec681..d460998ae828 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -17,7 +17,7 @@ static bool disable_dac_quirk __read_mostly; -const struct dma_map_ops *dma_ops = &dma_direct_ops; +const struct dma_map_ops *dma_ops; EXPORT_SYMBOL(dma_ops); #ifdef CONFIG_IOMMU_DEBUG diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 61a84b958d67..50637f372e9f 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -581,7 +581,7 @@ static int vmw_dma_select_mode(struct vmw_private *dev_priv) dev_priv->map_mode = vmw_dma_map_populate; - if (dma_ops->sync_single_for_cpu) + if (dma_ops && dma_ops->sync_single_for_cpu) dev_priv->map_mode = vmw_dma_alloc_coherent; #ifdef CONFIG_SWIOTLB if (swiotlb_nr_tbl() == 0) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index c5d6c7c42b0a..567221cca13c 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2184,7 +2184,7 @@ static int amd_iommu_add_device(struct device *dev) dev_name(dev)); iommu_ignore_device(dev); - dev->dma_ops = &dma_direct_ops; + dev->dma_ops = NULL; goto out; } init_iommu_group(dev); @@ -2770,17 +2770,6 @@ int __init amd_iommu_init_dma_ops(void) swiotlb = (iommu_pass_through || sme_me_mask) ? 1 : 0; iommu_detected = 1; - /* - * In case we don't initialize SWIOTLB (actually the common case - * when AMD IOMMU is enabled and SME is not active), make sure there - * are global dma_ops set as a fall-back for devices not handled by - * this driver (for example non-PCI devices). When SME is active, - * make sure that swiotlb variable remains set so the global dma_ops - * continue to be SWIOTLB. - */ - if (!swiotlb) - dma_ops = &dma_direct_ops; - if (amd_iommu_unmap_flush) pr_info("AMD-Vi: IO/TLB flush on unmap enabled\n"); else diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h index 880a292d792f..c13f46109e88 100644 --- a/include/asm-generic/dma-mapping.h +++ b/include/asm-generic/dma-mapping.h @@ -4,7 +4,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { - return &dma_direct_ops; + return NULL; } #endif /* _ASM_GENERIC_DMA_MAPPING_H */ diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 3b0a3ea3876d..b7338702592a 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -60,22 +60,5 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page); -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs); -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs); -int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, - enum dma_data_direction dir, unsigned long attrs); -void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs); -void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir); -void dma_direct_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir); -void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir); -void dma_direct_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir); int dma_direct_supported(struct device *dev, u64 mask); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 269ee27fc3d9..f422aec0f53c 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -134,7 +134,6 @@ struct dma_map_ops { #define DMA_MAPPING_ERROR (~(dma_addr_t)0) -extern const struct dma_map_ops dma_direct_ops; extern const struct dma_map_ops dma_virt_ops; extern const struct dma_map_ops dma_dummy_ops; @@ -222,6 +221,69 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev) } #endif +static inline bool dma_is_direct(const struct dma_map_ops *ops) +{ + return likely(!ops); +} + +/* + * All the dma_direct_* declarations are here just for the indirect call bypass, + * and must not be used directly drivers! + */ +dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + unsigned long attrs); +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, unsigned long attrs); + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir); +void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ +} +static inline void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir); +void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ +} +static inline void dma_direct_unmap_sg(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir, + unsigned long attrs) +{ +} +static inline void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ +} +static inline void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, size_t size, enum dma_data_direction dir, @@ -232,9 +294,12 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, BUG_ON(!valid_dma_direction(dir)); debug_dma_map_single(dev, ptr, size); - addr = ops->map_page(dev, virt_to_page(ptr), - offset_in_page(ptr), size, - dir, attrs); + if (dma_is_direct(ops)) + addr = dma_direct_map_page(dev, virt_to_page(ptr), + offset_in_page(ptr), size, dir, attrs); + else + addr = ops->map_page(dev, virt_to_page(ptr), + offset_in_page(ptr), size, dir, attrs); debug_dma_map_page(dev, virt_to_page(ptr), offset_in_page(ptr), size, dir, addr, true); @@ -249,7 +314,9 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (ops->unmap_page) + if (dma_is_direct(ops)) + dma_direct_unmap_page(dev, addr, size, dir, attrs); + else if (ops->unmap_page) ops->unmap_page(dev, addr, size, dir, attrs); debug_dma_unmap_page(dev, addr, size, dir, true); } @@ -272,7 +339,10 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int ents; BUG_ON(!valid_dma_direction(dir)); - ents = ops->map_sg(dev, sg, nents, dir, attrs); + if (dma_is_direct(ops)) + ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); + else + ents = ops->map_sg(dev, sg, nents, dir, attrs); BUG_ON(ents < 0); debug_dma_map_sg(dev, sg, nents, ents, dir); @@ -287,7 +357,9 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg BUG_ON(!valid_dma_direction(dir)); debug_dma_unmap_sg(dev, sg, nents, dir); - if (ops->unmap_sg) + if (dma_is_direct(ops)) + dma_direct_unmap_sg(dev, sg, nents, dir, attrs); + else if (ops->unmap_sg) ops->unmap_sg(dev, sg, nents, dir, attrs); } @@ -301,7 +373,10 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev, dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); - addr = ops->map_page(dev, page, offset, size, dir, attrs); + if (dma_is_direct(ops)) + addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + else + addr = ops->map_page(dev, page, offset, size, dir, attrs); debug_dma_map_page(dev, page, offset, size, dir, addr, false); return addr; @@ -322,7 +397,7 @@ static inline dma_addr_t dma_map_resource(struct device *dev, BUG_ON(pfn_valid(PHYS_PFN(phys_addr))); addr = phys_addr; - if (ops->map_resource) + if (ops && ops->map_resource) addr = ops->map_resource(dev, phys_addr, size, dir, attrs); debug_dma_map_resource(dev, phys_addr, size, dir, addr); @@ -337,7 +412,7 @@ static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (ops->unmap_resource) + if (ops && ops->unmap_resource) ops->unmap_resource(dev, addr, size, dir, attrs); debug_dma_unmap_resource(dev, addr, size, dir); } @@ -349,7 +424,9 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_cpu) + if (dma_is_direct(ops)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + else if (ops->sync_single_for_cpu) ops->sync_single_for_cpu(dev, addr, size, dir); debug_dma_sync_single_for_cpu(dev, addr, size, dir); } @@ -368,7 +445,9 @@ static inline void dma_sync_single_for_device(struct device *dev, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_device) + if (dma_is_direct(ops)) + dma_direct_sync_single_for_device(dev, addr, size, dir); + else if (ops->sync_single_for_device) ops->sync_single_for_device(dev, addr, size, dir); debug_dma_sync_single_for_device(dev, addr, size, dir); } @@ -387,7 +466,9 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_sg_for_cpu) + if (dma_is_direct(ops)) + dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); + else if (ops->sync_sg_for_cpu) ops->sync_sg_for_cpu(dev, sg, nelems, dir); debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); } @@ -399,7 +480,9 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_sg_for_device) + if (dma_is_direct(ops)) + dma_direct_sync_sg_for_device(dev, sg, nelems, dir); + else if (ops->sync_sg_for_device) ops->sync_sg_for_device(dev, sg, nelems, dir); debug_dma_sync_sg_for_device(dev, sg, nelems, dir); diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index 306557331d7d..69b36ed31a99 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -38,7 +38,10 @@ pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction); #else -#define arch_dma_cache_sync NULL +static inline void arch_dma_cache_sync(struct device *dev, void *vaddr, + size_t size, enum dma_data_direction direction) +{ +} #endif /* CONFIG_DMA_NONCOHERENT_CACHE_SYNC */ #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 85d8286a0ba2..79da61b49fa4 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -223,6 +223,7 @@ void dma_direct_sync_single_for_device(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(dev, paddr, size, dir); } +EXPORT_SYMBOL(dma_direct_sync_single_for_device); void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) @@ -240,6 +241,7 @@ void dma_direct_sync_sg_for_device(struct device *dev, dir); } } +EXPORT_SYMBOL(dma_direct_sync_sg_for_device); #endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ @@ -258,6 +260,7 @@ void dma_direct_sync_single_for_cpu(struct device *dev, if (unlikely(is_swiotlb_buffer(paddr))) swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); } +EXPORT_SYMBOL(dma_direct_sync_single_for_cpu); void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) @@ -277,6 +280,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu_all(dev); } +EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu); void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -289,6 +293,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, if (unlikely(is_swiotlb_buffer(phys))) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } +EXPORT_SYMBOL(dma_direct_unmap_page); void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) @@ -300,11 +305,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } -#else -void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ -} +EXPORT_SYMBOL(dma_direct_unmap_sg); #endif static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, @@ -331,6 +332,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, arch_sync_dma_for_device(dev, phys, size, dir); return dma_addr; } +EXPORT_SYMBOL(dma_direct_map_page); int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) @@ -352,6 +354,7 @@ out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return 0; } +EXPORT_SYMBOL(dma_direct_map_sg); /* * Because 32-bit DMA masks are so common we expect every architecture to be @@ -372,27 +375,3 @@ int dma_direct_supported(struct device *dev, u64 mask) return mask >= phys_to_dma(dev, min_mask); } - -const struct dma_map_ops dma_direct_ops = { - .alloc = dma_direct_alloc, - .free = dma_direct_free, - .map_page = dma_direct_map_page, - .map_sg = dma_direct_map_sg, -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ - defined(CONFIG_SWIOTLB) - .sync_single_for_device = dma_direct_sync_single_for_device, - .sync_sg_for_device = dma_direct_sync_sg_for_device, -#endif -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ - defined(CONFIG_SWIOTLB) - .sync_single_for_cpu = dma_direct_sync_single_for_cpu, - .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, - .unmap_page = dma_direct_unmap_page, - .unmap_sg = dma_direct_unmap_sg, -#endif - .get_required_mask = dma_direct_get_required_mask, - .dma_supported = dma_direct_supported, - .cache_sync = arch_dma_cache_sync, -}; -EXPORT_SYMBOL(dma_direct_ops); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 0b18cfbdde95..fc84c81029d9 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -7,6 +7,7 @@ */ #include /* for max_pfn */ #include +#include #include #include #include @@ -229,8 +230,8 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); - BUG_ON(!ops); - if (ops->get_sgtable) + + if (!dma_is_direct(ops) && ops->get_sgtable) return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, @@ -293,8 +294,8 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); - BUG_ON(!ops); - if (ops->mmap) + + if (!dma_is_direct(ops) && ops->mmap) return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); } @@ -324,6 +325,8 @@ u64 dma_get_required_mask(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); + if (dma_is_direct(ops)) + return dma_direct_get_required_mask(dev); if (ops->get_required_mask) return ops->get_required_mask(dev); return dma_default_get_required_mask(dev); @@ -341,7 +344,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, const struct dma_map_ops *ops = get_dma_ops(dev); void *cpu_addr; - BUG_ON(!ops); WARN_ON_ONCE(dev && !dev->coherent_dma_mask); if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) @@ -352,10 +354,14 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, if (!arch_dma_alloc_attrs(&dev)) return NULL; - if (!ops->alloc) + + if (dma_is_direct(ops)) + cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); + else if (ops->alloc) + cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); + else return NULL; - cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); return cpu_addr; } @@ -366,8 +372,6 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, { const struct dma_map_ops *ops = get_dma_ops(dev); - BUG_ON(!ops); - if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr)) return; /* @@ -379,11 +383,14 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, */ WARN_ON(irqs_disabled()); - if (!ops->free || !cpu_addr) + if (!cpu_addr) return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); - ops->free(dev, size, cpu_addr, dma_handle, attrs); + if (dma_is_direct(ops)) + dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); + else if (ops->free) + ops->free(dev, size, cpu_addr, dma_handle, attrs); } EXPORT_SYMBOL(dma_free_attrs); @@ -397,9 +404,9 @@ int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (!ops) - return 0; - if (!ops->dma_supported) + if (dma_is_direct(ops)) + return dma_direct_supported(dev, mask); + if (ops->dma_supported) return 1; return ops->dma_supported(dev, mask); } @@ -437,7 +444,10 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (ops->cache_sync) + + if (dma_is_direct(ops)) + arch_dma_cache_sync(dev, vaddr, size, dir); + else if (ops->cache_sync) ops->cache_sync(dev, vaddr, size, dir); } EXPORT_SYMBOL(dma_cache_sync); -- cgit From 9f8c1c5712954f9d8877ac55b18adbdf03e51e1f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 12 Dec 2018 10:45:38 +0100 Subject: bpf: remove obsolete prog->aux sanitation in bpf_insn_prepare_dump This logic is not needed anymore since we got rid of the verifier rewrite that was using prog->aux address in f6069b9aa993 ("bpf: fix redirect to map under tail calls"). Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7f1410d6fbe9..6ae062f1cf20 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2032,13 +2032,6 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) insns[i + 1].imm = 0; continue; } - - if (!bpf_dump_raw_ok() && - imm == (unsigned long)prog->aux) { - insns[i].imm = 0; - insns[i + 1].imm = 0; - continue; - } } return insns; -- cgit From 319deec7db6c0aab276d2447f778e7cffed24c7c Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 12 Dec 2018 19:46:54 -0700 Subject: seccomp: fix poor type promotion sparse complains, kernel/seccomp.c:1172:13: warning: incorrect type in assignment (different base types) kernel/seccomp.c:1172:13: expected restricted __poll_t [usertype] ret kernel/seccomp.c:1172:13: got int kernel/seccomp.c:1173:13: warning: restricted __poll_t degrades to integer Instead of assigning this to ret, since we don't use this anywhere, let's just test it against 0 directly. Signed-off-by: Tycho Andersen Reported-by: 0day robot Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Signed-off-by: Kees Cook --- kernel/seccomp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 15b6be97fc09..d7f538847b84 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1169,8 +1169,7 @@ static __poll_t seccomp_notify_poll(struct file *file, poll_wait(file, &filter->notif->wqh, poll_tab); - ret = mutex_lock_interruptible(&filter->notify_lock); - if (ret < 0) + if (mutex_lock_interruptible(&filter->notify_lock) < 0) return EPOLLERR; list_for_each_entry(cur, &filter->notif->notifications, list) { -- cgit From d406db524c32ca35bd85cada28a547fff3115715 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sun, 9 Dec 2018 14:25:02 +0800 Subject: audit: remove duplicated include from audit.c Remove duplicated include. Signed-off-by: YueHaibing Signed-off-by: Paul Moore --- kernel/audit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a0a4544e69ca..632d36059556 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -60,7 +60,6 @@ #include #include #include -#include #include -- cgit From 5439c985c5a83a8419f762115afdf560ab72a452 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Fri, 14 Dec 2018 17:05:54 +0100 Subject: module: Overwrite st_size instead of st_info st_info is currently overwritten after relocation and used to store the elf_type(). However, we're going to need it fix kallsyms on ARM's Thumb-2 kernels, so preserve st_info and overwrite the st_size field instead. st_size is neither used by the module core nor by any architecture. Reviewed-by: Miroslav Benes Reviewed-by: Dave Martin Signed-off-by: Vincent Whitchurch Signed-off-by: Jessica Yu --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1b5edf78694c..b36ff8a3d562 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2684,7 +2684,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) /* Set types up while we still have access to sections. */ for (i = 0; i < mod->kallsyms->num_symtab; i++) - mod->kallsyms->symtab[i].st_info + mod->kallsyms->symtab[i].st_size = elf_type(&mod->kallsyms->symtab[i], info); /* Now populate the cut down core kallsyms for after init. */ @@ -4070,7 +4070,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, kallsyms = rcu_dereference_sched(mod->kallsyms); if (symnum < kallsyms->num_symtab) { *value = kallsyms->symtab[symnum].st_value; - *type = kallsyms->symtab[symnum].st_info; + *type = kallsyms->symtab[symnum].st_size; strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); -- cgit From 93d77e7f1410c366050d6035dcba1a5167c7cf0b Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Fri, 14 Dec 2018 17:05:55 +0100 Subject: ARM: module: Fix function kallsyms on Thumb-2 Thumb-2 functions have the lowest bit set in the symbol value in the symtab. When kallsyms are generated for the vmlinux, the kallsyms are generated from the output of nm, and nm clears the lowest bit. $ arm-linux-gnueabihf-readelf -a vmlinux | grep show_interrupts 95947: 8015dc89 686 FUNC GLOBAL DEFAULT 2 show_interrupts $ arm-linux-gnueabihf-nm vmlinux | grep show_interrupts 8015dc88 T show_interrupts $ cat /proc/kallsyms | grep show_interrupts 8015dc88 T show_interrupts However, for modules, the kallsyms uses the values in the symbol table without modification, so for functions in modules, the lowest bit is set in kallsyms. $ arm-linux-gnueabihf-readelf -a drivers/net/tun.ko | grep tun_get_socket 333: 00002d4d 36 FUNC GLOBAL DEFAULT 1 tun_get_socket $ arm-linux-gnueabihf-nm drivers/net/tun.ko | grep tun_get_socket 00002d4c T tun_get_socket $ cat /proc/kallsyms | grep tun_get_socket 7f802d4d t tun_get_socket [tun] Because of this, the symbol+offset of the crashing instruction shown in oopses is incorrect when the crash is in a module. For example, given a tun_get_socket which starts like this, 00002d4c : 2d4c: 6943 ldr r3, [r0, #20] 2d4e: 4a07 ldr r2, [pc, #28] 2d50: 4293 cmp r3, r2 a crash when tun_get_socket is called with NULL results in: PC is at tun_xdp+0xa3/0xa4 [tun] pc : [<7f802d4c>] As can be seen, the "PC is at" line reports the wrong symbol name, and the symbol+offset will point to the wrong source line if it is passed to gdb. To solve this, add a way for archs to fixup the reading of these module kallsyms values, and use that to clear the lowest bit for function symbols on Thumb-2. After the fix: # cat /proc/kallsyms | grep tun_get_socket 7f802d4c t tun_get_socket [tun] PC is at tun_get_socket+0x0/0x24 [tun] pc : [<7f802d4c>] Signed-off-by: Vincent Whitchurch Signed-off-by: Jessica Yu --- arch/arm/include/asm/module.h | 11 +++++++++++ include/linux/module.h | 7 +++++++ kernel/module.c | 43 +++++++++++++++++++++++++++---------------- 3 files changed, 45 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h index 9e81b7c498d8..182163b55546 100644 --- a/arch/arm/include/asm/module.h +++ b/arch/arm/include/asm/module.h @@ -61,4 +61,15 @@ u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val); MODULE_ARCH_VERMAGIC_ARMTHUMB \ MODULE_ARCH_VERMAGIC_P2V +#ifdef CONFIG_THUMB2_KERNEL +#define HAVE_ARCH_KALLSYMS_SYMBOL_VALUE +static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym) +{ + if (ELF_ST_TYPE(sym->st_info) == STT_FUNC) + return sym->st_value & ~1; + + return sym->st_value; +} +#endif + #endif /* _ASM_ARM_MODULE_H */ diff --git a/include/linux/module.h b/include/linux/module.h index fce6b4335e36..c0b4b7840b57 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -486,6 +486,13 @@ struct module { #define MODULE_ARCH_INIT {} #endif +#ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE +static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym) +{ + return sym->st_value; +} +#endif + extern struct mutex module_mutex; /* FIXME: It'd be nice to isolate modules during init, too, so they diff --git a/kernel/module.c b/kernel/module.c index b36ff8a3d562..164bf201eae4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3928,7 +3928,7 @@ static const char *find_kallsyms_symbol(struct module *mod, unsigned long *offset) { unsigned int i, best = 0; - unsigned long nextval; + unsigned long nextval, bestval; struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); /* At worse, next value is at end of module */ @@ -3937,10 +3937,15 @@ static const char *find_kallsyms_symbol(struct module *mod, else nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; + bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); + /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ for (i = 1; i < kallsyms->num_symtab; i++) { - if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) + const Elf_Sym *sym = &kallsyms->symtab[i]; + unsigned long thisval = kallsyms_symbol_value(sym); + + if (sym->st_shndx == SHN_UNDEF) continue; /* We ignore unnamed symbols: they're uninformative @@ -3949,21 +3954,21 @@ static const char *find_kallsyms_symbol(struct module *mod, || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) continue; - if (kallsyms->symtab[i].st_value <= addr - && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value) + if (thisval <= addr && thisval > bestval) { best = i; - if (kallsyms->symtab[i].st_value > addr - && kallsyms->symtab[i].st_value < nextval) - nextval = kallsyms->symtab[i].st_value; + bestval = thisval; + } + if (thisval > addr && thisval < nextval) + nextval = thisval; } if (!best) return NULL; if (size) - *size = nextval - kallsyms->symtab[best].st_value; + *size = nextval - bestval; if (offset) - *offset = addr - kallsyms->symtab[best].st_value; + *offset = addr - bestval; return kallsyms_symbol_name(kallsyms, best); } @@ -4069,8 +4074,10 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, continue; kallsyms = rcu_dereference_sched(mod->kallsyms); if (symnum < kallsyms->num_symtab) { - *value = kallsyms->symtab[symnum].st_value; - *type = kallsyms->symtab[symnum].st_size; + const Elf_Sym *sym = &kallsyms->symtab[symnum]; + + *value = kallsyms_symbol_value(sym); + *type = sym->st_size; strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); @@ -4089,10 +4096,13 @@ static unsigned long find_kallsyms_symbol_value(struct module *mod, const char * unsigned int i; struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - for (i = 0; i < kallsyms->num_symtab; i++) + for (i = 0; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 && - kallsyms->symtab[i].st_shndx != SHN_UNDEF) - return kallsyms->symtab[i].st_value; + sym->st_shndx != SHN_UNDEF) + return kallsyms_symbol_value(sym); + } return 0; } @@ -4137,12 +4147,13 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, if (mod->state == MODULE_STATE_UNFORMED) continue; for (i = 0; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; - if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) + if (sym->st_shndx == SHN_UNDEF) continue; ret = fn(data, kallsyms_symbol_name(kallsyms, i), - mod, kallsyms->symtab[i].st_value); + mod, kallsyms_symbol_value(sym)); if (ret != 0) return ret; } -- cgit From 23127b33ec80e656921362d7dc82a0064bac20a2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 13 Dec 2018 10:41:46 -0800 Subject: bpf: Create a new btf_name_by_offset() for non type name use case The current btf_name_by_offset() is returning "(anon)" type name for the offset == 0 case and "(invalid-name-offset)" for the out-of-bound offset case. It fits well for the internal BTF verbose log purpose which is focusing on type. For example, offset == 0 => "(anon)" => anonymous type/name. Returning non-NULL for the bad offset case is needed during the BTF verification process because the BTF verifier may complain about another field first before discovering the name_off is invalid. However, it may not be ideal for the newer use case which does not necessary mean type name. For example, when logging line_info in the BPF verifier in the next patch, it is better to log an empty src line instead of logging "(anon)". The existing bpf_name_by_offset() is renamed to __bpf_name_by_offset() and static to btf.c. A new bpf_name_by_offset() is added for generic context usage. It returns "\0" for name_off == 0 (note that btf->strings[0] is "\0") and NULL for invalid offset. It allows the caller to decide what is the best output in its context. The new btf_name_by_offset() is overlapped with btf_name_offset_valid(). Hence, btf_name_offset_valid() is removed from btf.h to keep the btf.h API minimal. The existing btf_name_offset_valid() usage in btf.c could also be replaced later. Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 - kernel/bpf/btf.c | 31 ++++++++++++++++++++----------- kernel/bpf/verifier.c | 4 ++-- 3 files changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index a4cf075b89eb..58000d7e06e3 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -46,7 +46,6 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); -bool btf_name_offset_valid(const struct btf *btf, u32 offset); bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size); #ifdef CONFIG_BPF_SYSCALL diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1545ddfb6fa5..8fa0bf1c33fd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -474,7 +474,7 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return !*src; } -const char *btf_name_by_offset(const struct btf *btf, u32 offset) +static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) return "(anon)"; @@ -484,6 +484,14 @@ const char *btf_name_by_offset(const struct btf *btf, u32 offset) return "(invalid-name-offset)"; } +const char *btf_name_by_offset(const struct btf *btf, u32 offset) +{ + if (offset < btf->hdr.str_len) + return &btf->strings[offset]; + + return NULL; +} + const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { if (type_id > btf->nr_types) @@ -576,7 +584,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, btf_kind_str[kind], - btf_name_by_offset(btf, t->name_off), + __btf_name_by_offset(btf, t->name_off), log_details ? " " : ""); if (log_details) @@ -620,7 +628,7 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, btf_verifier_log_type(env, struct_type, NULL); __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", - btf_name_by_offset(btf, member->name_off), + __btf_name_by_offset(btf, member->name_off), member->type, member->offset); if (fmt && *fmt) { @@ -1872,7 +1880,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, btf_verifier_log(env, "\t%s val=%d\n", - btf_name_by_offset(btf, enums[i].name_off), + __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -1896,7 +1904,8 @@ static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, for (i = 0; i < nr_enums; i++) { if (v == enums[i].val) { seq_printf(m, "%s", - btf_name_by_offset(btf, enums[i].name_off)); + __btf_name_by_offset(btf, + enums[i].name_off)); return; } } @@ -1954,20 +1963,20 @@ static void btf_func_proto_log(struct btf_verifier_env *env, } btf_verifier_log(env, "%u %s", args[0].type, - btf_name_by_offset(env->btf, - args[0].name_off)); + __btf_name_by_offset(env->btf, + args[0].name_off)); for (i = 1; i < nr_args - 1; i++) btf_verifier_log(env, ", %u %s", args[i].type, - btf_name_by_offset(env->btf, - args[i].name_off)); + __btf_name_by_offset(env->btf, + args[i].name_off)); if (nr_args > 1) { const struct btf_param *last_arg = &args[nr_args - 1]; if (last_arg->type) btf_verifier_log(env, ", %u %s", last_arg->type, - btf_name_by_offset(env->btf, - last_arg->name_off)); + __btf_name_by_offset(env->btf, + last_arg->name_off)); else btf_verifier_log(env, ", vararg"); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8b511a4fe84a..89ce2613fdb0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4910,8 +4910,8 @@ static int check_btf_line(struct bpf_verifier_env *env, goto err_free; } - if (!btf_name_offset_valid(btf, linfo[i].line_off) || - !btf_name_offset_valid(btf, linfo[i].file_name_off)) { + if (!btf_name_by_offset(btf, linfo[i].line_off) || + !btf_name_by_offset(btf, linfo[i].file_name_off)) { verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); err = -EINVAL; goto err_free; -- cgit From d9762e84ede3eae9636f5dbbe0c8f0390d37e114 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 13 Dec 2018 10:41:48 -0800 Subject: bpf: verbose log bpf_line_info in verifier This patch adds bpf_line_info during the verifier's verbose. It can give error context for debug purpose. ~~~~~~~~~~ Here is the verbose log for backedge: while (a) { a += bpf_get_smp_processor_id(); bpf_trace_printk(fmt, sizeof(fmt), a); } ~> bpftool prog load ./test_loop.o /sys/fs/bpf/test_loop type tracepoint 13: while (a) { 3: a += bpf_get_smp_processor_id(); back-edge from insn 13 to 3 ~~~~~~~~~~ Here is the verbose log for invalid pkt access: Modification to test_xdp_noinline.c: data = (void *)(long)xdp->data; data_end = (void *)(long)xdp->data_end; /* if (data + 4 > data_end) return XDP_DROP; */ *(u32 *)data = dst->dst; ~> bpftool prog load ./test_xdp_noinline.o /sys/fs/bpf/test_xdp_noinline type xdp ; data = (void *)(long)xdp->data; 224: (79) r2 = *(u64 *)(r10 -112) 225: (61) r2 = *(u32 *)(r2 +0) ; *(u32 *)data = dst->dst; 226: (63) *(u32 *)(r2 +0) = r1 invalid access to packet, off=0 size=4, R2(id=0,off=0,r=0) R2 offset is outside of the packet Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 74 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c736945be7c5..548dcbdb7111 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -224,6 +224,7 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ + const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 89ce2613fdb0..ba8e3134bbc2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "disasm.h" @@ -216,6 +217,27 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); +static const struct bpf_line_info * +find_linfo(const struct bpf_verifier_env *env, u32 insn_off) +{ + const struct bpf_line_info *linfo; + const struct bpf_prog *prog; + u32 i, nr_linfo; + + prog = env->prog; + nr_linfo = prog->aux->nr_linfo; + + if (!nr_linfo || insn_off >= prog->len) + return NULL; + + linfo = prog->aux->linfo; + for (i = 1; i < nr_linfo; i++) + if (insn_off < linfo[i].insn_off) + break; + + return &linfo[i - 1]; +} + void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args) { @@ -266,6 +288,42 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) va_end(args); } +static const char *ltrim(const char *s) +{ + while (isspace(*s)) + s++; + + return s; +} + +__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env, + u32 insn_off, + const char *prefix_fmt, ...) +{ + const struct bpf_line_info *linfo; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + linfo = find_linfo(env, insn_off); + if (!linfo || linfo == env->prev_linfo) + return; + + if (prefix_fmt) { + va_list args; + + va_start(args, prefix_fmt); + bpf_verifier_vlog(&env->log, prefix_fmt, args); + va_end(args); + } + + verbose(env, "%s\n", + ltrim(btf_name_by_offset(env->prog->aux->btf, + linfo->line_off))); + + env->prev_linfo = linfo; +} + static bool type_is_pkt_pointer(enum bpf_reg_type type) { return type == PTR_TO_PACKET || @@ -4561,6 +4619,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { + verbose_linfo(env, t, "%d: ", t); verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -4578,6 +4637,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + verbose_linfo(env, t, "%d: ", t); + verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { @@ -4600,10 +4661,6 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; - ret = check_subprogs(env); - if (ret < 0) - return ret; - insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; @@ -5448,6 +5505,8 @@ static int do_check(struct bpf_verifier_env *env) int insn_processed = 0; bool do_print_state = false; + env->prev_linfo = NULL; + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); if (!state) return -ENOMEM; @@ -5521,6 +5580,7 @@ static int do_check(struct bpf_verifier_env *env) .private_data = env, }; + verbose_linfo(env, insn_idx, "; "); verbose(env, "%d: ", insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } @@ -6755,7 +6815,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - ret = check_cfg(env); + ret = check_subprogs(env); if (ret < 0) goto skip_full_check; @@ -6763,6 +6823,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; + ret = check_cfg(env); + if (ret < 0) + goto skip_full_check; + ret = do_check(env); if (env->cur_state) { free_verifier_state(env->cur_state, true); -- cgit From b233920c97a6201eb47fbafd78365c6946e6d7b6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Dec 2018 11:42:31 -0800 Subject: bpf: speed up stacksafe check Don't check the same stack liveness condition 8 times. once is enough. Signed-off-by: Alexei Starovoitov Acked-by: Edward Cree Acked-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ba8e3134bbc2..0c6deb3f2be4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5204,9 +5204,11 @@ static bool stacksafe(struct bpf_func_state *old, for (i = 0; i < old->allocated_stack; i++) { spi = i / BPF_REG_SIZE; - if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) { + i += BPF_REG_SIZE - 1; /* explored state didn't use this */ continue; + } if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; -- cgit From 19e2dbb7dd978d24505e918ac54d6f7dfdc88b1d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Dec 2018 11:42:33 -0800 Subject: bpf: improve stacksafe state comparison "if (old->allocated_stack > cur->allocated_stack)" check is too conservative. In some cases explored stack could have allocated more space, but that stack space was not live. The test case improves from 19 to 15 processed insns and improvement on real programs is significant as well: before after bpf_lb-DLB_L3.o 1940 1831 bpf_lb-DLB_L4.o 3089 3029 bpf_lb-DUNKNOWN.o 1065 1064 bpf_lxc-DDROP_ALL.o 28052 26309 bpf_lxc-DUNKNOWN.o 35487 33517 bpf_netdev.o 10864 9713 bpf_overlay.o 6643 6184 bpf_lcx_jit.o 38437 37335 Signed-off-by: Alexei Starovoitov Acked-by: Edward Cree Acked-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 13 +++++++------ tools/testing/selftests/bpf/test_verifier.c | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0c6deb3f2be4..e4724fe8120f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5191,12 +5191,6 @@ static bool stacksafe(struct bpf_func_state *old, { int i, spi; - /* if explored stack has more populated slots than current stack - * such stacks are not equivalent - */ - if (old->allocated_stack > cur->allocated_stack) - return false; - /* walk slots of the explored stack and ignore any additional * slots in the current stack, since explored(safe) state * didn't use them @@ -5212,6 +5206,13 @@ static bool stacksafe(struct bpf_func_state *old, if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; + + /* explored stack has more populated slots than current stack + * and these slots were used + */ + if (i >= cur->allocated_stack) + return false; + /* if old state was safe with misc data in the stack * it will be safe with zero-initialized stack. * The opposite is not true diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 82359cdbc805..f9de7fe0c26d 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -13647,6 +13647,28 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, }, + { + "allocated_stack", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), + BPF_ALU64_REG(BPF_MOV, BPF_REG_7, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, -8), + BPF_STX_MEM(BPF_B, BPF_REG_10, BPF_REG_7, -9), + BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_10, -9), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .result_unpriv = ACCEPT, + .insn_processed = 15, + }, { "reference tracking in call: free reference in subprog and outside", .insns = { -- cgit From 9242b5f5615c823bfc1e9aea284617ff25a55f10 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Dec 2018 11:42:34 -0800 Subject: bpf: add self-check logic to liveness analysis Introduce REG_LIVE_DONE to check the liveness propagation and prepare the states for merging. See algorithm description in clean_live_states(). Signed-off-by: Alexei Starovoitov Acked-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 108 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 548dcbdb7111..c233efc106c6 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -38,6 +38,7 @@ enum bpf_reg_liveness { REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */ REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */ + REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */ }; struct bpf_reg_state { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e4724fe8120f..0125731e2512 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -397,12 +397,14 @@ static char slot_type_char[] = { static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) verbose(env, "_"); if (live & REG_LIVE_READ) verbose(env, "r"); if (live & REG_LIVE_WRITTEN) verbose(env, "w"); + if (live & REG_LIVE_DONE) + verbose(env, "D"); } static struct bpf_func_state *func(struct bpf_verifier_env *env, @@ -1132,6 +1134,12 @@ static int mark_reg_read(struct bpf_verifier_env *env, /* if read wasn't screened by an earlier write ... */ if (writes && state->live & REG_LIVE_WRITTEN) break; + if (parent->live & REG_LIVE_DONE) { + verbose(env, "verifier BUG type %s var_off %lld off %d\n", + reg_type_str[parent->type], + parent->var_off.value, parent->off); + return -EFAULT; + } /* ... then we depend on parent's value */ parent->live |= REG_LIVE_READ; state = parent; @@ -5078,6 +5086,102 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) return false; } +static void clean_func_state(struct bpf_verifier_env *env, + struct bpf_func_state *st) +{ + enum bpf_reg_liveness live; + int i, j; + + for (i = 0; i < BPF_REG_FP; i++) { + live = st->regs[i].live; + /* liveness must not touch this register anymore */ + st->regs[i].live |= REG_LIVE_DONE; + if (!(live & REG_LIVE_READ)) + /* since the register is unused, clear its state + * to make further comparison simpler + */ + __mark_reg_not_init(&st->regs[i]); + } + + for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { + live = st->stack[i].spilled_ptr.live; + /* liveness must not touch this stack slot anymore */ + st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; + if (!(live & REG_LIVE_READ)) { + __mark_reg_not_init(&st->stack[i].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + st->stack[i].slot_type[j] = STACK_INVALID; + } + } +} + +static void clean_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + int i; + + if (st->frame[0]->regs[0].live & REG_LIVE_DONE) + /* all regs in this state in all frames were already marked */ + return; + + for (i = 0; i <= st->curframe; i++) + clean_func_state(env, st->frame[i]); +} + +/* the parentage chains form a tree. + * the verifier states are added to state lists at given insn and + * pushed into state stack for future exploration. + * when the verifier reaches bpf_exit insn some of the verifer states + * stored in the state lists have their final liveness state already, + * but a lot of states will get revised from liveness point of view when + * the verifier explores other branches. + * Example: + * 1: r0 = 1 + * 2: if r1 == 100 goto pc+1 + * 3: r0 = 2 + * 4: exit + * when the verifier reaches exit insn the register r0 in the state list of + * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch + * of insn 2 and goes exploring further. At the insn 4 it will walk the + * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ. + * + * Since the verifier pushes the branch states as it sees them while exploring + * the program the condition of walking the branch instruction for the second + * time means that all states below this branch were already explored and + * their final liveness markes are already propagated. + * Hence when the verifier completes the search of state list in is_state_visited() + * we can call this clean_live_states() function to mark all liveness states + * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state' + * will not be used. + * This function also clears the registers and stack for states that !READ + * to simplify state merging. + * + * Important note here that walking the same branch instruction in the callee + * doesn't meant that the states are DONE. The verifier has to compare + * the callsites + */ +static void clean_live_states(struct bpf_verifier_env *env, int insn, + struct bpf_verifier_state *cur) +{ + struct bpf_verifier_state_list *sl; + int i; + + sl = env->explored_states[insn]; + if (!sl) + return; + + while (sl != STATE_LIST_MARK) { + if (sl->state.curframe != cur->curframe) + goto next; + for (i = 0; i <= cur->curframe; i++) + if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) + goto next; + clean_verifier_state(env, &sl->state); +next: + sl = sl->next; + } +} + /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct idpair *idmap) @@ -5396,6 +5500,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) */ return 0; + clean_live_states(env, insn_idx, cur); + while (sl != STATE_LIST_MARK) { if (states_equal(env, &sl->state, cur)) { /* reached equivalent register/stack state, -- cgit From 723679339d087d79e36c0af67f4be84d866fee20 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 11 Dec 2018 20:00:51 +0900 Subject: kconfig: warn no new line at end of file It would be nice to warn if a new line is missing at end of file. We could do this by checkpatch.pl for arbitrary files, but new line is rather essential as a statement terminator in Kconfig. The warning message looks like this: kernel/Kconfig.preempt:60:warning: no new line at end of file Currently, kernel/Kconfig.preempt is the only file with no new line at end of file. Fix it. I know there are some false negative cases. For example, no warning is displayed when the last line contains some whitespaces/comments, but no new line. Yet, this commit works well for most cases. Signed-off-by: Masahiro Yamada --- kernel/Kconfig.preempt | 2 +- scripts/kconfig/zconf.l | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index cd1655122ec0..0fee5fe6c899 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -57,4 +57,4 @@ config PREEMPT endchoice config PREEMPT_COUNT - bool \ No newline at end of file + bool diff --git a/scripts/kconfig/zconf.l b/scripts/kconfig/zconf.l index 9038e9736bf0..8e856f9e6da9 100644 --- a/scripts/kconfig/zconf.l +++ b/scripts/kconfig/zconf.l @@ -261,6 +261,10 @@ n [A-Za-z0-9_-] <> { BEGIN(INITIAL); + if (prev_token != T_EOL && prev_token != T_HELPTEXT) + fprintf(stderr, "%s:%d:warning: no new line at end of file\n", + current_file->name, yylineno); + if (current_file) { zconf_endfile(); return T_EOL; -- cgit From 0e334db6bb4b1fd1e2d72c1f3d8f004313cd9f94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Dec 2018 13:31:05 +0100 Subject: posix-timers: Fix division by zero bug The signal delivery path of posix-timers can try to rearm the timer even if the interval is zero. That's handled for the common case (hrtimer) but not for alarm timers. In that case the forwarding function raises a division by zero exception. The handling for hrtimer based posix timers is wrong because it marks the timer as active despite the fact that it is stopped. Move the check from common_hrtimer_rearm() to posixtimer_rearm() to cure both issues. Reported-by: syzbot+9d38bedac9cc77b8ad5e@syzkaller.appspotmail.com Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: sboyd@kernel.org Cc: stable@vger.kernel.org Cc: syzkaller-bugs@googlegroups.com Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1812171328050.1880@nanos.tec.linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/posix-timers.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index bd62b5eeb5a0..31f49ae80f43 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -289,9 +289,6 @@ static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; - if (!timr->it_interval) - return; - timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), timr->it_interval); hrtimer_restart(timer); @@ -317,7 +314,7 @@ void posixtimer_rearm(struct kernel_siginfo *info) if (!timr) return; - if (timr->it_requeue_pending == info->si_sys_private) { + if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) { timr->kclock->timer_rearm(timr); timr->it_active = 1; -- cgit From fb1a59fae8baa3f3c69b72a87ff94fc4fa5683ec Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 17 Dec 2018 17:20:55 +0900 Subject: kprobes: Blacklist symbols in arch-defined prohibited area Blacklist symbols in arch-defined probe-prohibited areas. With this change, user can see all symbols which are prohibited to probe in debugfs. All archtectures which have custom prohibit areas should define its own arch_populate_kprobe_blacklist() function, but unless that, all symbols marked __kprobes are blacklisted. Reported-by: Andrea Righi Tested-by: Andrea Righi Signed-off-by: Masami Hiramatsu Cc: Andy Lutomirski Cc: Anil S Keshavamurthy Cc: Borislav Petkov Cc: David S. Miller Cc: Linus Torvalds Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yonghong Song Link: http://lkml.kernel.org/r/154503485491.26176.15823229545155174796.stgit@devbox Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 3 +++ kernel/kprobes.c | 67 ++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e909413e4e38..5da8a1de2187 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -242,10 +242,13 @@ extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); extern void kprobes_inc_nmissed_count(struct kprobe *p); extern bool arch_within_kprobe_blacklist(unsigned long addr); +extern int arch_populate_kprobe_blacklist(void); extern bool arch_kprobe_on_func_entry(unsigned long offset); extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset); extern bool within_kprobe_blacklist(unsigned long addr); +extern int kprobe_add_ksym_blacklist(unsigned long entry); +extern int kprobe_add_area_blacklist(unsigned long start, unsigned long end); struct kprobe_insn_cache { struct mutex mutex; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 90e98e233647..90569aec0f24 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2093,6 +2093,47 @@ void dump_kprobe(struct kprobe *kp) } NOKPROBE_SYMBOL(dump_kprobe); +int kprobe_add_ksym_blacklist(unsigned long entry) +{ + struct kprobe_blacklist_entry *ent; + unsigned long offset = 0, size = 0; + + if (!kernel_text_address(entry) || + !kallsyms_lookup_size_offset(entry, &size, &offset)) + return -EINVAL; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->start_addr = entry; + ent->end_addr = entry + size; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &kprobe_blacklist); + + return (int)size; +} + +/* Add all symbols in given area into kprobe blacklist */ +int kprobe_add_area_blacklist(unsigned long start, unsigned long end) +{ + unsigned long entry; + int ret = 0; + + for (entry = start; entry < end; entry += ret) { + ret = kprobe_add_ksym_blacklist(entry); + if (ret < 0) + return ret; + if (ret == 0) /* In case of alias symbol */ + ret = 1; + } + return 0; +} + +int __init __weak arch_populate_kprobe_blacklist(void) +{ + return 0; +} + /* * Lookup and populate the kprobe_blacklist. * @@ -2104,26 +2145,24 @@ NOKPROBE_SYMBOL(dump_kprobe); static int __init populate_kprobe_blacklist(unsigned long *start, unsigned long *end) { + unsigned long entry; unsigned long *iter; - struct kprobe_blacklist_entry *ent; - unsigned long entry, offset = 0, size = 0; + int ret; for (iter = start; iter < end; iter++) { entry = arch_deref_entry_point((void *)*iter); - - if (!kernel_text_address(entry) || - !kallsyms_lookup_size_offset(entry, &size, &offset)) + ret = kprobe_add_ksym_blacklist(entry); + if (ret == -EINVAL) continue; - - ent = kmalloc(sizeof(*ent), GFP_KERNEL); - if (!ent) - return -ENOMEM; - ent->start_addr = entry; - ent->end_addr = entry + size; - INIT_LIST_HEAD(&ent->list); - list_add_tail(&ent->list, &kprobe_blacklist); + if (ret < 0) + return ret; } - return 0; + + /* Symbols in __kprobes_text are blacklisted */ + ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start, + (unsigned long)__kprobes_text_end); + + return ret ? : arch_populate_kprobe_blacklist(); } /* Module notifier call back, checking kprobes on the module */ -- cgit From c92a54cfa0257e8ffd66b2a17d49e9c0bd4b769f Mon Sep 17 00:00:00 2001 From: "Lendacky, Thomas" Date: Mon, 17 Dec 2018 14:39:16 +0000 Subject: dma-direct: do not include SME mask in the DMA supported check The dma_direct_supported() function intends to check the DMA mask against specific values. However, the phys_to_dma() function includes the SME encryption mask, which defeats the intended purpose of the check. This results in drivers that support less than 48-bit DMA (SME encryption mask is bit 47) from being able to set the DMA mask successfully when SME is active, which results in the driver failing to initialize. Change the function used to check the mask from phys_to_dma() to __phys_to_dma() so that the SME encryption mask is not part of the check. Fixes: c1d0af1a1d5d ("kernel/dma/direct: take DMA offset into account in dma_direct_supported") Signed-off-by: Tom Lendacky Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 22a12ab5a5e9..375c77e8d52f 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -309,7 +309,12 @@ int dma_direct_supported(struct device *dev, u64 mask) min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); - return mask >= phys_to_dma(dev, min_mask); + /* + * This check needs to be against the actual bit mask value, so + * use __phys_to_dma() here so that the SME encryption mask isn't + * part of the check. + */ + return mask >= __phys_to_dma(dev, min_mask); } int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr) -- cgit From 6c4fc209fcf9d27efbaa48368773e4d2bfbd59aa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 16 Dec 2018 00:49:47 +0100 Subject: bpf: remove useless version check for prog load Existing libraries and tracing frameworks work around this kernel version check by automatically deriving the kernel version from uname(3) or similar such that the user does not need to do it manually; these workarounds also make the version check useless at the same time. Moreover, most other BPF tracing types enabling bpf_probe_read()-like functionality have /not/ adapted this check, and in general these days it is well understood anyway that all the tracing programs are not stable with regards to future kernels as kernel internal data structures are subject to change from release to release. Back at last netconf we discussed [0] and agreed to remove this check from bpf_prog_load() and instead document it here in the uapi header that there is no such guarantee for stable API for these programs. [0] http://vger.kernel.org/netconf2018_files/DanielBorkmann_netconf2018.pdf Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Quentin Monnet Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 10 +++++++++- kernel/bpf/syscall.c | 5 ----- tools/include/uapi/linux/bpf.h | 10 +++++++++- 3 files changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e7d57e89f25f..1d324c2cbca2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,14 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK, }; +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + */ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, @@ -343,7 +351,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ - __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 kern_version; /* not used */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6ae062f1cf20..5db31067d85e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1473,11 +1473,6 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) return -E2BIG; - - if (type == BPF_PROG_TYPE_KPROBE && - attr->kern_version != LINUX_VERSION_CODE) - return -EINVAL; - if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && !capable(CAP_SYS_ADMIN)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e7d57e89f25f..1d324c2cbca2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -133,6 +133,14 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK, }; +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + */ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, @@ -343,7 +351,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ - __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 kern_version; /* not used */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ -- cgit From f97be3ab044cf6dd342fed7668c977ba07a7cd95 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Dec 2018 22:13:50 -0800 Subject: bpf: btf: refactor btf_int_bits_seq_show() Refactor function btf_int_bits_seq_show() by creating function btf_bitfield_seq_show() which has no dependence on btf and btf_type. The function btf_bitfield_seq_show() will be in later patch to directly dump bitfield member values. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8fa0bf1c33fd..72caa799e82f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1068,26 +1068,16 @@ static void btf_int_log(struct btf_verifier_env *env, btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } -static void btf_int_bits_seq_show(const struct btf *btf, - const struct btf_type *t, - void *data, u8 bits_offset, - struct seq_file *m) +static void btf_bitfield_seq_show(void *data, u8 bits_offset, + u8 nr_bits, struct seq_file *m) { u16 left_shift_bits, right_shift_bits; - u32 int_data = btf_type_int(t); - u8 nr_bits = BTF_INT_BITS(int_data); - u8 total_bits_offset; u8 nr_copy_bytes; u8 nr_copy_bits; u64 print_num; - /* - * bits_offset is at most 7. - * BTF_INT_OFFSET() cannot exceed 64 bits. - */ - total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); - data += BITS_ROUNDDOWN_BYTES(total_bits_offset); - bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + data += BITS_ROUNDDOWN_BYTES(bits_offset); + bits_offset = BITS_PER_BYTE_MASKED(bits_offset); nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); @@ -1107,6 +1097,23 @@ static void btf_int_bits_seq_show(const struct btf *btf, seq_printf(m, "0x%llx", print_num); } +static void btf_int_bits_seq_show(const struct btf *btf, + const struct btf_type *t, + void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u8 nr_bits = BTF_INT_BITS(int_data); + u8 total_bits_offset; + + /* + * bits_offset is at most 7. + * BTF_INT_OFFSET() cannot exceed 64 bits. + */ + total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); + btf_bitfield_seq_show(data, total_bits_offset, nr_bits, m); +} + static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) -- cgit From 9d5f9f701b1891466fb3dbb1806ad97716f95cc3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Dec 2018 22:13:51 -0800 Subject: bpf: btf: fix struct/union/fwd types with kind_flag This patch fixed two issues with BTF. One is related to struct/union bitfield encoding and the other is related to forward type. Issue #1 and solution: ====================== Current btf encoding of bitfield follows what pahole generates. For each bitfield, pahole will duplicate the type chain and put the bitfield size at the final int or enum type. Since the BTF enum type cannot encode bit size, pahole workarounds the issue by generating an int type whenever the enum bit size is not 32. For example, -bash-4.4$ cat t.c typedef int ___int; enum A { A1, A2, A3 }; struct t { int a[5]; ___int b:4; volatile enum A c:4; } g; -bash-4.4$ gcc -c -O2 -g t.c The current kernel supports the following BTF encoding: $ pahole -JV t.o [1] TYPEDEF ___int type_id=2 [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [3] ENUM A size=4 vlen=3 A1 val=0 A2 val=1 A3 val=2 [4] STRUCT t size=24 vlen=3 a type_id=5 bits_offset=0 b type_id=9 bits_offset=160 c type_id=11 bits_offset=164 [5] ARRAY (anon) type_id=2 index_type_id=2 nr_elems=5 [6] INT sizetype size=8 bit_offset=0 nr_bits=64 encoding=(none) [7] VOLATILE (anon) type_id=3 [8] INT int size=1 bit_offset=0 nr_bits=4 encoding=(none) [9] TYPEDEF ___int type_id=8 [10] INT (anon) size=1 bit_offset=0 nr_bits=4 encoding=SIGNED [11] VOLATILE (anon) type_id=10 Two issues are in the above: . by changing enum type to int, we lost the original type information and this will not be ideal later when we try to convert BTF to a header file. . the type duplication for bitfields will cause BTF bloat. Duplicated types cannot be deduplicated later if the bitfield size is different. To fix this issue, this patch implemented a compatible change for BTF struct type encoding: . the bit 31 of struct_type->info, previously reserved, now is used to indicate whether bitfield_size is encoded in btf_member or not. . if bit 31 of struct_type->info is set, btf_member->offset will encode like: bit 0 - 23: bit offset bit 24 - 31: bitfield size if bit 31 is not set, the old behavior is preserved: bit 0 - 31: bit offset So if the struct contains a bit field, the maximum bit offset will be reduced to (2^24 - 1) instead of MAX_UINT. The maximum bitfield size will be 256 which is enough for today as maximum bitfield in compiler can be 128 where int128 type is supported. This kernel patch intends to support the new BTF encoding: $ pahole -JV t.o [1] TYPEDEF ___int type_id=2 [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [3] ENUM A size=4 vlen=3 A1 val=0 A2 val=1 A3 val=2 [4] STRUCT t kind_flag=1 size=24 vlen=3 a type_id=5 bitfield_size=0 bits_offset=0 b type_id=1 bitfield_size=4 bits_offset=160 c type_id=7 bitfield_size=4 bits_offset=164 [5] ARRAY (anon) type_id=2 index_type_id=2 nr_elems=5 [6] INT sizetype size=8 bit_offset=0 nr_bits=64 encoding=(none) [7] VOLATILE (anon) type_id=3 Issue #2 and solution: ====================== Current forward type in BTF does not specify whether the original type is struct or union. This will not work for type pretty print and BTF-to-header-file conversion as struct/union must be specified. $ cat tt.c struct t; union u; int foo(struct t *t, union u *u) { return 0; } $ gcc -c -g -O2 tt.c $ pahole -JV tt.o [1] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [2] FWD t type_id=0 [3] PTR (anon) type_id=2 [4] FWD u type_id=0 [5] PTR (anon) type_id=4 To fix this issue, similar to issue #1, type->info bit 31 is used. If the bit is set, it is union type. Otherwise, it is a struct type. $ pahole -JV tt.o [1] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [2] FWD t kind_flag=0 type_id=0 [3] PTR (anon) kind_flag=0 type_id=2 [4] FWD u kind_flag=1 type_id=0 [5] PTR (anon) kind_flag=0 type_id=4 Pahole/LLVM change: =================== The new kind_flag functionality has been implemented in pahole and llvm: https://github.com/yonghong-song/pahole/tree/bitfield https://github.com/yonghong-song/llvm/tree/bitfield Note that pahole hasn't implemented func/func_proto kind and .BTF.ext. So to print function signature with bpftool, the llvm compiler should be used. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Acked-by: Martin KaFai Lau Signed-off-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 20 +++- kernel/bpf/btf.c | 280 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 278 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 14f66948fc95..7b7475ef2f17 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -34,7 +34,9 @@ struct btf_type { * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-31: unused + * bits 28-30: unused + * bit 31: kind_flag, currently used by + * struct, union and fwd */ __u32 info; /* "size" is used by INT, ENUM, STRUCT and UNION. @@ -52,6 +54,7 @@ struct btf_type { #define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KFLAG(info) ((info) >> 31) #define BTF_KIND_UNKN 0 /* Unknown */ #define BTF_KIND_INT 1 /* Integer */ @@ -110,9 +113,22 @@ struct btf_array { struct btf_member { __u32 name_off; __u32 type; - __u32 offset; /* offset in bits */ + /* If the type info kind_flag is set, the btf_member offset + * contains both member bitfield size and bit offset. The + * bitfield size is set for bitfield members. If the type + * info kind_flag is not set, the offset contains only bit + * offset. + */ + __u32 offset; }; +/* If the struct/union type info kind_flag is set, the + * following two macros are used to access bitfield_size + * and bit_offset from btf_member.offset. + */ +#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) +#define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) + /* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". * The exact number of btf_param is stored in the vlen (of the * info in "struct btf_type"). diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 72caa799e82f..93b6905e3a9b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -164,7 +164,7 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) -#define BTF_INFO_MASK 0x0f00ffff +#define BTF_INFO_MASK 0x8f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) @@ -274,6 +274,10 @@ struct btf_kind_operations { const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type); + int (*check_kflag_member)(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); void (*seq_show)(const struct btf *btf, const struct btf_type *t, @@ -419,6 +423,25 @@ static u16 btf_type_vlen(const struct btf_type *t) return BTF_INFO_VLEN(t->info); } +static bool btf_type_kflag(const struct btf_type *t) +{ + return BTF_INFO_KFLAG(t->info); +} + +static u32 btf_member_bit_offset(const struct btf_type *struct_type, + const struct btf_member *member) +{ + return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) + : member->offset; +} + +static u32 btf_member_bitfield_size(const struct btf_type *struct_type, + const struct btf_member *member) +{ + return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) + : 0; +} + static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); @@ -627,9 +650,17 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, if (env->phase != CHECK_META) btf_verifier_log_type(env, struct_type, NULL); - __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", - __btf_name_by_offset(btf, member->name_off), - member->type, member->offset); + if (btf_type_kflag(struct_type)) + __btf_verifier_log(log, + "\t%s type_id=%u bitfield_size=%u bits_offset=%u", + __btf_name_by_offset(btf, member->name_off), + member->type, + BTF_MEMBER_BITFIELD_SIZE(member->offset), + BTF_MEMBER_BIT_OFFSET(member->offset)); + else + __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", + __btf_name_by_offset(btf, member->name_off), + member->type, member->offset); if (fmt && *fmt) { __btf_verifier_log(log, " "); @@ -945,6 +976,38 @@ static int btf_df_check_member(struct btf_verifier_env *env, return -EINVAL; } +static int btf_df_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + btf_verifier_log_basic(env, struct_type, + "Unsupported check_kflag_member"); + return -EINVAL; +} + +/* Used for ptr, array and struct/union type members. + * int, enum and modifier types have their specific callback functions. + */ +static int btf_generic_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member bitfield_size"); + return -EINVAL; + } + + /* bitfield size is 0, so member->offset represents bit offset only. + * It is safe to call non kflag check_member variants. + */ + return btf_type_ops(member_type)->check_member(env, struct_type, + member, + member_type); +} + static int btf_df_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { @@ -997,6 +1060,62 @@ static int btf_int_check_member(struct btf_verifier_env *env, return 0; } +static int btf_int_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset; + u32 int_data = btf_type_int(member_type); + u32 struct_size = struct_type->size; + u32 nr_copy_bits; + + /* a regular int type is required for the kflag int member */ + if (!btf_type_int_is_regular(member_type)) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member base type"); + return -EINVAL; + } + + /* check sanity of bitfield size */ + nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); + struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); + nr_int_data_bits = BTF_INT_BITS(int_data); + if (!nr_bits) { + /* Not a bitfield member, member offset must be at byte + * boundary. + */ + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member offset"); + return -EINVAL; + } + + nr_bits = nr_int_data_bits; + } else if (nr_bits > nr_int_data_bits) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member bitfield_size"); + return -EINVAL; + } + + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); + if (nr_copy_bits > BITS_PER_U64) { + btf_verifier_log_member(env, struct_type, member, + "nr_copy_bits exceeds 64"); + return -EINVAL; + } + + if (struct_size < bytes_offset || + struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + static s32 btf_int_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -1016,6 +1135,11 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + int_data = btf_type_int(t); if (int_data & ~BTF_INT_MASK) { btf_verifier_log_basic(env, t, "Invalid int_data:%x", @@ -1097,6 +1221,7 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset, seq_printf(m, "0x%llx", print_num); } + static void btf_int_bits_seq_show(const struct btf *btf, const struct btf_type *t, void *data, u8 bits_offset, @@ -1163,6 +1288,7 @@ static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, .resolve = btf_df_resolve, .check_member = btf_int_check_member, + .check_kflag_member = btf_int_check_kflag_member, .log_details = btf_int_log, .seq_show = btf_int_seq_show, }; @@ -1192,6 +1318,31 @@ static int btf_modifier_check_member(struct btf_verifier_env *env, resolved_type); } +static int btf_modifier_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + const struct btf_type *resolved_type; + u32 resolved_type_id = member->type; + struct btf_member resolved_member; + struct btf *btf = env->btf; + + resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); + if (!resolved_type) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member"); + return -EINVAL; + } + + resolved_member = *member; + resolved_member.type = resolved_type_id; + + return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type, + &resolved_member, + resolved_type); +} + static int btf_ptr_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, @@ -1227,6 +1378,11 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + if (!BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; @@ -1380,6 +1536,7 @@ static struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, + .check_kflag_member = btf_modifier_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_modifier_seq_show, }; @@ -1388,6 +1545,7 @@ static struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, + .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_ptr_seq_show, }; @@ -1422,6 +1580,7 @@ static struct btf_kind_operations fwd_ops = { .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_df_seq_show, }; @@ -1480,6 +1639,11 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + if (t->size) { btf_verifier_log_type(env, t, "size != 0"); return -EINVAL; @@ -1603,6 +1767,7 @@ static struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, + .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_array_log, .seq_show = btf_array_seq_show, }; @@ -1641,6 +1806,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, u32 meta_needed, last_offset; struct btf *btf = env->btf; u32 struct_size = t->size; + u32 offset; u16 i; meta_needed = btf_type_vlen(t) * sizeof(*member); @@ -1682,7 +1848,8 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (is_union && member->offset) { + offset = btf_member_bit_offset(t, member); + if (is_union && offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; @@ -1692,20 +1859,20 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, * ">" instead of ">=" because the last member could be * "char a[0];" */ - if (last_offset > member->offset) { + if (last_offset > offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; } - if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { + if (BITS_ROUNDUP_BYTES(offset) > struct_size) { btf_verifier_log_member(env, t, member, "Member bits_offset exceeds its struct size"); return -EINVAL; } btf_verifier_log_member(env, t, member, NULL); - last_offset = member->offset; + last_offset = offset; } return meta_needed; @@ -1735,9 +1902,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env, last_member_type = btf_type_by_id(env->btf, last_member_type_id); - err = btf_type_ops(last_member_type)->check_member(env, v->t, - last_member, - last_member_type); + if (btf_type_kflag(v->t)) + err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t, + last_member, + last_member_type); + else + err = btf_type_ops(last_member_type)->check_member(env, v->t, + last_member, + last_member_type); if (err) return err; } @@ -1759,9 +1931,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env, return env_stack_push(env, member_type, member_type_id); } - err = btf_type_ops(member_type)->check_member(env, v->t, - member, - member_type); + if (btf_type_kflag(v->t)) + err = btf_type_ops(member_type)->check_kflag_member(env, v->t, + member, + member_type); + else + err = btf_type_ops(member_type)->check_member(env, v->t, + member, + member_type); if (err) return err; } @@ -1789,17 +1966,26 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - u32 member_offset = member->offset; - u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); - u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset); const struct btf_kind_operations *ops; + u32 member_offset, bitfield_size; + u32 bytes_offset; + u8 bits8_offset; if (i) seq_puts(m, seq); - ops = btf_type_ops(member_type); - ops->seq_show(btf, member_type, member->type, - data + bytes_offset, bits8_offset, m); + member_offset = btf_member_bit_offset(t, member); + bitfield_size = btf_member_bitfield_size(t, member); + if (bitfield_size) { + btf_bitfield_seq_show(data, member_offset, + bitfield_size, m); + } else { + bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + bits8_offset = BITS_PER_BYTE_MASKED(member_offset); + ops = btf_type_ops(member_type); + ops->seq_show(btf, member_type, member->type, + data + bytes_offset, bits8_offset, m); + } } seq_puts(m, "}"); } @@ -1808,6 +1994,7 @@ static struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, + .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_struct_log, .seq_show = btf_struct_seq_show, }; @@ -1837,6 +2024,41 @@ static int btf_enum_check_member(struct btf_verifier_env *env, return 0; } +static int btf_enum_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off, nr_bits, bytes_end, struct_size; + u32 int_bitsize = sizeof(int) * BITS_PER_BYTE; + + struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); + nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); + if (!nr_bits) { + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + nr_bits = int_bitsize; + } else if (nr_bits > int_bitsize) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member bitfield_size"); + return -EINVAL; + } + + struct_size = struct_type->size; + bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits); + if (struct_size < bytes_end) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + static s32 btf_enum_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -1856,6 +2078,11 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + if (t->size != sizeof(int)) { btf_verifier_log_type(env, t, "Expected size:%zu", sizeof(int)); @@ -1924,6 +2151,7 @@ static struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, + .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, .seq_show = btf_enum_seq_show, }; @@ -1946,6 +2174,11 @@ static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); return meta_needed; @@ -2005,6 +2238,7 @@ static struct btf_kind_operations func_proto_ops = { * Hence, there is no btf_func_check_member(). */ .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_func_proto_log, .seq_show = btf_df_seq_show, }; @@ -2024,6 +2258,11 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); return 0; @@ -2033,6 +2272,7 @@ static struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_df_seq_show, }; -- cgit From ffa0c1cf59596fba54546ea828305acfcc2cf55e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Dec 2018 22:13:52 -0800 Subject: bpf: enable cgroup local storage map pretty print with kind_flag Commit 970289fc0a83 ("bpf: add bpffs pretty print for cgroup local storage maps") added bpffs pretty print for cgroup local storage maps. The commit worked for struct without kind_flag set. This patch refactored and made pretty print also work with kind_flag set for the struct. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 5 ++++- kernel/bpf/btf.c | 37 ++++++++++++++++++++++++++++--------- kernel/bpf/local_storage.c | 17 ++++------------- 3 files changed, 36 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index 58000d7e06e3..12502e25e767 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -7,6 +7,7 @@ #include struct btf; +struct btf_member; struct btf_type; union bpf_attr; @@ -46,7 +47,9 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); -bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size); +bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, + const struct btf_member *m, + u32 expected_offset, u32 expected_size); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 93b6905e3a9b..e804b26a0506 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -546,22 +546,41 @@ static bool btf_type_int_is_regular(const struct btf_type *t) } /* - * Check that given type is a regular int and has the expected size. + * Check that given struct member is a regular int with expected + * offset and size. */ -bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size) +bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, + const struct btf_member *m, + u32 expected_offset, u32 expected_size) { - u8 nr_bits, nr_bytes; - u32 int_data; + const struct btf_type *t; + u32 id, int_data; + u8 nr_bits; - if (!btf_type_is_int(t)) + id = m->type; + t = btf_type_id_size(btf, &id, NULL); + if (!t || !btf_type_is_int(t)) return false; int_data = btf_type_int(t); nr_bits = BTF_INT_BITS(int_data); - nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); - if (BITS_PER_BYTE_MASKED(nr_bits) || - BTF_INT_OFFSET(int_data) || - nr_bytes != expected_size) + if (btf_type_kflag(s)) { + u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset); + u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset); + + /* if kflag set, int should be a regular int and + * bit offset should be at byte boundary. + */ + return !bitfield_size && + BITS_ROUNDUP_BYTES(bit_offset) == expected_offset && + BITS_ROUNDUP_BYTES(nr_bits) == expected_size; + } + + if (BTF_INT_OFFSET(int_data) || + BITS_PER_BYTE_MASKED(m->offset) || + BITS_ROUNDUP_BYTES(m->offset) != expected_offset || + BITS_PER_BYTE_MASKED(nr_bits) || + BITS_ROUNDUP_BYTES(nr_bits) != expected_size) return false; return true; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 5eca03da0989..07a34ef562a0 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -315,9 +315,8 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - const struct btf_type *t; struct btf_member *m; - u32 id, size; + u32 offset, size; /* Key is expected to be of struct bpf_cgroup_storage_key type, * which is: @@ -338,25 +337,17 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, * The first field must be a 64 bit integer at 0 offset. */ m = (struct btf_member *)(key_type + 1); - if (m->offset) - return -EINVAL; - id = m->type; - t = btf_type_id_size(btf, &id, NULL); size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id); - if (!t || !btf_type_is_reg_int(t, size)) + if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) return -EINVAL; /* * The second field must be a 32 bit integer at 64 bit offset. */ m++; - if (m->offset != offsetof(struct bpf_cgroup_storage_key, attach_type) * - BITS_PER_BYTE) - return -EINVAL; - id = m->type; - t = btf_type_id_size(btf, &id, NULL); + offset = offsetof(struct bpf_cgroup_storage_key, attach_type); size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type); - if (!t || !btf_type_is_reg_int(t, size)) + if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) return -EINVAL; return 0; -- cgit From 07daef8b41e0d9e7802a448f6766504e7641a234 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sun, 9 Dec 2018 14:22:25 +0800 Subject: ntp: Remove duplicated include Signed-off-by: YueHaibing Signed-off-by: Thomas Gleixner Cc: Cc: Link: https://lkml.kernel.org/r/20181209062225.4344-1-yuehaibing@huawei.com --- kernel/time/ntp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c5e0cba3b39c..bc3a3c37ec9c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -17,7 +17,6 @@ #include #include #include -#include #include "ntp_internal.h" #include "timekeeping_internal.h" -- cgit From c5f48c0a7aa1a8c82d81cdf27e63aa0a5544c6e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Dec 2018 11:44:51 +0100 Subject: genirq: Fix various typos in comments Go over the IRQ subsystem source code (including irqchip drivers) and fix common typos in comments. No change in functionality intended. Signed-off-by: Ingo Molnar Cc: Thomas Gleixner Cc: Jason Cooper Cc: Marc Zyngier Cc: Peter Zijlstra Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org --- drivers/irqchip/irq-dw-apb-ictl.c | 2 +- drivers/irqchip/irq-gic.c | 6 +++--- drivers/irqchip/irq-renesas-h8s.c | 2 +- drivers/irqchip/irq-s3c24xx.c | 2 +- include/linux/irqchip.h | 4 ++-- kernel/irq/chip.c | 2 +- kernel/irq/ipi.c | 4 ++-- kernel/irq/manage.c | 2 +- kernel/irq/spurious.c | 6 +++--- 9 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/drivers/irqchip/irq-dw-apb-ictl.c b/drivers/irqchip/irq-dw-apb-ictl.c index 0a19618ce2c8..e4550e9c810b 100644 --- a/drivers/irqchip/irq-dw-apb-ictl.c +++ b/drivers/irqchip/irq-dw-apb-ictl.c @@ -105,7 +105,7 @@ static int __init dw_apb_ictl_init(struct device_node *np, * DW IP can be configured to allow 2-64 irqs. We can determine * the number of irqs supported by writing into enable register * and look for bits not set, as corresponding flip-flops will - * have been removed by sythesis tool. + * have been removed by synthesis tool. */ /* mask and enable all interrupts */ diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index ced10c44b68a..ba2a37a27a54 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -604,8 +604,8 @@ void gic_dist_save(struct gic_chip_data *gic) /* * Restores the GIC distributor registers during resume or when coming out of * idle. Must be called before enabling interrupts. If a level interrupt - * that occured while the GIC was suspended is still present, it will be - * handled normally, but any edge interrupts that occured will not be seen by + * that occurred while the GIC was suspended is still present, it will be + * handled normally, but any edge interrupts that occurred will not be seen by * the GIC and need to be handled by the platform-specific wakeup source. */ void gic_dist_restore(struct gic_chip_data *gic) @@ -899,7 +899,7 @@ void gic_migrate_target(unsigned int new_cpu_id) gic_cpu_map[cpu] = 1 << new_cpu_id; /* - * Find all the peripheral interrupts targetting the current + * Find all the peripheral interrupts targeting the current * CPU interface and migrate them to the new CPU interface. * We skip DIST_TARGET 0 to 7 as they are read-only. */ diff --git a/drivers/irqchip/irq-renesas-h8s.c b/drivers/irqchip/irq-renesas-h8s.c index 85234d456638..4e2461bae944 100644 --- a/drivers/irqchip/irq-renesas-h8s.c +++ b/drivers/irqchip/irq-renesas-h8s.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * H8S interrupt contoller driver + * H8S interrupt controller driver * * Copyright 2015 Yoshinori Sato */ diff --git a/drivers/irqchip/irq-s3c24xx.c b/drivers/irqchip/irq-s3c24xx.c index c19766fe8a1a..b623f300f1b1 100644 --- a/drivers/irqchip/irq-s3c24xx.c +++ b/drivers/irqchip/irq-s3c24xx.c @@ -58,7 +58,7 @@ struct s3c_irq_data { }; /* - * Sructure holding the controller data + * Structure holding the controller data * @reg_pending register holding pending irqs * @reg_intpnd special register intpnd in main intc * @reg_mask mask register diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h index 89c34b200671..950e4b2458f0 100644 --- a/include/linux/irqchip.h +++ b/include/linux/irqchip.h @@ -19,7 +19,7 @@ * the association between their DT compatible string and their * initialization function. * - * @name: name that must be unique accross all IRQCHIP_DECLARE of the + * @name: name that must be unique across all IRQCHIP_DECLARE of the * same file. * @compstr: compatible string of the irqchip driver * @fn: initialization function @@ -30,7 +30,7 @@ * This macro must be used by the different irqchip drivers to declare * the association between their version and their initialization function. * - * @name: name that must be unique accross all IRQCHIP_ACPI_DECLARE of the + * @name: name that must be unique across all IRQCHIP_ACPI_DECLARE of the * same file. * @subtable: Subtable to be identified in MADT * @validate: Function to be called on that subtable to check its validity. diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a2b3d9de999c..34e969069488 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -929,7 +929,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, break; /* * Bail out if the outer chip is not set up - * and the interrrupt supposed to be started + * and the interrupt supposed to be started * right away. */ if (WARN_ON(is_chained)) diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 8b778e37dc6d..43e3d1be622c 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -56,7 +56,7 @@ int irq_reserve_ipi(struct irq_domain *domain, unsigned int next; /* - * The IPI requires a seperate HW irq on each CPU. We require + * The IPI requires a separate HW irq on each CPU. We require * that the destination mask is consecutive. If an * implementation needs to support holes, it can reserve * several IPI ranges. @@ -172,7 +172,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) /* * Get the real hardware irq number if the underlying implementation - * uses a seperate irq per cpu. If the underlying implementation uses + * uses a separate irq per cpu. If the underlying implementation uses * a single hardware irq for all cpus then the IPI send mechanism * needs to take care of the cpu destinations. */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9dbdccab3b6a..a4888ce4667a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -915,7 +915,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif /* - * Interrupts which are not explicitely requested as threaded + * Interrupts which are not explicitly requested as threaded * interrupts rely on the implicit bh/preempt disable of the hard irq * context. So we need to disable bh here to avoid deadlocks and other * side effects. diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index d867d6ddafdd..6d2fa6914b30 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -66,7 +66,7 @@ static int try_one_irq(struct irq_desc *desc, bool force) raw_spin_lock(&desc->lock); /* - * PER_CPU, nested thread interrupts and interrupts explicitely + * PER_CPU, nested thread interrupts and interrupts explicitly * marked polled are excluded from polling. */ if (irq_settings_is_per_cpu(desc) || @@ -76,7 +76,7 @@ static int try_one_irq(struct irq_desc *desc, bool force) /* * Do not poll disabled interrupts unless the spurious - * disabled poller asks explicitely. + * disabled poller asks explicitly. */ if (irqd_irq_disabled(&desc->irq_data) && !force) goto out; @@ -292,7 +292,7 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) * So in case a thread is woken, we just note the fact and * defer the analysis to the next hardware interrupt. * - * The threaded handlers store whether they sucessfully + * The threaded handlers store whether they successfully * handled an interrupt and we check whether that number * changed versus the last invocation. * -- cgit From e11d4284e2f4de5048c6d1787c82226f0a198292 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Apr 2018 13:43:52 +0200 Subject: y2038: socket: Add compat_sys_recvmmsg_time64 recvmmsg() takes two arguments to pointers of structures that differ between 32-bit and 64-bit architectures: mmsghdr and timespec. For y2038 compatbility, we are changing the native system call from timespec to __kernel_timespec with a 64-bit time_t (in another patch), and use the existing compat system call on both 32-bit and 64-bit architectures for compatibility with traditional 32-bit user space. As we now have two variants of recvmmsg() for 32-bit tasks that are both different from the variant that we use on 64-bit tasks, this means we also require two compat system calls! The solution I picked is to flip things around: The existing compat_sys_recvmmsg() call gets moved from net/compat.c into net/socket.c and now handles the case for old user space on all architectures that have set CONFIG_COMPAT_32BIT_TIME. A new compat_sys_recvmmsg_time64() call gets added in the old place for 64-bit architectures only, this one handles the case of a compat mmsghdr structure combined with __kernel_timespec. In the indirect sys_socketcall(), we now need to call either do_sys_recvmmsg() or __compat_sys_recvmmsg(), depending on what kind of architecture we are on. For compat_sys_socketcall(), no such change is needed, we always call __compat_sys_recvmmsg(). I decided to not add a new SYS_RECVMMSG_TIME64 socketcall: Any libc implementation for 64-bit time_t will need significant changes including an updated asm/unistd.h, and it seems better to consistently use the separate syscalls that configuration, leaving the socketcall only for backward compatibility with 32-bit time_t based libc. The naming is asymmetric for the moment, so both existing syscalls entry points keep their names, while the new ones are recvmmsg_time32 and compat_recvmmsg_time64 respectively. I expect that we will rename the compat syscalls later as we start using generated syscall tables everywhere and add these entry points. Signed-off-by: Arnd Bergmann --- include/linux/compat.h | 3 +++ include/linux/socket.h | 9 ++++--- include/linux/syscalls.h | 3 +++ kernel/sys_ni.c | 2 ++ net/compat.c | 34 ++++++++++---------------- net/socket.c | 62 +++++++++++++++++++++++++++++++++++------------- 6 files changed, 72 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 8be8daa38c9a..4b0463608589 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -893,6 +893,9 @@ asmlinkage long compat_sys_move_pages(pid_t pid, compat_ulong_t nr_pages, asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); +asmlinkage long compat_sys_recvmmsg_time64(int fd, struct compat_mmsghdr __user *mmsg, + unsigned vlen, unsigned int flags, + struct __kernel_timespec __user *timeout); asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned vlen, unsigned int flags, struct old_timespec32 __user *timeout); diff --git a/include/linux/socket.h b/include/linux/socket.h index 8b571e9b9f76..333b5df8a1b2 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -348,7 +348,8 @@ struct ucred { extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); -struct timespec64; +struct __kernel_timespec; +struct old_timespec32; /* The __sys_...msg variants allow MSG_CMSG_COMPAT iff * forbid_cmsg_compat==false @@ -357,8 +358,10 @@ extern long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat); extern long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat); -extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, - unsigned int flags, struct timespec64 *timeout); +extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, + unsigned int vlen, unsigned int flags, + struct __kernel_timespec __user *timeout, + struct old_timespec32 __user *timeout32); extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, bool forbid_cmsg_compat); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 247ad9eca955..03cda6793be3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -843,6 +843,9 @@ asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int); asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg, unsigned int vlen, unsigned flags, struct __kernel_timespec __user *timeout); +asmlinkage long sys_recvmmsg_time32(int fd, struct mmsghdr __user *msg, + unsigned int vlen, unsigned flags, + struct old_timespec32 __user *timeout); asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, int options, struct rusage __user *ru); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index df556175be50..ab9d0e3c6d50 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -284,7 +284,9 @@ COND_SYSCALL_COMPAT(move_pages); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); COND_SYSCALL(recvmmsg); +COND_SYSCALL(recvmmsg_time32); COND_SYSCALL_COMPAT(recvmmsg); +COND_SYSCALL_COMPAT(recvmmsg_time64); /* * Architecture specific syscalls: see further below diff --git a/net/compat.c b/net/compat.c index 47a614b370cd..f7084780a8f8 100644 --- a/net/compat.c +++ b/net/compat.c @@ -810,34 +810,23 @@ COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len return __compat_sys_recvfrom(fd, buf, len, flags, addr, addrlen); } -static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, - unsigned int vlen, unsigned int flags, - struct old_timespec32 __user *timeout) +COMPAT_SYSCALL_DEFINE5(recvmmsg_time64, int, fd, struct compat_mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags, + struct __kernel_timespec __user *, timeout) { - int datagrams; - struct timespec64 ktspec; - - if (timeout == NULL) - return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, - flags | MSG_CMSG_COMPAT, NULL); - - if (compat_get_timespec64(&ktspec, timeout)) - return -EFAULT; - - datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, - flags | MSG_CMSG_COMPAT, &ktspec); - if (datagrams > 0 && compat_put_timespec64(&ktspec, timeout)) - datagrams = -EFAULT; - - return datagrams; + return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, + flags | MSG_CMSG_COMPAT, timeout, NULL); } +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct old_timespec32 __user *, timeout) { - return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); + return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, + flags | MSG_CMSG_COMPAT, NULL, timeout); } +#endif COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) { @@ -925,8 +914,9 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) ret = __compat_sys_recvmsg(a0, compat_ptr(a1), a[2]); break; case SYS_RECVMMSG: - ret = __compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3], - compat_ptr(a[4])); + ret = __sys_recvmmsg(a0, compat_ptr(a1), a[2], + a[3] | MSG_CMSG_COMPAT, NULL, + compat_ptr(a[4])); break; case SYS_ACCEPT4: ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]); diff --git a/net/socket.c b/net/socket.c index 593826e11a53..f137a96628f1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2341,8 +2341,9 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg, * Linux recvmmsg interface */ -int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, - unsigned int flags, struct timespec64 *timeout) +static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, + unsigned int vlen, unsigned int flags, + struct timespec64 *timeout) { int fput_needed, err, datagrams; struct socket *sock; @@ -2451,25 +2452,32 @@ out_put: return datagrams; } -static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, - unsigned int vlen, unsigned int flags, - struct __kernel_timespec __user *timeout) +int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, + unsigned int vlen, unsigned int flags, + struct __kernel_timespec __user *timeout, + struct old_timespec32 __user *timeout32) { int datagrams; struct timespec64 timeout_sys; - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; - - if (!timeout) - return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL); + if (timeout && get_timespec64(&timeout_sys, timeout)) + return -EFAULT; - if (get_timespec64(&timeout_sys, timeout)) + if (timeout32 && get_old_timespec32(&timeout_sys, timeout32)) return -EFAULT; - datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); + if (!timeout && !timeout32) + return do_recvmmsg(fd, mmsg, vlen, flags, NULL); + + datagrams = do_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); - if (datagrams > 0 && put_timespec64(&timeout_sys, timeout)) + if (datagrams <= 0) + return datagrams; + + if (timeout && put_timespec64(&timeout_sys, timeout)) + datagrams = -EFAULT; + + if (timeout32 && put_old_timespec32(&timeout_sys, timeout32)) datagrams = -EFAULT; return datagrams; @@ -2479,8 +2487,23 @@ SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct __kernel_timespec __user *, timeout) { - return do_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + + return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL); +} + +#ifdef CONFIG_COMPAT_32BIT_TIME +SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags, + struct old_timespec32 __user *, timeout) +{ + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + + return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout); } +#endif #ifdef __ARCH_WANT_SYS_SOCKETCALL /* Argument list sizes for sys_socketcall */ @@ -2600,8 +2623,15 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) a[2], true); break; case SYS_RECVMMSG: - err = do_sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], - a[3], (struct __kernel_timespec __user *)a[4]); + if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME)) + err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, + a[2], a[3], + (struct __kernel_timespec __user *)a[4], + NULL); + else + err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, + a[2], a[3], NULL, + (struct old_timespec32 __user *)a[4]); break; case SYS_ACCEPT4: err = __sys_accept4(a0, (struct sockaddr __user *)a1, -- cgit From df8522a340ee4ccb725036e1f9145f5646939aed Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Apr 2018 16:15:37 +0200 Subject: y2038: signal: Add sys_rt_sigtimedwait_time32 Once sys_rt_sigtimedwait() gets changed to a 64-bit time_t, we have to provide compatibility support for existing binaries. An earlier version of this patch reused the compat_sys_rt_sigtimedwait entry point to avoid code duplication, but this newer approach duplicates the existing native entry point instead, which seems a bit cleaner. Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 4 ++++ kernel/signal.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 03cda6793be3..251979d2e709 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -649,6 +649,10 @@ asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese, siginfo_t __user *uinfo, const struct __kernel_timespec __user *uts, size_t sigsetsize); +asmlinkage long sys_rt_sigtimedwait_time32(const sigset_t __user *uthese, + siginfo_t __user *uinfo, + const struct old_timespec32 __user *uts, + size_t sigsetsize); asmlinkage long sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo); /* kernel/sys.c */ diff --git a/kernel/signal.c b/kernel/signal.c index 3c8ea7a328e0..be6744cd0a11 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3332,6 +3332,39 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return ret; } +#ifdef CONFIG_COMPAT_32BIT_TIME +SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese, + siginfo_t __user *, uinfo, + const struct old_timespec32 __user *, uts, + size_t, sigsetsize) +{ + sigset_t these; + struct timespec64 ts; + kernel_siginfo_t info; + int ret; + + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&these, uthese, sizeof(these))) + return -EFAULT; + + if (uts) { + if (get_old_timespec32(&ts, uts)) + return -EFAULT; + } + + ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); + + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user(uinfo, &info)) + ret = -EFAULT; + } + + return ret; +} +#endif + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, -- cgit From 2367c4b5fa09b2947d03c5cd23d7bc0200b7fe4f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Apr 2018 16:18:35 +0200 Subject: y2038: signal: Add compat_sys_rt_sigtimedwait_time64 Now that 32-bit architectures have two variants of sys_rt_sigtimedwaid() for 32-bit and 64-bit time_t, we also need to have a second compat system call entry point on the corresponding 64-bit architectures. The traditional system call keeps getting handled by compat_sys_rt_sigtimedwait(), and this adds a new compat_sys_rt_sigtimedwait_time64() that differs only in the timeout argument type. The naming remains a bit asymmetric for the moment. Ideally we would want to have compat_sys_rt_sigtimedwait_time32() for the old version and compat_sys_rt_sigtimedwait() for the new one to mirror the names of the native entry points, but renaming the existing system call tables causes unnecessary churn. I would suggest renaming all such system calls together at a later point. Signed-off-by: Arnd Bergmann --- include/linux/compat.h | 3 +++ kernel/signal.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 4b0463608589..056be0d03722 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -788,6 +788,9 @@ asmlinkage long compat_sys_rt_sigpending(compat_sigset_t __user *uset, asmlinkage long compat_sys_rt_sigtimedwait(compat_sigset_t __user *uthese, struct compat_siginfo __user *uinfo, struct old_timespec32 __user *uts, compat_size_t sigsetsize); +asmlinkage long compat_sys_rt_sigtimedwait_time64(compat_sigset_t __user *uthese, + struct compat_siginfo __user *uinfo, + struct __kernel_timespec __user *uts, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigqueueinfo(compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); /* No generic prototype for rt_sigreturn */ diff --git a/kernel/signal.c b/kernel/signal.c index be6744cd0a11..53e07d97ffe0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3366,6 +3366,37 @@ SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese, #endif #ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese, + struct compat_siginfo __user *, uinfo, + struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize) +{ + sigset_t s; + struct timespec64 t; + kernel_siginfo_t info; + long ret; + + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (get_compat_sigset(&s, uthese)) + return -EFAULT; + + if (uts) { + if (get_timespec64(&t, uts)) + return -EFAULT; + } + + ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); + + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user32(uinfo, &info)) + ret = -EFAULT; + } + + return ret; +} + +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) @@ -3396,6 +3427,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, return ret; } #endif +#endif /** * sys_kill - send a signal to a process -- cgit From 926617889dc8383a120c66a2ecf7959a69f96950 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Aug 2018 14:15:23 +0200 Subject: timekeeping: remove unused {read,update}_persistent_clock After arch/sh has removed the last reference to these functions, we can remove them completely and just rely on the 64-bit time_t based versions. This cleans up a rather ugly use of __weak functions. Signed-off-by: Arnd Bergmann Acked-by: John Stultz --- include/linux/timekeeping32.h | 6 ------ kernel/time/ntp.c | 10 +--------- kernel/time/timekeeping.c | 12 ++---------- 3 files changed, 3 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h index a502616f7e1c..0036ff314ac5 100644 --- a/include/linux/timekeeping32.h +++ b/include/linux/timekeeping32.h @@ -52,10 +52,4 @@ static inline void getboottime(struct timespec *ts) *ts = timespec64_to_timespec(ts64); } -/* - * Persistent clock related interfaces - */ -extern void read_persistent_clock(struct timespec *ts); -extern int update_persistent_clock(struct timespec now); - #endif diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c5e0cba3b39c..e23be418d015 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -555,17 +555,9 @@ static void sync_rtc_clock(void) } #ifdef CONFIG_GENERIC_CMOS_UPDATE -int __weak update_persistent_clock(struct timespec now) -{ - return -ENODEV; -} - int __weak update_persistent_clock64(struct timespec64 now64) { - struct timespec now; - - now = timespec64_to_timespec(now64); - return update_persistent_clock(now); + return -ENODEV; } #endif diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2d110c948805..eb09be4871b3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1467,7 +1467,7 @@ u64 timekeeping_max_deferment(void) } /** - * read_persistent_clock - Return time from the persistent clock. + * read_persistent_clock64 - Return time from the persistent clock. * * Weak dummy function for arches that do not yet support it. * Reads the time from the battery backed persistent clock. @@ -1475,20 +1475,12 @@ u64 timekeeping_max_deferment(void) * * XXX - Do be sure to remove it once all arches implement it. */ -void __weak read_persistent_clock(struct timespec *ts) +void __weak read_persistent_clock64(struct timespec64 *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; } -void __weak read_persistent_clock64(struct timespec64 *ts64) -{ - struct timespec ts; - - read_persistent_clock(&ts); - *ts64 = timespec_to_timespec64(ts); -} - /** * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset * from the boot. -- cgit From 437e78d3fd6d35e6d56230962e6d03bb5dcda7f6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 7 Dec 2018 13:41:02 +0100 Subject: timekeeping: remove timespec_add/timespec_del The last users were removed a while ago since everyone moved to ktime_t, so we can remove the two unused interfaces for old timespec structures. With those two gone, set_normalized_timespec() is also unused, so remove that as well. Signed-off-by: Arnd Bergmann Acked-by: John Stultz --- include/linux/time32.h | 25 ------------------------- kernel/time/time.c | 36 ------------------------------------ 2 files changed, 61 deletions(-) (limited to 'kernel') diff --git a/include/linux/time32.h b/include/linux/time32.h index 61904a6c098f..118b9977080c 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -96,31 +96,6 @@ static inline int timespec_compare(const struct timespec *lhs, const struct time return lhs->tv_nsec - rhs->tv_nsec; } -extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); - -static inline struct timespec timespec_add(struct timespec lhs, - struct timespec rhs) -{ - struct timespec ts_delta; - - set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec, - lhs.tv_nsec + rhs.tv_nsec); - return ts_delta; -} - -/* - * sub = lhs - rhs, in normalized form - */ -static inline struct timespec timespec_sub(struct timespec lhs, - struct timespec rhs) -{ - struct timespec ts_delta; - - set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec, - lhs.tv_nsec - rhs.tv_nsec); - return ts_delta; -} - /* * Returns true if the timespec is norm, false if denorm: */ diff --git a/kernel/time/time.c b/kernel/time/time.c index ad204cf6d001..532bb560252d 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -386,42 +386,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -/** - * set_normalized_timespec - set timespec sec and nsec parts and normalize - * - * @ts: pointer to timespec variable to be set - * @sec: seconds to set - * @nsec: nanoseconds to set - * - * Set seconds and nanoseconds field of a timespec variable and - * normalize to the timespec storage format - * - * Note: The tv_nsec part is always in the range of - * 0 <= tv_nsec < NSEC_PER_SEC - * For negative values only the tv_sec field is negative ! - */ -void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec) -{ - while (nsec >= NSEC_PER_SEC) { - /* - * The following asm() prevents the compiler from - * optimising this loop into a modulo operation. See - * also __iter_div_u64_rem() in include/linux/time.h - */ - asm("" : "+rm"(nsec)); - nsec -= NSEC_PER_SEC; - ++sec; - } - while (nsec < 0) { - asm("" : "+rm"(nsec)); - nsec += NSEC_PER_SEC; - --sec; - } - ts->tv_sec = sec; - ts->tv_nsec = nsec; -} -EXPORT_SYMBOL(set_normalized_timespec); - /** * ns_to_timespec - Convert nanoseconds to timespec * @nsec: the nanoseconds value to be converted -- cgit From a38d1107f937ca95dcf820161ef44ea683d6a0b1 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Wed, 12 Dec 2018 16:42:37 -0800 Subject: bpf: support raw tracepoints in modules Distributions build drivers as modules, including network and filesystem drivers which export numerous tracepoints. This enables bpf(BPF_RAW_TRACEPOINT_OPEN) to attach to those tracepoints. Signed-off-by: Matt Mullins Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/module.h | 4 ++ include/linux/trace_events.h | 8 +++- kernel/bpf/syscall.c | 11 +++-- kernel/module.c | 5 +++ kernel/trace/bpf_trace.c | 99 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 120 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index fce6b4335e36..5f147dd5e709 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -432,6 +432,10 @@ struct module { unsigned int num_tracepoints; tracepoint_ptr_t *tracepoints_ptrs; #endif +#ifdef CONFIG_BPF_EVENTS + unsigned int num_bpf_raw_events; + struct bpf_raw_event_map *bpf_raw_events; +#endif #ifdef HAVE_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 4130a5497d40..8a62731673f7 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -471,7 +471,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr); @@ -502,10 +503,13 @@ static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf { return -EOPNOTSUPP; } -static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { return NULL; } +static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) +{ +} static inline int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5db31067d85e..0607db304def 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1604,6 +1604,7 @@ static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) bpf_probe_unregister(raw_tp->btp, raw_tp->prog); bpf_prog_put(raw_tp->prog); } + bpf_put_raw_tracepoint(raw_tp->btp); kfree(raw_tp); return 0; } @@ -1629,13 +1630,15 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) return -EFAULT; tp_name[sizeof(tp_name) - 1] = 0; - btp = bpf_find_raw_tracepoint(tp_name); + btp = bpf_get_raw_tracepoint(tp_name); if (!btp) return -ENOENT; raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); - if (!raw_tp) - return -ENOMEM; + if (!raw_tp) { + err = -ENOMEM; + goto out_put_btp; + } raw_tp->btp = btp; prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, @@ -1663,6 +1666,8 @@ out_put_prog: bpf_prog_put(prog); out_free_tp: kfree(raw_tp); +out_put_btp: + bpf_put_raw_tracepoint(btp); return err; } diff --git a/kernel/module.c b/kernel/module.c index 49a405891587..06ec68f08387 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3093,6 +3093,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->tracepoints_ptrs), &mod->num_tracepoints); #endif +#ifdef CONFIG_BPF_EVENTS + mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", + sizeof(*mod->bpf_raw_events), + &mod->num_bpf_raw_events); +#endif #ifdef HAVE_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", sizeof(*mod->jump_entries), diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9864a35c8bb5..9ddb6fddb4e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,6 +17,43 @@ #include "trace_probe.h" #include "trace.h" +#ifdef CONFIG_MODULES +struct bpf_trace_module { + struct module *module; + struct list_head list; +}; + +static LIST_HEAD(bpf_trace_modules); +static DEFINE_MUTEX(bpf_module_mutex); + +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) +{ + struct bpf_raw_event_map *btp, *ret = NULL; + struct bpf_trace_module *btm; + unsigned int i; + + mutex_lock(&bpf_module_mutex); + list_for_each_entry(btm, &bpf_trace_modules, list) { + for (i = 0; i < btm->module->num_bpf_raw_events; ++i) { + btp = &btm->module->bpf_raw_events[i]; + if (!strcmp(btp->tp->name, name)) { + if (try_module_get(btm->module)) + ret = btp; + goto out; + } + } + } +out: + mutex_unlock(&bpf_module_mutex); + return ret; +} +#else +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) +{ + return NULL; +} +#endif /* CONFIG_MODULES */ + u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -1076,7 +1113,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) extern struct bpf_raw_event_map __start__bpf_raw_tp[]; extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { struct bpf_raw_event_map *btp = __start__bpf_raw_tp; @@ -1084,7 +1121,16 @@ struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) if (!strcmp(btp->tp->name, name)) return btp; } - return NULL; + + return bpf_get_raw_tracepoint_module(name); +} + +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) +{ + struct module *mod = __module_address((unsigned long)btp); + + if (mod) + module_put(mod); } static __always_inline @@ -1222,3 +1268,52 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, return err; } + +#ifdef CONFIG_MODULES +int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) +{ + struct bpf_trace_module *btm, *tmp; + struct module *mod = module; + + if (mod->num_bpf_raw_events == 0 || + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + return 0; + + mutex_lock(&bpf_module_mutex); + + switch (op) { + case MODULE_STATE_COMING: + btm = kzalloc(sizeof(*btm), GFP_KERNEL); + if (btm) { + btm->module = module; + list_add(&btm->list, &bpf_trace_modules); + } + break; + case MODULE_STATE_GOING: + list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) { + if (btm->module == module) { + list_del(&btm->list); + kfree(btm); + break; + } + } + break; + } + + mutex_unlock(&bpf_module_mutex); + + return 0; +} + +static struct notifier_block bpf_module_nb = { + .notifier_call = bpf_event_notify, +}; + +int __init bpf_event_init(void) +{ + register_module_notifier(&bpf_module_nb); + return 0; +} + +fs_initcall(bpf_event_init); +#endif /* CONFIG_MODULES */ -- cgit From da791a667536bf8322042e38ca85d55a78d3c273 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Dec 2018 14:35:14 +0100 Subject: futex: Cure exit race Stefan reported, that the glibc tst-robustpi4 test case fails occasionally. That case creates the following race between sys_exit() and sys_futex_lock_pi(): CPU0 CPU1 sys_exit() sys_futex() do_exit() futex_lock_pi() exit_signals(tsk) No waiters: tsk->flags |= PF_EXITING; *uaddr == 0x00000PID mm_release(tsk) Set waiter bit exit_robust_list(tsk) { *uaddr = 0x80000PID; Set owner died attach_to_pi_owner() { *uaddr = 0xC0000000; tsk = get_task(PID); } if (!tsk->flags & PF_EXITING) { ... attach(); tsk->flags |= PF_EXITPIDONE; } else { if (!(tsk->flags & PF_EXITPIDONE)) return -EAGAIN; return -ESRCH; <--- FAIL } ESRCH is returned all the way to user space, which triggers the glibc test case assert. Returning ESRCH unconditionally is wrong here because the user space value has been changed by the exiting task to 0xC0000000, i.e. the FUTEX_OWNER_DIED bit is set and the futex PID value has been cleared. This is a valid state and the kernel has to handle it, i.e. taking the futex. Cure it by rereading the user space value when PF_EXITING and PF_EXITPIDONE is set in the task which 'owns' the futex. If the value has changed, let the kernel retry the operation, which includes all regular sanity checks and correctly handles the FUTEX_OWNER_DIED case. If it hasn't changed, then return ESRCH as there is no way to distinguish this case from malfunctioning user space. This happens when the exiting task did not have a robust list, the robust list was corrupted or the user space value in the futex was simply bogus. Reported-by: Stefan Liebler Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Heiko Carstens Cc: Darren Hart Cc: Ingo Molnar Cc: Sasha Levin Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=200467 Link: https://lkml.kernel.org/r/20181210152311.986181245@linutronix.de --- kernel/futex.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index f423f9b6577e..5cc8083a4c89 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1148,11 +1148,65 @@ out_error: return ret; } +static int handle_exit_race(u32 __user *uaddr, u32 uval, + struct task_struct *tsk) +{ + u32 uval2; + + /* + * If PF_EXITPIDONE is not yet set, then try again. + */ + if (tsk && !(tsk->flags & PF_EXITPIDONE)) + return -EAGAIN; + + /* + * Reread the user space value to handle the following situation: + * + * CPU0 CPU1 + * + * sys_exit() sys_futex() + * do_exit() futex_lock_pi() + * futex_lock_pi_atomic() + * exit_signals(tsk) No waiters: + * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID + * mm_release(tsk) Set waiter bit + * exit_robust_list(tsk) { *uaddr = 0x80000PID; + * Set owner died attach_to_pi_owner() { + * *uaddr = 0xC0000000; tsk = get_task(PID); + * } if (!tsk->flags & PF_EXITING) { + * ... attach(); + * tsk->flags |= PF_EXITPIDONE; } else { + * if (!(tsk->flags & PF_EXITPIDONE)) + * return -EAGAIN; + * return -ESRCH; <--- FAIL + * } + * + * Returning ESRCH unconditionally is wrong here because the + * user space value has been changed by the exiting task. + * + * The same logic applies to the case where the exiting task is + * already gone. + */ + if (get_futex_value_locked(&uval2, uaddr)) + return -EFAULT; + + /* If the user space value has changed, try again. */ + if (uval2 != uval) + return -EAGAIN; + + /* + * The exiting task did not have a robust list, the robust list was + * corrupted or the user space value in *uaddr is simply bogus. + * Give up and tell user space. + */ + return -ESRCH; +} + /* * Lookup the task for the TID provided from user space and attach to * it after doing proper sanity checks. */ -static int attach_to_pi_owner(u32 uval, union futex_key *key, +static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, struct futex_pi_state **ps) { pid_t pid = uval & FUTEX_TID_MASK; @@ -1162,12 +1216,15 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, /* * We are the first waiter - try to look up the real owner and attach * the new pi_state to it, but bail out when TID = 0 [1] + * + * The !pid check is paranoid. None of the call sites should end up + * with pid == 0, but better safe than sorry. Let the caller retry */ if (!pid) - return -ESRCH; + return -EAGAIN; p = find_get_task_by_vpid(pid); if (!p) - return -ESRCH; + return handle_exit_race(uaddr, uval, NULL); if (unlikely(p->flags & PF_KTHREAD)) { put_task_struct(p); @@ -1187,7 +1244,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, * set, we know that the task has finished the * cleanup: */ - int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; + int ret = handle_exit_race(uaddr, uval, p); raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); @@ -1244,7 +1301,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, * We are the first waiter - try to look up the owner based on * @uval and attach to it. */ - return attach_to_pi_owner(uval, key, ps); + return attach_to_pi_owner(uaddr, uval, key, ps); } static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) @@ -1352,7 +1409,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, * attach to the owner. If that fails, no harm done, we only * set the FUTEX_WAITERS bit in the user space variable. */ - return attach_to_pi_owner(uval, key, ps); + return attach_to_pi_owner(uaddr, newval, key, ps); } /** -- cgit From 0bae2d4d62d523f06ff1a8e88ce38b45400acd28 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Sat, 15 Dec 2018 03:34:40 -0500 Subject: bpf: correct slot_type marking logic to allow more stack slot sharing Verifier is supposed to support sharing stack slot allocated to ptr with SCALAR_VALUE for privileged program. However this doesn't happen for some cases. The reason is verifier is not clearing slot_type STACK_SPILL for all bytes, it only clears part of them, while verifier is using: slot_type[0] == STACK_SPILL as a convention to check one slot is ptr type. So, the consequence of partial clearing slot_type is verifier could treat a partially overridden ptr slot, which should now be a SCALAR_VALUE slot, still as ptr slot, and rejects some valid programs. Before this patch, test_xdp_noinline.o under bpf selftests, bpf_lxc.o and bpf_netdev.o under Cilium bpf repo, when built with -mattr=+alu32 are rejected due to this issue. After this patch, they all accepted. There is no processed insn number change before and after this patch on Cilium bpf programs. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Reviewed-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++++ tools/testing/selftests/bpf/test_verifier.c | 34 +++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0125731e2512..e0e77ffeefb8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1286,6 +1286,10 @@ static int check_stack_write(struct bpf_verifier_env *env, /* regular write of data into stack destroys any spilled ptr */ state->stack[spi].spilled_ptr.type = NOT_INIT; + /* Mark slots as STACK_MISC if they belonged to spilled ptr. */ + if (state->stack[spi].slot_type[0] == STACK_SPILL) + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack[spi].slot_type[i] = STACK_MISC; /* only mark the slot as written if all 8 bytes were written * otherwise read propagation may incorrectly stop too soon @@ -1303,6 +1307,7 @@ static int check_stack_write(struct bpf_verifier_env *env, register_is_null(&cur->regs[value_regno])) type = STACK_ZERO; + /* Mark slots affected by this stack write. */ for (i = 0; i < size; i++) state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index f9de7fe0c26d..cf242734e2eb 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1001,14 +1001,44 @@ static struct bpf_test tests[] = { BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), /* mess up with R1 pointer on stack */ BPF_ST_MEM(BPF_B, BPF_REG_10, -7, 0x23), - /* fill back into R0 should fail */ + /* fill back into R0 is fine for priv. + * R0 now becomes SCALAR_VALUE. + */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + /* Load from R0 should fail. */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 8), BPF_EXIT_INSN(), }, .errstr_unpriv = "attempt to corrupt spilled", - .errstr = "corrupted spill", + .errstr = "R0 invalid mem access 'inv", .result = REJECT, }, + { + "check corrupted spill/fill, LSB", + .insns = { + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_ST_MEM(BPF_H, BPF_REG_10, -8, 0xcafe), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "attempt to corrupt spilled", + .result_unpriv = REJECT, + .result = ACCEPT, + .retval = POINTER_VALUE, + }, + { + "check corrupted spill/fill, MSB", + .insns = { + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0x12345678), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "attempt to corrupt spilled", + .result_unpriv = REJECT, + .result = ACCEPT, + .retval = POINTER_VALUE, + }, { "invalid src register in STX", .insns = { -- cgit From 76c43ae84e3f455e0b460ed0c43799e018d09ee9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 18 Dec 2018 13:43:58 -0800 Subject: bpf: log struct/union attribute for forward type Current btf internal verbose logger logs fwd type as [2] FWD A type_id=0 where A is the type name. Commit 9d5f9f701b18 ("bpf: btf: fix struct/union/fwd types with kind_flag") introduced kind_flag which can be used to distinguish whether a forward type is a struct or union. Also, "type_id=0" does not carry any meaningful information for fwd type as btf_type.type = 0 is simply enforced during btf verification and is not used anywhere else. This commit changed the log to [2] FWD A struct if kind_flag = 0, or [2] FWD A union if kind_flag = 1. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e804b26a0506..715f9fcf4712 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1595,12 +1595,18 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env *env, return 0; } +static void btf_fwd_type_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct"); +} + static struct btf_kind_operations fwd_ops = { .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, - .log_details = btf_ref_type_log, + .log_details = btf_fwd_type_log, .seq_show = btf_df_seq_show, }; -- cgit From c2899c3470de04823870b28646981695c0046efe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 Dec 2018 16:06:53 +0100 Subject: genirq/affinity: Remove excess indentation Plus other coding style issues which stood out while staring at that code. Signed-off-by: Thomas Gleixner --- kernel/irq/affinity.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 08c904eb7279..e423bff1928c 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -95,11 +95,11 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, } static int __irq_build_affinity_masks(const struct irq_affinity *affd, - int startvec, int numvecs, int firstvec, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - struct cpumask *nmsk, - struct cpumask *masks) + int startvec, int numvecs, int firstvec, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, + struct cpumask *masks) { int n, nodes, cpus_per_vec, extra_vecs, done = 0; int last_affv = firstvec + numvecs; @@ -180,10 +180,10 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, cpumask_var_t nmsk, npresmsk; if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return ret; + return ret; if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) - goto fail; + goto fail; ret = 0; /* Stabilize the cpumasks */ @@ -212,7 +212,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, put_online_cpus(); if (nr_present < numvecs) - WARN_ON(nr_present + nr_others < numvecs); + WARN_ON(nr_present + nr_others < numvecs); free_cpumask_var(npresmsk); @@ -271,9 +271,9 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) ret = irq_build_affinity_masks(affd, curvec, this_vecs, curvec, node_to_cpumask, masks); if (ret) { - kfree(masks); - masks = NULL; - goto outnodemsk; + kfree(masks); + masks = NULL; + goto outnodemsk; } curvec += this_vecs; usedvecs += this_vecs; -- cgit From bec04037e4e484f41ee4d9409e40616874169d20 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Tue, 4 Dec 2018 23:51:20 +0800 Subject: genirq/core: Introduce struct irq_affinity_desc The interrupt affinity management uses straight cpumask pointers to convey the automatically assigned affinity masks for managed interrupts. The core interrupt descriptor allocation also decides based on the pointer being non NULL whether an interrupt is managed or not. Devices which use managed interrupts usually have two classes of interrupts: - Interrupts for multiple device queues - Interrupts for general device management Currently both classes are treated the same way, i.e. as managed interrupts. The general interrupts get the default affinity mask assigned while the device queue interrupts are spread out over the possible CPUs. Treating the general interrupts as managed is both a limitation and under certain circumstances a bug. Assume the following situation: default_irq_affinity = 4..7 So if CPUs 4-7 are offlined, then the core code will shut down the device management interrupts because the last CPU in their affinity mask went offline. It's also a limitation because it's desired to allow manual placement of the general device interrupts for various reasons. If they are marked managed then the interrupt affinity setting from both user and kernel space is disabled. To remedy that situation it's required to convey more information than the cpumasks through various interfaces related to interrupt descriptor allocation. Instead of adding yet another argument, create a new data structure 'irq_affinity_desc' which for now just contains the cpumask. This struct can be expanded to convey auxilliary information in the next step. No functional change, just preparatory work. [ tglx: Simplified logic and clarified changelog ] Suggested-by: Thomas Gleixner Suggested-by: Bjorn Helgaas Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: linux-pci@vger.kernel.org Cc: kashyap.desai@broadcom.com Cc: shivasharan.srikanteshwara@broadcom.com Cc: sumit.saxena@broadcom.com Cc: ming.lei@redhat.com Cc: hch@lst.de Cc: douliyang1@huawei.com Link: https://lkml.kernel.org/r/20181204155122.6327-2-douliyangs@gmail.com --- drivers/pci/msi.c | 9 ++++----- include/linux/interrupt.h | 14 ++++++++++++-- include/linux/irq.h | 6 ++++-- include/linux/irqdomain.h | 6 ++++-- include/linux/msi.h | 4 ++-- kernel/irq/affinity.c | 22 ++++++++++++---------- kernel/irq/devres.c | 4 ++-- kernel/irq/irqdesc.c | 17 +++++++++-------- kernel/irq/irqdomain.c | 4 ++-- kernel/irq/msi.c | 8 ++++---- 10 files changed, 55 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 265ed3e4c920..7a1c8a09efa5 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -534,14 +534,13 @@ error_attrs: static struct msi_desc * msi_setup_entry(struct pci_dev *dev, int nvec, const struct irq_affinity *affd) { - struct cpumask *masks = NULL; + struct irq_affinity_desc *masks = NULL; struct msi_desc *entry; u16 control; if (affd) masks = irq_create_affinity_masks(nvec, affd); - /* MSI Entry Initialization */ entry = alloc_msi_entry(&dev->dev, nvec, masks); if (!entry) @@ -672,7 +671,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, struct msix_entry *entries, int nvec, const struct irq_affinity *affd) { - struct cpumask *curmsk, *masks = NULL; + struct irq_affinity_desc *curmsk, *masks = NULL; struct msi_desc *entry; int ret, i; @@ -1264,7 +1263,7 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr) for_each_pci_msi_entry(entry, dev) { if (i == nr) - return entry->affinity; + return &entry->affinity->mask; i++; } WARN_ON_ONCE(1); @@ -1276,7 +1275,7 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr) nr >= entry->nvec_used)) return NULL; - return &entry->affinity[nr]; + return &entry->affinity[nr].mask; } else { return cpu_possible_mask; } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index ca397ff40836..c44b7844dc83 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -257,6 +257,14 @@ struct irq_affinity { int *sets; }; +/** + * struct irq_affinity_desc - Interrupt affinity descriptor + * @mask: cpumask to hold the affinity assignment + */ +struct irq_affinity_desc { + struct cpumask mask; +}; + #if defined(CONFIG_SMP) extern cpumask_var_t irq_default_affinity; @@ -303,7 +311,9 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); -struct cpumask *irq_create_affinity_masks(int nvec, const struct irq_affinity *affd); +struct irq_affinity_desc * +irq_create_affinity_masks(int nvec, const struct irq_affinity *affd); + int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd); #else /* CONFIG_SMP */ @@ -337,7 +347,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) return 0; } -static inline struct cpumask * +static inline struct irq_affinity_desc * irq_create_affinity_masks(int nvec, const struct irq_affinity *affd) { return NULL; diff --git a/include/linux/irq.h b/include/linux/irq.h index c9bffda04a45..def2b2aac8b1 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -27,6 +27,7 @@ struct seq_file; struct module; struct msi_msg; +struct irq_affinity_desc; enum irqchip_irq_state; /* @@ -834,11 +835,12 @@ struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner, const struct cpumask *affinity); + struct module *owner, + const struct irq_affinity_desc *affinity); int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, - const struct cpumask *affinity); + const struct irq_affinity_desc *affinity); /* use macros to avoid needing export.h for THIS_MODULE */ #define irq_alloc_descs(irq, from, cnt, node) \ diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 068aa46f0d55..35965f41d7be 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -43,6 +43,7 @@ struct irq_chip; struct irq_data; struct cpumask; struct seq_file; +struct irq_affinity_desc; /* Number of irqs reserved for a legacy isa controller */ #define NUM_ISA_INTERRUPTS 16 @@ -266,7 +267,7 @@ extern bool irq_domain_check_msi_remap(void); extern void irq_set_default_host(struct irq_domain *host); extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node, - const struct cpumask *affinity); + const struct irq_affinity_desc *affinity); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { @@ -449,7 +450,8 @@ static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, - bool realloc, const struct cpumask *affinity); + bool realloc, + const struct irq_affinity_desc *affinity); extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); extern int irq_domain_activate_irq(struct irq_data *irq_data, bool early); extern void irq_domain_deactivate_irq(struct irq_data *irq_data); diff --git a/include/linux/msi.h b/include/linux/msi.h index eb213b87617c..784fb52b9900 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -76,7 +76,7 @@ struct msi_desc { unsigned int nvec_used; struct device *dev; struct msi_msg msg; - struct cpumask *affinity; + struct irq_affinity_desc *affinity; union { /* PCI MSI/X specific data */ @@ -138,7 +138,7 @@ static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg) #endif /* CONFIG_PCI_MSI */ struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, - const struct cpumask *affinity); + const struct irq_affinity_desc *affinity); void free_msi_entry(struct msi_desc *entry); void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e423bff1928c..c0fe591b0dc9 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -99,7 +99,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, - struct cpumask *masks) + struct irq_affinity_desc *masks) { int n, nodes, cpus_per_vec, extra_vecs, done = 0; int last_affv = firstvec + numvecs; @@ -117,7 +117,9 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, */ if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]); + cpumask_or(&masks[curvec].mask, + &masks[curvec].mask, + node_to_cpumask[n]); if (++curvec == last_affv) curvec = firstvec; } @@ -150,7 +152,8 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, cpus_per_vec++; --extra_vecs; } - irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); + irq_spread_init_one(&masks[curvec].mask, nmsk, + cpus_per_vec); } done += v; @@ -173,7 +176,7 @@ out: static int irq_build_affinity_masks(const struct irq_affinity *affd, int startvec, int numvecs, int firstvec, cpumask_var_t *node_to_cpumask, - struct cpumask *masks) + struct irq_affinity_desc *masks) { int curvec = startvec, nr_present, nr_others; int ret = -ENOMEM; @@ -226,15 +229,15 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, * @nvecs: The total number of vectors * @affd: Description of the affinity requirements * - * Returns the masks pointer or NULL if allocation failed. + * Returns the irq_affinity_desc pointer or NULL if allocation failed. */ -struct cpumask * +struct irq_affinity_desc * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; int curvec, usedvecs; cpumask_var_t *node_to_cpumask; - struct cpumask *masks = NULL; + struct irq_affinity_desc *masks = NULL; int i, nr_sets; /* @@ -254,8 +257,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) - cpumask_copy(masks + curvec, irq_default_affinity); - + cpumask_copy(&masks[curvec].mask, irq_default_affinity); /* * Spread on present CPUs starting from affd->pre_vectors. If we * have multiple sets, build each sets affinity mask separately. @@ -285,7 +287,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) else curvec = affd->pre_vectors + usedvecs; for (; curvec < nvecs; curvec++) - cpumask_copy(masks + curvec, irq_default_affinity); + cpumask_copy(&masks[curvec].mask, irq_default_affinity); outnodemsk: free_node_to_cpumask(node_to_cpumask); diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 6a682c229e10..5d5378ea0afe 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -169,7 +169,7 @@ static void devm_irq_desc_release(struct device *dev, void *res) * @cnt: Number of consecutive irqs to allocate * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) - * @affinity: Optional pointer to an affinity mask array of size @cnt + * @affinity: Optional pointer to an irq_affinity_desc array of size @cnt * which hints where the irq descriptors should be allocated * and which default affinities to use * @@ -179,7 +179,7 @@ static void devm_irq_desc_release(struct device *dev, void *res) */ int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, - const struct cpumask *affinity) + const struct irq_affinity_desc *affinity) { struct irq_desc_devres *dr; int base; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 578d0e5f1b5b..cb401d6c5040 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -449,28 +449,29 @@ static void free_desc(unsigned int irq) } static int alloc_descs(unsigned int start, unsigned int cnt, int node, - const struct cpumask *affinity, struct module *owner) + const struct irq_affinity_desc *affinity, + struct module *owner) { - const struct cpumask *mask = NULL; struct irq_desc *desc; unsigned int flags; int i; /* Validate affinity mask(s) */ if (affinity) { - for (i = 0, mask = affinity; i < cnt; i++, mask++) { - if (cpumask_empty(mask)) + for (i = 0; i < cnt; i++) { + if (cpumask_empty(&affinity[i].mask)) return -EINVAL; } } flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0; - mask = NULL; for (i = 0; i < cnt; i++) { + const struct cpumask *mask = NULL; + if (affinity) { node = cpu_to_node(cpumask_first(affinity)); - mask = affinity; + mask = &affinity->mask; affinity++; } desc = alloc_desc(start + i, node, flags, mask, owner); @@ -575,7 +576,7 @@ static void free_desc(unsigned int irq) } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, - const struct cpumask *affinity, + const struct irq_affinity_desc *affinity, struct module *owner) { u32 i; @@ -705,7 +706,7 @@ EXPORT_SYMBOL_GPL(irq_free_descs); */ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner, const struct cpumask *affinity) + struct module *owner, const struct irq_affinity_desc *affinity) { int start, ret; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3366d11c3e02..8b0be4bd6565 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -969,7 +969,7 @@ const struct irq_domain_ops irq_domain_simple_ops = { EXPORT_SYMBOL_GPL(irq_domain_simple_ops); int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, - int node, const struct cpumask *affinity) + int node, const struct irq_affinity_desc *affinity) { unsigned int hint; @@ -1281,7 +1281,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, */ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, - bool realloc, const struct cpumask *affinity) + bool realloc, const struct irq_affinity_desc *affinity) { int i, ret, virq; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 4ca2fd46645d..ad26fbcfbfc8 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -23,11 +23,11 @@ * @nvec: The number of vectors used in this entry * @affinity: Optional pointer to an affinity mask array size of @nvec * - * If @affinity is not NULL then a an affinity array[@nvec] is allocated - * and the affinity masks from @affinity are copied. + * If @affinity is not NULL then an affinity array[@nvec] is allocated + * and the affinity masks and flags from @affinity are copied. */ -struct msi_desc * -alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity) +struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, + const struct irq_affinity_desc *affinity) { struct msi_desc *desc; -- cgit From c410abbbacb9b378365ba17a30df08b4b9eec64f Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Tue, 4 Dec 2018 23:51:21 +0800 Subject: genirq/affinity: Add is_managed to struct irq_affinity_desc Devices which use managed interrupts usually have two classes of interrupts: - Interrupts for multiple device queues - Interrupts for general device management Currently both classes are treated the same way, i.e. as managed interrupts. The general interrupts get the default affinity mask assigned while the device queue interrupts are spread out over the possible CPUs. Treating the general interrupts as managed is both a limitation and under certain circumstances a bug. Assume the following situation: default_irq_affinity = 4..7 So if CPUs 4-7 are offlined, then the core code will shut down the device management interrupts because the last CPU in their affinity mask went offline. It's also a limitation because it's desired to allow manual placement of the general device interrupts for various reasons. If they are marked managed then the interrupt affinity setting from both user and kernel space is disabled. That limitation was reported by Kashyap and Sumit. Expand struct irq_affinity_desc with a new bit 'is_managed' which is set for truly managed interrupts (queue interrupts) and cleared for the general device interrupts. [ tglx: Simplify code and massage changelog ] Reported-by: Kashyap Desai Reported-by: Sumit Saxena Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: linux-pci@vger.kernel.org Cc: shivasharan.srikanteshwara@broadcom.com Cc: ming.lei@redhat.com Cc: hch@lst.de Cc: bhelgaas@google.com Cc: douliyang1@huawei.com Link: https://lkml.kernel.org/r/20181204155122.6327-3-douliyangs@gmail.com --- include/linux/interrupt.h | 1 + kernel/irq/affinity.c | 4 ++++ kernel/irq/irqdesc.c | 13 ++++++++----- 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c44b7844dc83..c672f34235e7 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -263,6 +263,7 @@ struct irq_affinity { */ struct irq_affinity_desc { struct cpumask mask; + unsigned int is_managed : 1; }; #if defined(CONFIG_SMP) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index c0fe591b0dc9..45b68b4ea48b 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -289,6 +289,10 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) for (; curvec < nvecs; curvec++) cpumask_copy(&masks[curvec].mask, irq_default_affinity); + /* Mark the managed interrupts */ + for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++) + masks[i].is_managed = 1; + outnodemsk: free_node_to_cpumask(node_to_cpumask); return masks; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index cb401d6c5040..ee062b7939d3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -453,27 +453,30 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, struct module *owner) { struct irq_desc *desc; - unsigned int flags; int i; /* Validate affinity mask(s) */ if (affinity) { - for (i = 0; i < cnt; i++) { + for (i = 0; i < cnt; i++, i++) { if (cpumask_empty(&affinity[i].mask)) return -EINVAL; } } - flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0; - for (i = 0; i < cnt; i++) { const struct cpumask *mask = NULL; + unsigned int flags = 0; if (affinity) { - node = cpu_to_node(cpumask_first(affinity)); + if (affinity->is_managed) { + flags = IRQD_AFFINITY_MANAGED | + IRQD_MANAGED_SHUTDOWN; + } mask = &affinity->mask; + node = cpu_to_node(cpumask_first(mask)); affinity++; } + desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; -- cgit From d89b22d46a40da3a1630ecea111beaf3ef10bc21 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Dec 2018 11:30:30 +1100 Subject: cred: add cred_fscmp() for comparing creds. NFS needs to compare to credentials, to see if they can be treated the same w.r.t. filesystem access. Sometimes an ordering is needed when credentials are used as a key to an rbtree. NFS currently has its own private credential management from before 'struct cred' existed. To move it over to more consistent use of 'struct cred' we need a comparison function. This patch adds that function. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- include/linux/cred.h | 1 + kernel/cred.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) (limited to 'kernel') diff --git a/include/linux/cred.h b/include/linux/cred.h index 7eed6101c791..f1085767e1b3 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -169,6 +169,7 @@ extern int change_create_files_as(struct cred *, struct inode *); extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); +extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); /* diff --git a/kernel/cred.c b/kernel/cred.c index ecf03657e71c..0b3ac72bd717 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -19,6 +19,7 @@ #include #include #include +#include #if 0 #define kdebug(FMT, ...) \ @@ -564,6 +565,60 @@ void revert_creds(const struct cred *old) } EXPORT_SYMBOL(revert_creds); +/** + * cred_fscmp - Compare two credentials with respect to filesystem access. + * @a: The first credential + * @b: The second credential + * + * cred_cmp() will return zero if both credentials have the same + * fsuid, fsgid, and supplementary groups. That is, if they will both + * provide the same access to files based on mode/uid/gid. + * If the credentials are different, then either -1 or 1 will + * be returned depending on whether @a comes before or after @b + * respectively in an arbitrary, but stable, ordering of credentials. + * + * Return: -1, 0, or 1 depending on comparison + */ +int cred_fscmp(const struct cred *a, const struct cred *b) +{ + struct group_info *ga, *gb; + int g; + + if (a == b) + return 0; + if (uid_lt(a->fsuid, b->fsuid)) + return -1; + if (uid_gt(a->fsuid, b->fsuid)) + return 1; + + if (gid_lt(a->fsgid, b->fsgid)) + return -1; + if (gid_gt(a->fsgid, b->fsgid)) + return 1; + + ga = a->group_info; + gb = b->group_info; + if (ga == gb) + return 0; + if (ga == NULL) + return -1; + if (gb == NULL) + return 1; + if (ga->ngroups < gb->ngroups) + return -1; + if (ga->ngroups > gb->ngroups) + return 1; + + for (g = 0; g < ga->ngroups; g++) { + if (gid_lt(ga->gid[g], gb->gid[g])) + return -1; + if (gid_gt(ga->gid[g], gb->gid[g])) + return 1; + } + return 0; +} +EXPORT_SYMBOL(cred_fscmp); + /* * initialise the credentials stuff */ -- cgit From 97d0fb239c041f5f99655af74812c3ab75cc4346 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Dec 2018 11:30:30 +1100 Subject: cred: add get_cred_rcu() Sometimes we want to opportunistically get a ref to a cred in an rcu_read_lock protected section. get_task_cred() does this, and NFS does as similar thing with its own credential structures. To prepare for NFS converting to use 'struct cred' more uniformly, define get_cred_rcu(), and use it in get_task_cred(). Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- include/linux/cred.h | 11 +++++++++++ kernel/cred.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/cred.h b/include/linux/cred.h index f1085767e1b3..48979fcb95cf 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -252,6 +252,17 @@ static inline const struct cred *get_cred(const struct cred *cred) return get_new_cred(nonconst_cred); } +static inline const struct cred *get_cred_rcu(const struct cred *cred) +{ + struct cred *nonconst_cred = (struct cred *) cred; + if (!cred) + return NULL; + if (!atomic_inc_not_zero(&nonconst_cred->usage)) + return NULL; + validate_creds(cred); + return cred; +} + /** * put_cred - Release a reference to a set of credentials * @cred: The credentials to release diff --git a/kernel/cred.c b/kernel/cred.c index 0b3ac72bd717..ba60162249e8 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -195,7 +195,7 @@ const struct cred *get_task_cred(struct task_struct *task) do { cred = __task_cred((task)); BUG_ON(!cred); - } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); + } while (!get_cred_rcu(cred)); rcu_read_unlock(); return cred; -- cgit From a6d8e7637faac93f362b694311bff64d5a8c701e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Dec 2018 11:30:30 +1100 Subject: cred: export get_task_cred(). There is no reason that modules should not be able to use this, and NFS will need it when converted to use 'struct cred'. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- kernel/cred.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index ba60162249e8..21f4a97085b4 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -200,6 +200,7 @@ const struct cred *get_task_cred(struct task_struct *task) rcu_read_unlock(); return cred; } +EXPORT_SYMBOL(get_task_cred); /* * Allocate blank credentials, such that the credentials can be filled in at a -- cgit From fdbaa0beb78b7c8847e261fe2c32816e9d1c54cc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 19 Dec 2018 13:01:01 -0800 Subject: bpf: Ensure line_info.insn_off cannot point to insn with zero code This patch rejects a line_info if the bpf insn code referred by line_info.insn_off is 0. F.e. a broken userspace tool might generate a line_info.insn_off that points to the second 8 bytes of a BPF_LD_IMM64. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e0e77ffeefb8..5c64281d566e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4980,6 +4980,14 @@ static int check_btf_line(struct bpf_verifier_env *env, goto err_free; } + if (!prog->insnsi[linfo[i].insn_off].code) { + verbose(env, + "Invalid insn code at line_info[%u].insn_off\n", + i); + err = -EINVAL; + goto err_free; + } + if (!btf_name_by_offset(btf, linfo[i].line_off) || !btf_name_by_offset(btf, linfo[i].file_name_off)) { verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); -- cgit From 518a2f1925c3165befbf06b75e07636549d92c1c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Dec 2018 09:00:40 +0100 Subject: dma-mapping: zero memory returned from dma_alloc_* If we want to map memory from the DMA allocator to userspace it must be zeroed at allocation time to prevent stale data leaks. We already do this on most common architectures, but some architectures don't do this yet, fix them up, either by passing GFP_ZERO when we use the normal page allocator or doing a manual memset otherwise. Signed-off-by: Christoph Hellwig Acked-by: Geert Uytterhoeven [m68k] Acked-by: Sam Ravnborg [sparc] --- arch/alpha/kernel/pci_iommu.c | 2 +- arch/arc/mm/dma.c | 2 +- arch/c6x/mm/dma-coherent.c | 5 ++++- arch/m68k/kernel/dma.c | 2 +- arch/microblaze/mm/consistent.c | 2 +- arch/openrisc/kernel/dma.c | 2 +- arch/parisc/kernel/pci-dma.c | 4 ++-- arch/s390/pci/pci_dma.c | 2 +- arch/sparc/kernel/ioport.c | 2 +- arch/sparc/mm/io-unit.c | 2 +- arch/sparc/mm/iommu.c | 2 +- arch/xtensa/kernel/pci-dma.c | 2 +- drivers/misc/mic/host/mic_boot.c | 2 +- kernel/dma/virt.c | 2 +- 14 files changed, 18 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index e1716e0d92fd..aa0f50d0f823 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -443,7 +443,7 @@ static void *alpha_pci_alloc_coherent(struct device *dev, size_t size, gfp &= ~GFP_DMA; try_again: - cpu_addr = (void *)__get_free_pages(gfp, order); + cpu_addr = (void *)__get_free_pages(gfp | __GFP_ZERO, order); if (! cpu_addr) { printk(KERN_INFO "pci_alloc_consistent: " "get_free_pages failed from %pf\n", diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index db203ff69ccf..1525ac00fd02 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -33,7 +33,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, */ BUG_ON(gfp & __GFP_HIGHMEM); - page = alloc_pages(gfp, order); + page = alloc_pages(gfp | __GFP_ZERO, order); if (!page) return NULL; diff --git a/arch/c6x/mm/dma-coherent.c b/arch/c6x/mm/dma-coherent.c index 01305c787201..75b79571732c 100644 --- a/arch/c6x/mm/dma-coherent.c +++ b/arch/c6x/mm/dma-coherent.c @@ -78,6 +78,7 @@ static void __free_dma_pages(u32 addr, int order) void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, unsigned long attrs) { + void *ret; u32 paddr; int order; @@ -94,7 +95,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, if (!paddr) return NULL; - return phys_to_virt(paddr); + ret = phys_to_virt(paddr); + memset(ret, 0, 1 << order); + return ret; } /* diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c index e99993c57d6b..b4aa853051bd 100644 --- a/arch/m68k/kernel/dma.c +++ b/arch/m68k/kernel/dma.c @@ -32,7 +32,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, size = PAGE_ALIGN(size); order = get_order(size); - page = alloc_pages(flag, order); + page = alloc_pages(flag | __GFP_ZERO, order); if (!page) return NULL; diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index 45e0a1aa9357..3002cbca3059 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -81,7 +81,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, size = PAGE_ALIGN(size); order = get_order(size); - vaddr = __get_free_pages(gfp, order); + vaddr = __get_free_pages(gfp | __GFP_ZERO, order); if (!vaddr) return NULL; diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index 159336adfa2f..f79457cb3741 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -89,7 +89,7 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, .mm = &init_mm }; - page = alloc_pages_exact(size, gfp); + page = alloc_pages_exact(size, gfp | __GFP_ZERO); if (!page) return NULL; diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index 04c48f1ef3fb..239162355b58 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -404,7 +404,7 @@ static void *pcxl_dma_alloc(struct device *dev, size_t size, order = get_order(size); size = 1 << (order + PAGE_SHIFT); vaddr = pcxl_alloc_range(size); - paddr = __get_free_pages(flag, order); + paddr = __get_free_pages(flag | __GFP_ZERO, order); flush_kernel_dcache_range(paddr, size); paddr = __pa(paddr); map_uncached_pages(vaddr, size, paddr); @@ -429,7 +429,7 @@ static void *pcx_dma_alloc(struct device *dev, size_t size, if ((attrs & DMA_ATTR_NON_CONSISTENT) == 0) return NULL; - addr = (void *)__get_free_pages(flag, get_order(size)); + addr = (void *)__get_free_pages(flag | __GFP_ZERO, get_order(size)); if (addr) *dma_handle = (dma_addr_t)virt_to_phys(addr); diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 346ba382193a..9e52d1527f71 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -404,7 +404,7 @@ static void *s390_dma_alloc(struct device *dev, size_t size, dma_addr_t map; size = PAGE_ALIGN(size); - page = alloc_pages(flag, get_order(size)); + page = alloc_pages(flag | __GFP_ZERO, get_order(size)); if (!page) return NULL; diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index baa235652c27..f89603855f1e 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -325,7 +325,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, return NULL; size = PAGE_ALIGN(size); - va = (void *) __get_free_pages(gfp, get_order(size)); + va = (void *) __get_free_pages(gfp | __GFP_ZERO, get_order(size)); if (!va) { printk("%s: no %zd pages\n", __func__, size >> PAGE_SHIFT); return NULL; diff --git a/arch/sparc/mm/io-unit.c b/arch/sparc/mm/io-unit.c index 91be13935d40..f770ee7229d8 100644 --- a/arch/sparc/mm/io-unit.c +++ b/arch/sparc/mm/io-unit.c @@ -224,7 +224,7 @@ static void *iounit_alloc(struct device *dev, size_t len, return NULL; len = PAGE_ALIGN(len); - va = __get_free_pages(gfp, get_order(len)); + va = __get_free_pages(gfp | __GFP_ZERO, get_order(len)); if (!va) return NULL; diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index fb771a634452..e8d5d73ca40d 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -344,7 +344,7 @@ static void *sbus_iommu_alloc(struct device *dev, size_t len, return NULL; len = PAGE_ALIGN(len); - va = __get_free_pages(gfp, get_order(len)); + va = __get_free_pages(gfp | __GFP_ZERO, get_order(len)); if (va == 0) return NULL; diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index 1fc138b6bc0a..9171bff76fc4 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -160,7 +160,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, flag & __GFP_NOWARN); if (!page) - page = alloc_pages(flag, get_order(size)); + page = alloc_pages(flag | __GFP_ZERO, get_order(size)); if (!page) return NULL; diff --git a/drivers/misc/mic/host/mic_boot.c b/drivers/misc/mic/host/mic_boot.c index c327985c9523..6479435ac96b 100644 --- a/drivers/misc/mic/host/mic_boot.c +++ b/drivers/misc/mic/host/mic_boot.c @@ -149,7 +149,7 @@ static void *__mic_dma_alloc(struct device *dev, size_t size, struct scif_hw_dev *scdev = dev_get_drvdata(dev); struct mic_device *mdev = scdev_to_mdev(scdev); dma_addr_t tmp; - void *va = kmalloc(size, gfp); + void *va = kmalloc(size, gfp | __GFP_ZERO); if (va) { tmp = mic_map_single(mdev, va, size); diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c index 631ddec4b60a..ebe128833af7 100644 --- a/kernel/dma/virt.c +++ b/kernel/dma/virt.c @@ -13,7 +13,7 @@ static void *dma_virt_alloc(struct device *dev, size_t size, { void *ret; - ret = (void *)__get_free_pages(gfp, get_order(size)); + ret = (void *)__get_free_pages(gfp | __GFP_ZERO, get_order(size)); if (ret) *dma_handle = (uintptr_t)ret; return ret; -- cgit From 960ea056561a08e2b837b2f02d22c53226414a84 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 19 Dec 2018 22:13:04 -0800 Subject: bpf: verifier: teach the verifier to reason about the BPF_JSET instruction Some JITs (nfp) try to optimize code on their own. It could make sense in case of BPF_JSET instruction which is currently not interpreted by the verifier, meaning for instance that dead could would not be detected if it was under BPF_JSET branch. Teach the verifier basics of BPF_JSET, JIT optimizations will be removed shortly. Signed-off-by: Jakub Kicinski Reviewed-by: Jiong Wang Acked-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c64281d566e..98ed27bbd045 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3859,6 +3859,12 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) if (tnum_is_const(reg->var_off)) return !tnum_equals_const(reg->var_off, val); break; + case BPF_JSET: + if ((~reg->var_off.mask & reg->var_off.value) & val) + return 1; + if (!((reg->var_off.mask | reg->var_off.value) & val)) + return 0; + break; case BPF_JGT: if (reg->umin_value > val) return 1; @@ -3943,6 +3949,13 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, */ __mark_reg_known(false_reg, val); break; + case BPF_JSET: + false_reg->var_off = tnum_and(false_reg->var_off, + tnum_const(~val)); + if (is_power_of_2(val)) + true_reg->var_off = tnum_or(true_reg->var_off, + tnum_const(val)); + break; case BPF_JGT: false_reg->umax_value = min(false_reg->umax_value, val); true_reg->umin_value = max(true_reg->umin_value, val + 1); @@ -4015,6 +4028,13 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, */ __mark_reg_known(false_reg, val); break; + case BPF_JSET: + false_reg->var_off = tnum_and(false_reg->var_off, + tnum_const(~val)); + if (is_power_of_2(val)) + true_reg->var_off = tnum_or(true_reg->var_off, + tnum_const(val)); + break; case BPF_JGT: true_reg->umax_value = min(true_reg->umax_value, val - 1); false_reg->umin_value = max(false_reg->umin_value, val); -- cgit From 9b38c4056b2736bb5902e8b0911832db666fd19b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 19 Dec 2018 22:13:06 -0800 Subject: bpf: verifier: reorder stack size check with dead code sanitization Reorder the calls to check_max_stack_depth() and sanitize_dead_code() to separate functions which can rewrite instructions from pure checks. No functional changes. Signed-off-by: Jakub Kicinski Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98ed27bbd045..d27d5a880015 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6983,10 +6983,11 @@ skip_full_check: free_states(env); if (ret == 0) - sanitize_dead_code(env); + ret = check_max_stack_depth(env); + /* instruction rewrites happen after this point */ if (ret == 0) - ret = check_max_stack_depth(env); + sanitize_dead_code(env); if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ -- cgit From 8b1cce9f5832a8eda17d37a3c49fb7dd2d650f46 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 20 Dec 2018 17:35:47 +0100 Subject: dma-mapping: fix inverted logic in dma_supported The cleanup in commit 356da6d0cde3 ("dma-mapping: bypass indirect calls for dma-direct") accidentally inverted the logic in the check for the presence of a ->dma_supported() callback. Switch this back to the way it was to prevent a crash on boot. Fixes: 356da6d0cde3 ("dma-mapping: bypass indirect calls for dma-direct") Signed-off-by: Thierry Reding Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index fc84c81029d9..d7c34d2d1ba5 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -406,7 +406,7 @@ int dma_supported(struct device *dev, u64 mask) if (dma_is_direct(ops)) return dma_direct_supported(dev, mask); - if (ops->dma_supported) + if (!ops->dma_supported) return 1; return ops->dma_supported(dev, mask); } -- cgit From 77ea5f4cbe2084db9ab021ba73fb7eadf1610884 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 19 Dec 2018 17:00:23 +0100 Subject: bpf/cpumap: make sure frame_size for build_skb is aligned if headroom isn't The frame_size passed to build_skb must be aligned, else it is possible that the embedded struct skb_shared_info gets unaligned. For correctness make sure that xdpf->headroom in included in the alignment. No upstream drivers can hit this, as all XDP drivers provide an aligned headroom. This was discovered when playing with implementing XDP support for mvneta, which have a 2 bytes DSA header, and this Marvell ARM64 platform didn't like doing atomic operations on an unaligned skb_shinfo(skb)->dataref addresses. Fixes: 1c601d829ab0 ("bpf: cpumap xdp_buff to skb conversion and allocation") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 24aac0d0f412..8974b3755670 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -183,7 +183,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. */ - frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom + + frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); pkt_data_start = xdpf->data - xdpf->headroom; -- cgit From 5eed6f1dff87bfb5e545935def3843edf42800f2 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 21 Dec 2018 14:30:54 -0800 Subject: fork,memcg: fix crash in free_thread_stack on memcg charge fail Commit 9b6f7e163cd0 ("mm: rework memcg kernel stack accounting") will result in fork failing if allocating a kernel stack for a task in dup_task_struct exceeds the kernel memory allowance for that cgroup. Unfortunately, it also results in a crash. This is due to the code jumping to free_stack and calling free_thread_stack when the memcg kernel stack charge fails, but without tsk->stack pointing at the freshly allocated stack. This in turn results in the vfree_atomic in free_thread_stack oopsing with a backtrace like this: #5 [ffffc900244efc88] die at ffffffff8101f0ab #6 [ffffc900244efcb8] do_general_protection at ffffffff8101cb86 #7 [ffffc900244efce0] general_protection at ffffffff818ff082 [exception RIP: llist_add_batch+7] RIP: ffffffff8150d487 RSP: ffffc900244efd98 RFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff88085ef55980 RCX: 0000000000000000 RDX: ffff88085ef55980 RSI: 343834343531203a RDI: 343834343531203a RBP: ffffc900244efd98 R8: 0000000000000001 R9: ffff8808578c3600 R10: 0000000000000000 R11: 0000000000000001 R12: ffff88029f6c21c0 R13: 0000000000000286 R14: ffff880147759b00 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #8 [ffffc900244efda0] vfree_atomic at ffffffff811df2c7 #9 [ffffc900244efdb8] copy_process at ffffffff81086e37 #10 [ffffc900244efe98] _do_fork at ffffffff810884e0 #11 [ffffc900244eff10] sys_vfork at ffffffff810887ff #12 [ffffc900244eff20] do_syscall_64 at ffffffff81002a43 RIP: 000000000049b948 RSP: 00007ffcdb307830 RFLAGS: 00000246 RAX: ffffffffffffffda RBX: 0000000000896030 RCX: 000000000049b948 RDX: 0000000000000000 RSI: 00007ffcdb307790 RDI: 00000000005d7421 RBP: 000000000067370f R8: 00007ffcdb3077b0 R9: 000000000001ed00 R10: 0000000000000008 R11: 0000000000000246 R12: 0000000000000040 R13: 000000000000000f R14: 0000000000000000 R15: 000000000088d018 ORIG_RAX: 000000000000003a CS: 0033 SS: 002b The simplest fix is to assign tsk->stack right where it is allocated. Link: http://lkml.kernel.org/r/20181214231726.7ee4843c@imladris.surriel.com Fixes: 9b6f7e163cd0 ("mm: rework memcg kernel stack accounting") Signed-off-by: Rik van Riel Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Shakeel Butt Cc: Johannes Weiner Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 07cddff89c7b..e2a5156bc9c3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -240,8 +240,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - if (stack) + if (stack) { tsk->stack_vm_area = find_vm_area(stack); + tsk->stack = stack; + } return stack; #else struct page *page = alloc_pages_node(node, THREADINFO_GFP, @@ -288,7 +290,10 @@ static struct kmem_cache *thread_stack_cache; static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { - return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); + unsigned long *stack; + stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); + tsk->stack = stack; + return stack; } static void free_thread_stack(struct task_struct *tsk) -- cgit From e8d086ddb5339d72c60e6c7b8d28810f26960f9a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 18 Dec 2018 15:50:02 -0500 Subject: tracing: Fix ftrace_graph_get_ret_stack() to use task and not current The function ftrace_graph_get_ret_stack() takes a task struct descriptor but uses current as the task to perform the operations on. In pretty much all cases the task decriptor is the same as current, so this wasn't an issue. But there is a case in the ARM architecture that passes in a task that is not current, and expects a result from that task, and this code breaks it. Fixes: 51584396cff5 ("arm64: Use ftrace_graph_get_ret_stack() instead of curr_ret_stack") Reported-by: James Morse Tested-by: James Morse Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index d4f04f0ca646..8dfd5021b933 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -246,10 +246,10 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) struct ftrace_ret_stack * ftrace_graph_get_ret_stack(struct task_struct *task, int idx) { - idx = current->curr_ret_stack - idx; + idx = task->curr_ret_stack - idx; if (idx >= 0 && idx <= task->curr_ret_stack) - return ¤t->ret_stack[idx]; + return &task->ret_stack[idx]; return NULL; } -- cgit From 6801f0d5ca00bed159c7f87870cc6f5411837544 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 18 Dec 2018 14:33:20 -0600 Subject: tracing: Remove unnecessary hist trigger struct field hist_field.var_idx is completely unused, so remove it. Link: http://lkml.kernel.org/r/d4e066c0f509f5f13ad3babc8c33ca6e7ddc439a.1545161087.git.tom.zanussi@linux.intel.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 82e72c48a5a9..d29bf8a8e1dd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -61,7 +61,6 @@ struct hist_field { char *system; char *event_name; char *name; - unsigned int var_idx; unsigned int var_ref_idx; bool read_once; }; -- cgit From 2f31ed9308cc9e95c149078b276a47360ab507bb Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 18 Dec 2018 14:33:21 -0600 Subject: tracing: Change strlen to sizeof for hist trigger static strings There's no need to use strlen() for static strings when the length is already known, so update trace_events_hist.c with sizeof() for those cases. Link: http://lkml.kernel.org/r/e3e754f2bd18e56eaa8baf79bee619316ebf4cfc.1545161087.git.tom.zanussi@linux.intel.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index d29bf8a8e1dd..25d06b3ae1f6 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -507,7 +507,7 @@ static int synth_field_string_size(char *type) start = strstr(type, "char["); if (start == NULL) return -EINVAL; - start += strlen("char["); + start += sizeof("char[") - 1; end = strchr(type, ']'); if (!end || end < start) @@ -1843,8 +1843,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) if (attrs->n_actions >= HIST_ACTIONS_MAX) return ret; - if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) || - (strncmp(str, "onmax(", strlen("onmax(")) == 0)) { + if ((strncmp(str, "onmatch(", sizeof("onmatch(") - 1) == 0) || + (strncmp(str, "onmax(", sizeof("onmax(") - 1) == 0)) { attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); if (!attrs->action_str[attrs->n_actions]) { ret = -ENOMEM; @@ -1861,34 +1861,34 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) { int ret = 0; - if ((strncmp(str, "key=", strlen("key=")) == 0) || - (strncmp(str, "keys=", strlen("keys=")) == 0)) { + if ((strncmp(str, "key=", sizeof("key=") - 1) == 0) || + (strncmp(str, "keys=", sizeof("keys=") - 1) == 0)) { attrs->keys_str = kstrdup(str, GFP_KERNEL); if (!attrs->keys_str) { ret = -ENOMEM; goto out; } - } else if ((strncmp(str, "val=", strlen("val=")) == 0) || - (strncmp(str, "vals=", strlen("vals=")) == 0) || - (strncmp(str, "values=", strlen("values=")) == 0)) { + } else if ((strncmp(str, "val=", sizeof("val=") - 1) == 0) || + (strncmp(str, "vals=", sizeof("vals=") - 1) == 0) || + (strncmp(str, "values=", sizeof("values=") - 1) == 0)) { attrs->vals_str = kstrdup(str, GFP_KERNEL); if (!attrs->vals_str) { ret = -ENOMEM; goto out; } - } else if (strncmp(str, "sort=", strlen("sort=")) == 0) { + } else if (strncmp(str, "sort=", sizeof("sort=") - 1) == 0) { attrs->sort_key_str = kstrdup(str, GFP_KERNEL); if (!attrs->sort_key_str) { ret = -ENOMEM; goto out; } - } else if (strncmp(str, "name=", strlen("name=")) == 0) { + } else if (strncmp(str, "name=", sizeof("name=") - 1) == 0) { attrs->name = kstrdup(str, GFP_KERNEL); if (!attrs->name) { ret = -ENOMEM; goto out; } - } else if (strncmp(str, "clock=", strlen("clock=")) == 0) { + } else if (strncmp(str, "clock=", sizeof("clock=") - 1) == 0) { strsep(&str, "="); if (!str) { ret = -EINVAL; @@ -1901,7 +1901,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) ret = -ENOMEM; goto out; } - } else if (strncmp(str, "size=", strlen("size=")) == 0) { + } else if (strncmp(str, "size=", sizeof("size=") - 1) == 0) { int map_bits = parse_map_size(str); if (map_bits < 0) { @@ -3497,7 +3497,7 @@ static struct action_data *onmax_parse(char *str) if (!onmax_fn_name || !str) goto free; - if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) { + if (strncmp(onmax_fn_name, "save", sizeof("save") - 1) == 0) { char *params = strsep(&str, ")"); if (!params) { @@ -4302,8 +4302,8 @@ static int parse_actions(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->attrs->n_actions; i++) { str = hist_data->attrs->action_str[i]; - if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) { - char *action_str = str + strlen("onmatch("); + if (strncmp(str, "onmatch(", sizeof("onmatch(") - 1) == 0) { + char *action_str = str + sizeof("onmatch(") - 1; data = onmatch_parse(tr, action_str); if (IS_ERR(data)) { @@ -4311,8 +4311,8 @@ static int parse_actions(struct hist_trigger_data *hist_data) break; } data->fn = action_trace; - } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) { - char *action_str = str + strlen("onmax("); + } else if (strncmp(str, "onmax(", sizeof("onmax(") - 1) == 0) { + char *action_str = str + sizeof("onmax(") - 1; data = onmax_parse(action_str); if (IS_ERR(data)) { @@ -5548,9 +5548,9 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, p++; continue; } - if (p >= param + strlen(param) - strlen("if") - 1) + if (p >= param + strlen(param) - (sizeof("if") - 1) - 1) return -EINVAL; - if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') { + if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') { p++; continue; } -- cgit From e4f6d245031e04bdd12db390298acec0474a1a46 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 19 Dec 2018 13:09:16 -0600 Subject: tracing: Use var_refs[] for hist trigger reference checking Since all the variable reference hist_fields are collected into hist_data->var_refs[] array, there's no need to go through all the fields looking for them, or in separate arrays like synth_var_refs[], which will be going away soon anyway. This also allows us to get rid of some unnecessary code and functions currently used for the same purpose. Link: http://lkml.kernel.org/r/1545246556.4239.7.camel@gmail.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 68 +++++++--------------------------------- 1 file changed, 11 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 25d06b3ae1f6..f3caf6e484f4 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1297,75 +1297,29 @@ check_field_for_var_ref(struct hist_field *hist_field, struct hist_trigger_data *var_data, unsigned int var_idx) { - struct hist_field *found = NULL; - - if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) { - if (hist_field->var.idx == var_idx && - hist_field->var.hist_data == var_data) { - found = hist_field; - } - } - - return found; -} - -static struct hist_field * -check_field_for_var_refs(struct hist_trigger_data *hist_data, - struct hist_field *hist_field, - struct hist_trigger_data *var_data, - unsigned int var_idx, - unsigned int level) -{ - struct hist_field *found = NULL; - unsigned int i; - - if (level > 3) - return found; - - if (!hist_field) - return found; - - found = check_field_for_var_ref(hist_field, var_data, var_idx); - if (found) - return found; - - for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) { - struct hist_field *operand; + WARN_ON(!(hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF)); - operand = hist_field->operands[i]; - found = check_field_for_var_refs(hist_data, operand, var_data, - var_idx, level + 1); - if (found) - return found; - } + if (hist_field && hist_field->var.idx == var_idx && + hist_field->var.hist_data == var_data) + return hist_field; - return found; + return NULL; } static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data, struct hist_trigger_data *var_data, unsigned int var_idx) { - struct hist_field *hist_field, *found = NULL; + struct hist_field *hist_field; unsigned int i; - for_each_hist_field(i, hist_data) { - hist_field = hist_data->fields[i]; - found = check_field_for_var_refs(hist_data, hist_field, - var_data, var_idx, 0); - if (found) - return found; - } - - for (i = 0; i < hist_data->n_synth_var_refs; i++) { - hist_field = hist_data->synth_var_refs[i]; - found = check_field_for_var_refs(hist_data, hist_field, - var_data, var_idx, 0); - if (found) - return found; + for (i = 0; i < hist_data->n_var_refs; i++) { + hist_field = hist_data->var_refs[i]; + if (check_field_for_var_ref(hist_field, var_data, var_idx)) + return hist_field; } - return found; + return NULL; } static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data, -- cgit From de40f033d4e84e843d6a12266e3869015ea9097c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 18 Dec 2018 14:33:23 -0600 Subject: tracing: Remove open-coding of hist trigger var_ref management Have create_var_ref() manage the hist trigger's var_ref list, rather than having similar code doing it in multiple places. This cleans up the code and makes sure var_refs are always accounted properly. Also, document the var_ref-related functions to make what their purpose clearer. Link: http://lkml.kernel.org/r/05ddae93ff514e66fc03897d6665231892939913.1545161087.git.tom.zanussi@linux.intel.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 93 ++++++++++++++++++++++++++++++++-------- 1 file changed, 75 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f3caf6e484f4..6309f4dbfb9c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1292,6 +1292,17 @@ static u64 hist_field_cpu(struct hist_field *hist_field, return cpu; } +/** + * check_field_for_var_ref - Check if a VAR_REF field references a variable + * @hist_field: The VAR_REF field to check + * @var_data: The hist trigger that owns the variable + * @var_idx: The trigger variable identifier + * + * Check the given VAR_REF field to see whether or not it references + * the given variable associated with the given trigger. + * + * Return: The VAR_REF field if it does reference the variable, NULL if not + */ static struct hist_field * check_field_for_var_ref(struct hist_field *hist_field, struct hist_trigger_data *var_data, @@ -1306,6 +1317,18 @@ check_field_for_var_ref(struct hist_field *hist_field, return NULL; } +/** + * find_var_ref - Check if a trigger has a reference to a trigger variable + * @hist_data: The hist trigger that might have a reference to the variable + * @var_data: The hist trigger that owns the variable + * @var_idx: The trigger variable identifier + * + * Check the list of var_refs[] on the first hist trigger to see + * whether any of them are references to the variable on the second + * trigger. + * + * Return: The VAR_REF field referencing the variable if so, NULL if not + */ static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data, struct hist_trigger_data *var_data, unsigned int var_idx) @@ -1322,6 +1345,20 @@ static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data, return NULL; } +/** + * find_any_var_ref - Check if there is a reference to a given trigger variable + * @hist_data: The hist trigger + * @var_idx: The trigger variable identifier + * + * Check to see whether the given variable is currently referenced by + * any other trigger. + * + * The trigger the variable is defined on is explicitly excluded - the + * assumption being that a self-reference doesn't prevent a trigger + * from being removed. + * + * Return: The VAR_REF field referencing the variable if so, NULL if not + */ static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data, unsigned int var_idx) { @@ -1340,6 +1377,19 @@ static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data, return found; } +/** + * check_var_refs - Check if there is a reference to any of trigger's variables + * @hist_data: The hist trigger + * + * A trigger can define one or more variables. If any one of them is + * currently referenced by any other trigger, this function will + * determine that. + + * Typically used to determine whether or not a trigger can be removed + * - if there are any references to a trigger's variables, it cannot. + * + * Return: True if there is a reference to any of trigger's variables + */ static bool check_var_refs(struct hist_trigger_data *hist_data) { struct hist_field *field; @@ -2343,7 +2393,23 @@ static int init_var_ref(struct hist_field *ref_field, goto out; } -static struct hist_field *create_var_ref(struct hist_field *var_field, +/** + * create_var_ref - Create a variable reference and attach it to trigger + * @hist_data: The trigger that will be referencing the variable + * @var_field: The VAR field to create a reference to + * @system: The optional system string + * @event_name: The optional event_name string + * + * Given a variable hist_field, create a VAR_REF hist_field that + * represents a reference to it. + * + * This function also adds the reference to the trigger that + * now references the variable. + * + * Return: The VAR_REF field if successful, NULL if not + */ +static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data, + struct hist_field *var_field, char *system, char *event_name) { unsigned long flags = HIST_FIELD_FL_VAR_REF; @@ -2355,6 +2421,9 @@ static struct hist_field *create_var_ref(struct hist_field *var_field, destroy_hist_field(ref_field, 0); return NULL; } + + hist_data->var_refs[hist_data->n_var_refs] = ref_field; + ref_field->var_ref_idx = hist_data->n_var_refs++; } return ref_field; @@ -2428,7 +2497,8 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, var_field = find_event_var(hist_data, system, event_name, var_name); if (var_field) - ref_field = create_var_ref(var_field, system, event_name); + ref_field = create_var_ref(hist_data, var_field, + system, event_name); if (!ref_field) hist_err_event("Couldn't find variable: $", @@ -2546,8 +2616,6 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, if (!s) { hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); if (hist_field) { - hist_data->var_refs[hist_data->n_var_refs] = hist_field; - hist_field->var_ref_idx = hist_data->n_var_refs++; if (var_name) { hist_field = create_alias(hist_data, hist_field, var_name); if (!hist_field) { @@ -3321,7 +3389,6 @@ static int onmax_create(struct hist_trigger_data *hist_data, unsigned int var_ref_idx = hist_data->n_var_refs; struct field_var *field_var; char *onmax_var_str, *param; - unsigned long flags; unsigned int i; int ret = 0; @@ -3338,18 +3405,10 @@ static int onmax_create(struct hist_trigger_data *hist_data, return -EINVAL; } - flags = HIST_FIELD_FL_VAR_REF; - ref_field = create_hist_field(hist_data, NULL, flags, NULL); + ref_field = create_var_ref(hist_data, var_field, NULL, NULL); if (!ref_field) return -ENOMEM; - if (init_var_ref(ref_field, var_field, NULL, NULL)) { - destroy_hist_field(ref_field, 0); - ret = -ENOMEM; - goto out; - } - hist_data->var_refs[hist_data->n_var_refs] = ref_field; - ref_field->var_ref_idx = hist_data->n_var_refs++; data->onmax.var = ref_field; data->fn = onmax_save; @@ -3538,9 +3597,6 @@ static void save_synth_var_ref(struct hist_trigger_data *hist_data, struct hist_field *var_ref) { hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref; - - hist_data->var_refs[hist_data->n_var_refs] = var_ref; - var_ref->var_ref_idx = hist_data->n_var_refs++; } static int check_synth_field(struct synth_event *event, @@ -3694,7 +3750,8 @@ static int onmatch_create(struct hist_trigger_data *hist_data, } if (check_synth_field(event, hist_field, field_pos) == 0) { - var_ref = create_var_ref(hist_field, system, event_name); + var_ref = create_var_ref(hist_data, hist_field, + system, event_name); if (!var_ref) { kfree(p); ret = -ENOMEM; -- cgit From 656fe2ba85e81d00e4447bf77b8da2be3c47acb2 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 18 Dec 2018 14:33:24 -0600 Subject: tracing: Use hist trigger's var_ref array to destroy var_refs Since every var ref for a trigger has an entry in the var_ref[] array, use that to destroy the var_refs, instead of piecemeal via the field expressions. This allows us to avoid having to keep and treat differently separate lists for the action-related references, which future patches will remove. Link: http://lkml.kernel.org/r/fad1a164f0e257c158e70d6eadbf6c586e04b2a2.1545161087.git.tom.zanussi@linux.intel.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6309f4dbfb9c..923c572ee68d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2190,6 +2190,15 @@ static int contains_operator(char *str) return field_op; } +static void __destroy_hist_field(struct hist_field *hist_field) +{ + kfree(hist_field->var.name); + kfree(hist_field->name); + kfree(hist_field->type); + + kfree(hist_field); +} + static void destroy_hist_field(struct hist_field *hist_field, unsigned int level) { @@ -2201,14 +2210,13 @@ static void destroy_hist_field(struct hist_field *hist_field, if (!hist_field) return; + if (hist_field->flags & HIST_FIELD_FL_VAR_REF) + return; /* var refs will be destroyed separately */ + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) destroy_hist_field(hist_field->operands[i], level + 1); - kfree(hist_field->var.name); - kfree(hist_field->name); - kfree(hist_field->type); - - kfree(hist_field); + __destroy_hist_field(hist_field); } static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, @@ -2335,6 +2343,12 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data) hist_data->fields[i] = NULL; } } + + for (i = 0; i < hist_data->n_var_refs; i++) { + WARN_ON(!(hist_data->var_refs[i]->flags & HIST_FIELD_FL_VAR_REF)); + __destroy_hist_field(hist_data->var_refs[i]); + hist_data->var_refs[i] = NULL; + } } static int init_var_ref(struct hist_field *ref_field, -- cgit From 912201345f7c39e6b0ac283207be2b6641fa47b9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 18 Dec 2018 14:33:25 -0600 Subject: tracing: Remove hist trigger synth_var_refs All var_refs are now handled uniformly and there's no reason to treat the synth_refs in a special way now, so remove them and associated functions. Link: http://lkml.kernel.org/r/b4d3470526b8f0426dcec125399dad9ad9b8589d.1545161087.git.tom.zanussi@linux.intel.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 923c572ee68d..1b1aee944588 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -279,8 +279,6 @@ struct hist_trigger_data { struct action_data *actions[HIST_ACTIONS_MAX]; unsigned int n_actions; - struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX]; - unsigned int n_synth_var_refs; struct field_var *field_vars[SYNTH_FIELDS_MAX]; unsigned int n_field_vars; unsigned int n_field_var_str; @@ -3599,20 +3597,6 @@ static void save_field_var(struct hist_trigger_data *hist_data, } -static void destroy_synth_var_refs(struct hist_trigger_data *hist_data) -{ - unsigned int i; - - for (i = 0; i < hist_data->n_synth_var_refs; i++) - destroy_hist_field(hist_data->synth_var_refs[i], 0); -} - -static void save_synth_var_ref(struct hist_trigger_data *hist_data, - struct hist_field *var_ref) -{ - hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref; -} - static int check_synth_field(struct synth_event *event, struct hist_field *hist_field, unsigned int field_pos) @@ -3772,7 +3756,6 @@ static int onmatch_create(struct hist_trigger_data *hist_data, goto err; } - save_synth_var_ref(hist_data, var_ref); field_pos++; kfree(p); continue; @@ -4516,7 +4499,6 @@ static void destroy_hist_data(struct hist_trigger_data *hist_data) destroy_actions(hist_data); destroy_field_vars(hist_data); destroy_field_var_hists(hist_data); - destroy_synth_var_refs(hist_data); kfree(hist_data); } -- cgit From 05ddb25cb31491b0901d057c40ad50d01b3db783 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 18 Dec 2018 14:33:26 -0600 Subject: tracing: Add hist trigger comments for variable-related fields Add a few comments to help clarify how variable and variable reference fields are used in the code. Link: http://lkml.kernel.org/r/ea857ce948531d7bec712bbb0f38360aa1d378ec.1545161087.git.tom.zanussi@linux.intel.com Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1b1aee944588..c5448c103770 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -40,6 +40,16 @@ enum field_op_id { FIELD_OP_UNARY_MINUS, }; +/* + * A hist_var (histogram variable) contains variable information for + * hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF + * flag set. A hist_var has a variable name e.g. ts0, and is + * associated with a given histogram trigger, as specified by + * hist_data. The hist_var idx is the unique index assigned to the + * variable by the hist trigger's tracing_map. The idx is what is + * used to set a variable's value and, by a variable reference, to + * retrieve it. + */ struct hist_var { char *name; struct hist_trigger_data *hist_data; @@ -56,11 +66,29 @@ struct hist_field { const char *type; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; struct hist_trigger_data *hist_data; + + /* + * Variable fields contain variable-specific info in var. + */ struct hist_var var; enum field_op_id operator; char *system; char *event_name; + + /* + * The name field is used for EXPR and VAR_REF fields. VAR + * fields contain the variable name in var.name. + */ char *name; + + /* + * When a histogram trigger is hit, if it has any references + * to variables, the values of those variables are collected + * into a var_ref_vals array by resolve_var_refs(). The + * current value of each variable is read from the tracing_map + * using the hist field's hist_var.idx and entered into the + * var_ref_idx entry i.e. var_ref_vals[var_ref_idx]. + */ unsigned int var_ref_idx; bool read_once; }; @@ -365,6 +393,14 @@ struct action_data { union { struct { + /* + * When a histogram trigger is hit, the values of any + * references to variables, including variables being passed + * as parameters to synthetic events, are collected into a + * var_ref_vals array. This var_ref_idx is the index of the + * first param in the array to be passed to the synthetic + * event invocation. + */ unsigned int var_ref_idx; char *match_event; char *match_event_system; -- cgit From 59dd974bc079079c23b5429cba841696fa7fae41 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 29 Oct 2018 23:35:40 +0100 Subject: tracing: Merge seq_print_sym_short() and seq_print_sym_offset() These two functions are nearly identical, so we can avoid some code duplication by moving the conditional into a common implementation. Link: http://lkml.kernel.org/r/20181029223542.26175-2-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 6e6cc64faa38..85ecd061c7be 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -339,34 +339,17 @@ static inline const char *kretprobed(const char *name) #endif /* CONFIG_KRETPROBES */ static void -seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) +seq_print_sym(struct trace_seq *s, const char *fmt, unsigned long address, + bool offset) { char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_KALLSYMS const char *name; - kallsyms_lookup(address, NULL, NULL, NULL, str); - - name = kretprobed(str); - - if (name && strlen(name)) { - trace_seq_printf(s, fmt, name); - return; - } -#endif - snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); - trace_seq_printf(s, fmt, str); -} - -static void -seq_print_sym_offset(struct trace_seq *s, const char *fmt, - unsigned long address) -{ - char str[KSYM_SYMBOL_LEN]; -#ifdef CONFIG_KALLSYMS - const char *name; - - sprint_symbol(str, address); + if (offset) + sprint_symbol(str, address); + else + kallsyms_lookup(address, NULL, NULL, NULL, str); name = kretprobed(str); if (name && strlen(name)) { @@ -424,10 +407,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - if (sym_flags & TRACE_ITER_SYM_OFFSET) - seq_print_sym_offset(s, "%s", ip); - else - seq_print_sym_short(s, "%s", ip); + seq_print_sym(s, "%s", ip, sym_flags & TRACE_ITER_SYM_OFFSET); if (sym_flags & TRACE_ITER_SYM_ADDR) trace_seq_printf(s, " <" IP_FMT ">", ip); -- cgit From cc9f59fb3bc455984404dbab8be16f156a47d023 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 29 Oct 2018 23:35:41 +0100 Subject: tracing: Avoid -Wformat-nonliteral warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with -Wformat-nonliteral, gcc complains kernel/trace/trace_output.c: In function ‘seq_print_sym’: kernel/trace/trace_output.c:356:3: warning: format not a string literal, argument types not checked [-Wformat-nonliteral] trace_seq_printf(s, fmt, name); But seq_print_sym only has a single caller which passes "%s" as fmt, so we might as well just use that directly. That also paves the way for further cleanups that will actually make that format string go away entirely. Link: http://lkml.kernel.org/r/20181029223542.26175-3-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 85ecd061c7be..f06fb899b746 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -339,8 +339,7 @@ static inline const char *kretprobed(const char *name) #endif /* CONFIG_KRETPROBES */ static void -seq_print_sym(struct trace_seq *s, const char *fmt, unsigned long address, - bool offset) +seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) { char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_KALLSYMS @@ -353,12 +352,12 @@ seq_print_sym(struct trace_seq *s, const char *fmt, unsigned long address, name = kretprobed(str); if (name && strlen(name)) { - trace_seq_printf(s, fmt, name); + trace_seq_printf(s, "%s", name); return; } #endif snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); - trace_seq_printf(s, fmt, str); + trace_seq_printf(s, "%s", str); } #ifndef CONFIG_64BIT @@ -407,7 +406,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - seq_print_sym(s, "%s", ip, sym_flags & TRACE_ITER_SYM_OFFSET); + seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); if (sym_flags & TRACE_ITER_SYM_ADDR) trace_seq_printf(s, " <" IP_FMT ">", ip); -- cgit From bea6957d5cd7cbb327083d3bcf080133ee63ef65 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 29 Oct 2018 23:35:42 +0100 Subject: tracing: Simplify printf'ing in seq_print_sym trace_seq_printf(..., "%s", ...) can be done with trace_seq_puts() instead, avoiding printf overhead. In the second instance, the string we're copying was just created from an snprintf() to a stack buffer, so we might as well do that printf directly. This naturally leads to moving the declaration of the str buffer inside the CONFIG_KALLSYMS guard, which in turn will make gcc inline the function for !CONFIG_KALLSYMS (it only has a single caller, but the huge stack frame seems to make gcc not inline it for CONFIG_KALLSYMS). Link: http://lkml.kernel.org/r/20181029223542.26175-4-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f06fb899b746..54373d93e251 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -341,8 +341,8 @@ static inline const char *kretprobed(const char *name) static void seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) { - char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; const char *name; if (offset) @@ -352,12 +352,11 @@ seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) name = kretprobed(str); if (name && strlen(name)) { - trace_seq_printf(s, "%s", name); + trace_seq_puts(s, name); return; } #endif - snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); - trace_seq_printf(s, "%s", str); + trace_seq_printf(s, "0x%08lx", address); } #ifndef CONFIG_64BIT -- cgit From 1cce377df1800154301525a8ee04100dd83c9633 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 16 May 2018 21:30:12 +0200 Subject: tracing: Make function ‘ftrace_exports’ static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 478409dd683d ("tracing: Add hook to function tracing for other subsystems to use"), a new function ‘ftrace_exports’ was added. Since this function can be made static, make it so. Silence the following warning triggered using W=1: kernel/trace/trace.c:2451:6: warning: no previous prototype for ‘ftrace_exports’ [-Wmissing-prototypes] Link: http://lkml.kernel.org/r/20180516193012.25390-1-malat@debian.org Signed-off-by: Mathieu Malaterre Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 911470ad9e94..5afcfecb4bc2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2452,7 +2452,7 @@ static inline void ftrace_exports_disable(void) static_branch_disable(&ftrace_exports_enabled); } -void ftrace_exports(struct ring_buffer_event *event) +static void ftrace_exports(struct ring_buffer_event *event) { struct trace_export *export; -- cgit From 754481e6954cbef53f8bc4412ad48dde611e21d3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 19 Dec 2018 22:38:21 -0500 Subject: tracing: Use str_has_prefix() helper for histogram code The tracing histogram code contains a lot of instances of the construct: strncmp(str, "const", sizeof("const") - 1) This can be prone to bugs due to typos or bad cut and paste. Use the str_has_prefix() helper macro instead that removes the need for having two copies of the constant string. Cc: Tom Zanussi Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c5448c103770..9d590138f870 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1881,8 +1881,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) if (attrs->n_actions >= HIST_ACTIONS_MAX) return ret; - if ((strncmp(str, "onmatch(", sizeof("onmatch(") - 1) == 0) || - (strncmp(str, "onmax(", sizeof("onmax(") - 1) == 0)) { + if ((str_has_prefix(str, "onmatch(")) || + (str_has_prefix(str, "onmax("))) { attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); if (!attrs->action_str[attrs->n_actions]) { ret = -ENOMEM; @@ -1899,34 +1899,34 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) { int ret = 0; - if ((strncmp(str, "key=", sizeof("key=") - 1) == 0) || - (strncmp(str, "keys=", sizeof("keys=") - 1) == 0)) { + if ((str_has_prefix(str, "key=")) || + (str_has_prefix(str, "keys="))) { attrs->keys_str = kstrdup(str, GFP_KERNEL); if (!attrs->keys_str) { ret = -ENOMEM; goto out; } - } else if ((strncmp(str, "val=", sizeof("val=") - 1) == 0) || - (strncmp(str, "vals=", sizeof("vals=") - 1) == 0) || - (strncmp(str, "values=", sizeof("values=") - 1) == 0)) { + } else if ((str_has_prefix(str, "val=")) || + (str_has_prefix(str, "vals=")) || + (str_has_prefix(str, "values="))) { attrs->vals_str = kstrdup(str, GFP_KERNEL); if (!attrs->vals_str) { ret = -ENOMEM; goto out; } - } else if (strncmp(str, "sort=", sizeof("sort=") - 1) == 0) { + } else if (str_has_prefix(str, "sort=")) { attrs->sort_key_str = kstrdup(str, GFP_KERNEL); if (!attrs->sort_key_str) { ret = -ENOMEM; goto out; } - } else if (strncmp(str, "name=", sizeof("name=") - 1) == 0) { + } else if (str_has_prefix(str, "name=")) { attrs->name = kstrdup(str, GFP_KERNEL); if (!attrs->name) { ret = -ENOMEM; goto out; } - } else if (strncmp(str, "clock=", sizeof("clock=") - 1) == 0) { + } else if (str_has_prefix(str, "clock=")) { strsep(&str, "="); if (!str) { ret = -EINVAL; @@ -1939,7 +1939,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) ret = -ENOMEM; goto out; } - } else if (strncmp(str, "size=", sizeof("size=") - 1) == 0) { + } else if (str_has_prefix(str, "size=")) { int map_bits = parse_map_size(str); if (map_bits < 0) { @@ -3558,7 +3558,7 @@ static struct action_data *onmax_parse(char *str) if (!onmax_fn_name || !str) goto free; - if (strncmp(onmax_fn_name, "save", sizeof("save") - 1) == 0) { + if (str_has_prefix(onmax_fn_name, "save")) { char *params = strsep(&str, ")"); if (!params) { @@ -4346,7 +4346,7 @@ static int parse_actions(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->attrs->n_actions; i++) { str = hist_data->attrs->action_str[i]; - if (strncmp(str, "onmatch(", sizeof("onmatch(") - 1) == 0) { + if (str_has_prefix(str, "onmatch(")) { char *action_str = str + sizeof("onmatch(") - 1; data = onmatch_parse(tr, action_str); @@ -4355,7 +4355,7 @@ static int parse_actions(struct hist_trigger_data *hist_data) break; } data->fn = action_trace; - } else if (strncmp(str, "onmax(", sizeof("onmax(") - 1) == 0) { + } else if (str_has_prefix(str, "onmax(")) { char *action_str = str + sizeof("onmax(") - 1; data = onmax_parse(action_str); -- cgit From b6b2735514bcd70ad1556a33892a636b20ece671 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Dec 2018 13:20:07 -0500 Subject: tracing: Use str_has_prefix() instead of using fixed sizes There are several instances of strncmp(str, "const", 123), where 123 is the strlen of the const string to check if "const" is the prefix of str. But this can be error prone. Use str_has_prefix() instead. Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_probe.c | 4 ++-- kernel/trace/trace_stack.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5afcfecb4bc2..eac2824a18ab 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4411,7 +4411,7 @@ static int trace_set_options(struct trace_array *tr, char *option) cmp = strstrip(option); - if (strncmp(cmp, "no", 2) == 0) { + if (str_has_prefix(cmp, "no")) { neg = 1; cmp += 2; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bd0162c0467c..5b3b0c3c8a47 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1251,7 +1251,7 @@ static int f_show(struct seq_file *m, void *v) */ array_descriptor = strchr(field->type, '['); - if (!strncmp(field->type, "__data_loc", 10)) + if (str_has_prefix(field->type, "__data_loc")) array_descriptor = NULL; if (!array_descriptor) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9d590138f870..0d878dcd1e4b 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -518,7 +518,7 @@ static int synth_event_define_fields(struct trace_event_call *call) static bool synth_field_signed(char *type) { - if (strncmp(type, "u", 1) == 0) + if (str_has_prefix(type, "u")) return false; return true; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ff86417c0149..541375737403 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -194,7 +194,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_RETVAL; else ret = -EINVAL; - } else if (strncmp(arg, "stack", 5) == 0) { + } else if (str_has_prefix(arg, "stack")) { if (arg[5] == '\0') { code->op = FETCH_OP_STACKP; } else if (isdigit(arg[5])) { @@ -213,7 +213,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && - strncmp(arg, "arg", 3) == 0) { + str_has_prefix(arg, "arg")) { if (!isdigit(arg[3])) return -EINVAL; ret = kstrtoul(arg + 3, 10, ¶m); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e2a153fc1afc..3641f28c343f 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -448,7 +448,7 @@ static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; static __init int enable_stacktrace(char *str) { - if (strncmp(str, "_filter=", 8) == 0) + if (str_has_prefix(str, "_filter=")) strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); stack_tracer_enabled = 1; -- cgit From 036876fa56204ae0fa59045bd6bbb2691a060633 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 21 Dec 2018 18:40:46 -0500 Subject: tracing: Have the historgram use the result of str_has_prefix() for len of prefix As str_has_prefix() returns the length on match, we can use that for the updating of the string pointer instead of recalculating the prefix size. Cc: Tom Zanussi Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0d878dcd1e4b..449d90cfa151 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4342,12 +4342,13 @@ static int parse_actions(struct hist_trigger_data *hist_data) unsigned int i; int ret = 0; char *str; + int len; for (i = 0; i < hist_data->attrs->n_actions; i++) { str = hist_data->attrs->action_str[i]; - if (str_has_prefix(str, "onmatch(")) { - char *action_str = str + sizeof("onmatch(") - 1; + if ((len = str_has_prefix(str, "onmatch("))) { + char *action_str = str + len; data = onmatch_parse(tr, action_str); if (IS_ERR(data)) { @@ -4355,8 +4356,8 @@ static int parse_actions(struct hist_trigger_data *hist_data) break; } data->fn = action_trace; - } else if (str_has_prefix(str, "onmax(")) { - char *action_str = str + sizeof("onmax(") - 1; + } else if ((len = str_has_prefix(str, "onmax("))) { + char *action_str = str + len; data = onmax_parse(action_str); if (IS_ERR(data)) { -- cgit From 3d739c1f6156c70eb0548aa288dcfbac9e0bd162 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 21 Dec 2018 23:10:26 -0500 Subject: tracing: Use the return of str_has_prefix() to remove open coded numbers There are several locations that compare constants to the beginning of string variables to determine what commands should be done, then the constant length is used to index into the string. This is error prone as the hard coded numbers have to match the size of the constants. Instead, use the len returned from str_has_prefix() and remove the open coded string length sizes. Cc: Joe Perches Acked-by: Masami Hiramatsu (for trace_probe part) Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 8 +++++--- kernel/trace/trace_probe.c | 17 +++++++++-------- kernel/trace/trace_stack.c | 6 ++++-- 3 files changed, 18 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eac2824a18ab..18b86c3974e1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4408,13 +4408,15 @@ static int trace_set_options(struct trace_array *tr, char *option) int neg = 0; int ret; size_t orig_len = strlen(option); + int len; cmp = strstrip(option); - if (str_has_prefix(cmp, "no")) { + len = str_has_prefix(cmp, "no"); + if (len) neg = 1; - cmp += 2; - } + + cmp += len; mutex_lock(&trace_types_lock); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 541375737403..9962cb5da8ac 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -186,19 +186,20 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, static int parse_probe_vars(char *arg, const struct fetch_type *t, struct fetch_insn *code, unsigned int flags) { - int ret = 0; unsigned long param; + int ret = 0; + int len; if (strcmp(arg, "retval") == 0) { if (flags & TPARG_FL_RETURN) code->op = FETCH_OP_RETVAL; else ret = -EINVAL; - } else if (str_has_prefix(arg, "stack")) { - if (arg[5] == '\0') { + } else if ((len = str_has_prefix(arg, "stack"))) { + if (arg[len] == '\0') { code->op = FETCH_OP_STACKP; - } else if (isdigit(arg[5])) { - ret = kstrtoul(arg + 5, 10, ¶m); + } else if (isdigit(arg[len])) { + ret = kstrtoul(arg + len, 10, ¶m); if (ret || ((flags & TPARG_FL_KERNEL) && param > PARAM_MAX_STACK)) ret = -EINVAL; @@ -213,10 +214,10 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && - str_has_prefix(arg, "arg")) { - if (!isdigit(arg[3])) + (len = str_has_prefix(arg, "arg"))) { + if (!isdigit(arg[len])) return -EINVAL; - ret = kstrtoul(arg + 3, 10, ¶m); + ret = kstrtoul(arg + len, 10, ¶m); if (ret || !param || param > PARAM_MAX_STACK) return -EINVAL; code->op = FETCH_OP_ARG; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3641f28c343f..eec648a0d673 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -448,8 +448,10 @@ static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; static __init int enable_stacktrace(char *str) { - if (str_has_prefix(str, "_filter=")) - strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); + int len; + + if ((len = str_has_prefix(str, "_filter="))) + strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); stack_tracer_enabled = 1; last_stack_tracer_enabled = 1; -- cgit From 6d101ba6be2a26a3e1f513b5e293f0fd2b79ec5c Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Sun, 25 Nov 2018 14:41:05 -0800 Subject: sched/fair: Fix warning on non-SMP build Caused by making the variable static: kernel/sched/fair.c:119:21: warning: 'capacity_margin' defined but not used [-Wunused-variable] Seems easiest to just move it up under the existing ifdef CONFIG_SMP that's a few lines above. Fixes: ed8885a14433a ('sched/fair: Make some variables static') Signed-off-by: Olof Johansson Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1c1cfbf6ba0c..d1907506318a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -94,6 +94,14 @@ int __weak arch_asym_cpu_priority(int cpu) { return -cpu; } + +/* + * The margin used when comparing utilization with CPU capacity: + * util * margin < capacity * 1024 + * + * (default: ~20%) + */ +static unsigned int capacity_margin = 1280; #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -110,14 +118,6 @@ int __weak arch_asym_cpu_priority(int cpu) unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif -/* - * The margin used when comparing utilization with CPU capacity: - * util * margin < capacity * 1024 - * - * (default: ~20%) - */ -static unsigned int capacity_margin = 1280; - static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; -- cgit From e250d91d65750a0c0c62483ac4f9f357e7317617 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Thu, 13 Dec 2018 15:17:37 +0100 Subject: cgroup: fix parsing empty mount option string This fixes the case where all mount options specified are consumed by an LSM and all that's left is an empty string. In this case cgroupfs should accept the string and not fail. How to reproduce (with SELinux enabled): # umount /sys/fs/cgroup/unified # mount -o context=system_u:object_r:cgroup_t:s0 -t cgroup2 cgroup2 /sys/fs/cgroup/unified mount: /sys/fs/cgroup/unified: wrong fs type, bad option, bad superblock on cgroup2, missing codepage or helper program, or other error. # dmesg | tail -n 1 [ 31.575952] cgroup: cgroup2: unknown option "" Fixes: 67e9c74b8a87 ("cgroup: replace __DEVEL__sane_behavior with cgroup2 fs type") [NOTE: should apply on top of commit 5136f6365ce3 ("cgroup: implement "nsdelegate" mount option"), older versions need manual rebase] Suggested-by: Stephen Smalley Signed-off-by: Ondrej Mosnacek Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8da9bf5e422f..879c9f191f66 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1748,7 +1748,7 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) *root_flags = 0; - if (!data) + if (!data || *data == '\0') return 0; while ((token = strsep(&data, ",")) != NULL) { -- cgit From 3fc9c12d27b4ded4f1f761a800558dab2e6bbac5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Dec 2018 10:31:07 -0800 Subject: cgroup: Add named hierarchy disabling to cgroup_no_v1 boot param It can be useful to inhibit all cgroup1 hierarchies especially during transition and for debugging. cgroup_no_v1 can block hierarchies with controllers which leaves out the named hierarchies. Expand it to cover the named hierarchies so that "cgroup_no_v1=all,named" disables all cgroup1 hierarchies. Signed-off-by: Tejun Heo Suggested-by: Marcin Pawlowski Signed-off-by: Tejun Heo --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++-- kernel/cgroup/cgroup-v1.c | 14 +++++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 81d1d5a74728..8b7652449228 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -486,10 +486,14 @@ cut the overhead, others just disable the usage. So only cgroup_disable=memory is actually worthy} - cgroup_no_v1= [KNL] Disable one, multiple, all cgroup controllers in v1 - Format: { controller[,controller...] | "all" } + cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1 + Format: { { controller | "all" | "named" } + [,{ controller | "all" | "named" }...] } Like cgroup_disable, but only applies to cgroup v1; the blacklisted controllers remain available in cgroup2. + "all" blacklists all controllers and "named" disables + named mounts. Specifying both "all" and "named" disables + all v1 hierarchies. cgroup.memory= [KNL] Pass options to the cgroup memory controller. Format: diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 51063e7a93c2..583b969b0c0e 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -27,6 +27,9 @@ /* Controllers blocked by the commandline in v1 */ static u16 cgroup_no_v1_mask; +/* disable named v1 mounts */ +static bool cgroup_no_v1_named; + /* * pidlist destructions need to be flushed on cgroup destruction. Use a * separate workqueue as flush domain. @@ -963,6 +966,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) } if (!strncmp(token, "name=", 5)) { const char *name = token + 5; + + /* blocked by boot param? */ + if (cgroup_no_v1_named) + return -ENOENT; /* Can't specify an empty name */ if (!strlen(name)) return -EINVAL; @@ -1292,7 +1299,12 @@ static int __init cgroup_no_v1(char *str) if (!strcmp(token, "all")) { cgroup_no_v1_mask = U16_MAX; - break; + continue; + } + + if (!strcmp(token, "named")) { + cgroup_no_v1_named = true; + continue; } for_each_subsys(ss, i) { -- cgit From 3d6357de8aa09e1966770dc1171c72679946464f Mon Sep 17 00:00:00 2001 From: Arun KS Date: Fri, 28 Dec 2018 00:34:20 -0800 Subject: mm: reference totalram_pages and managed_pages once per function Patch series "mm: convert totalram_pages, totalhigh_pages and managed pages to atomic", v5. This series converts totalram_pages, totalhigh_pages and zone->managed_pages to atomic variables. totalram_pages, zone->managed_pages and totalhigh_pages updates are protected by managed_page_count_lock, but readers never care about it. Convert these variables to atomic to avoid readers potentially seeing a store tear. Main motivation was that managed_page_count_lock handling was complicating things. It was discussed in length here, https://lore.kernel.org/patchwork/patch/995739/#1181785 It seemes better to remove the lock and convert variables to atomic. With the change, preventing poteintial store-to-read tearing comes as a bonus. This patch (of 4): This is in preparation to a later patch which converts totalram_pages and zone->managed_pages to atomic variables. Please note that re-reading the value might lead to a different value and as such it could lead to unexpected behavior. There are no known bugs as a result of the current code but it is better to prevent from them in principle. Link: http://lkml.kernel.org/r/1542090790-21750-2-git-send-email-arunks@codeaurora.org Signed-off-by: Arun KS Reviewed-by: Konstantin Khlebnikov Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/mem.c | 2 +- arch/x86/kernel/cpu/microcode/core.c | 5 +++-- drivers/hv/hv_balloon.c | 19 ++++++++++--------- fs/file_table.c | 7 ++++--- kernel/fork.c | 5 +++-- kernel/kexec_core.c | 5 +++-- mm/page_alloc.c | 5 +++-- mm/shmem.c | 3 ++- net/dccp/proto.c | 7 ++++--- net/netfilter/nf_conntrack_core.c | 7 ++++--- net/netfilter/xt_hashlimit.c | 5 +++-- net/sctp/protocol.c | 7 ++++--- 12 files changed, 44 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 1067469ba2ea..2da209687a22 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -52,7 +52,7 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ memblock_free_all(); max_low_pfn = totalram_pages; - max_pfn = totalram_pages; + max_pfn = max_low_pfn; mem_init_print_info(NULL); kmalloc_ok = 1; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 2637ff09d6a0..168fa272cc3e 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -434,9 +434,10 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { ssize_t ret = -EINVAL; + unsigned long nr_pages = totalram_pages; - if ((len >> PAGE_SHIFT) > totalram_pages) { - pr_err("too much data (max %ld pages)\n", totalram_pages); + if ((len >> PAGE_SHIFT) > nr_pages) { + pr_err("too much data (max %ld pages)\n", nr_pages); return ret; } diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 41631512ae97..f3e7da981610 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1090,6 +1090,7 @@ static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg) static unsigned long compute_balloon_floor(void) { unsigned long min_pages; + unsigned long nr_pages = totalram_pages; #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) /* Simple continuous piecewiese linear function: * max MiB -> min MiB gradient @@ -1102,16 +1103,16 @@ static unsigned long compute_balloon_floor(void) * 8192 744 (1/16) * 32768 1512 (1/32) */ - if (totalram_pages < MB2PAGES(128)) - min_pages = MB2PAGES(8) + (totalram_pages >> 1); - else if (totalram_pages < MB2PAGES(512)) - min_pages = MB2PAGES(40) + (totalram_pages >> 2); - else if (totalram_pages < MB2PAGES(2048)) - min_pages = MB2PAGES(104) + (totalram_pages >> 3); - else if (totalram_pages < MB2PAGES(8192)) - min_pages = MB2PAGES(232) + (totalram_pages >> 4); + if (nr_pages < MB2PAGES(128)) + min_pages = MB2PAGES(8) + (nr_pages >> 1); + else if (nr_pages < MB2PAGES(512)) + min_pages = MB2PAGES(40) + (nr_pages >> 2); + else if (nr_pages < MB2PAGES(2048)) + min_pages = MB2PAGES(104) + (nr_pages >> 3); + else if (nr_pages < MB2PAGES(8192)) + min_pages = MB2PAGES(232) + (nr_pages >> 4); else - min_pages = MB2PAGES(488) + (totalram_pages >> 5); + min_pages = MB2PAGES(488) + (nr_pages >> 5); #undef MB2PAGES return min_pages; } diff --git a/fs/file_table.c b/fs/file_table.c index e49af4caf15d..b6e9587f05c7 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -380,10 +380,11 @@ void __init files_init(void) void __init files_maxfiles_init(void) { unsigned long n; - unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2; + unsigned long nr_pages = totalram_pages; + unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; - memreserve = min(memreserve, totalram_pages - 1); - n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; + memreserve = min(memreserve, nr_pages - 1); + n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); } diff --git a/kernel/fork.c b/kernel/fork.c index e2a5156bc9c3..8617a326e9f5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -744,15 +744,16 @@ void __init __weak arch_task_cache_init(void) { } static void set_max_threads(unsigned int max_threads_suggested) { u64 threads; + unsigned long nr_pages = totalram_pages; /* * The number of threads shall be limited such that the thread * structures may only consume a small part of the available memory. */ - if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64) + if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64) threads = MAX_THREADS; else - threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, + threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE, (u64) THREAD_SIZE * 8UL); if (threads > max_threads_suggested) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 86ef06d3dbe3..7e967ca98d92 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -152,6 +152,7 @@ int sanity_check_segment_list(struct kimage *image) int i; unsigned long nr_segments = image->nr_segments; unsigned long total_pages = 0; + unsigned long nr_pages = totalram_pages; /* * Verify we have good destination addresses. The caller is @@ -217,13 +218,13 @@ int sanity_check_segment_list(struct kimage *image) * wasted allocating pages, which can cause a soft lockup. */ for (i = 0; i < nr_segments; i++) { - if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2) + if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2) return -EINVAL; total_pages += PAGE_COUNT(image->segment[i].memsz); } - if (total_pages > totalram_pages / 2) + if (total_pages > nr_pages / 2) return -EINVAL; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c82c41edfe22..b79e79caea99 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7257,6 +7257,7 @@ static void calculate_totalreserve_pages(void) for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; + unsigned long managed_pages = zone->managed_pages; /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -7267,8 +7268,8 @@ static void calculate_totalreserve_pages(void) /* we treat the high watermark as reserved pages. */ max += high_wmark_pages(zone); - if (max > zone->managed_pages) - max = zone->managed_pages; + if (max > managed_pages) + max = managed_pages; pgdat->totalreserve_pages += max; diff --git a/mm/shmem.c b/mm/shmem.c index 375f3ac19bb8..b1f0f54470fb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -114,7 +114,8 @@ static unsigned long shmem_default_max_blocks(void) static unsigned long shmem_default_max_inodes(void) { - return min(totalram_pages - totalhigh_pages, totalram_pages / 2); + unsigned long nr_pages = totalram_pages; + return min(nr_pages - totalhigh_pages, nr_pages / 2); } #endif diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2cc5fbb1b29e..ff727ff61b5b 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1131,6 +1131,7 @@ EXPORT_SYMBOL_GPL(dccp_debug); static int __init dccp_init(void) { unsigned long goal; + unsigned long nr_pages = totalram_pages; int ehash_order, bhash_order, i; int rc; @@ -1157,10 +1158,10 @@ static int __init dccp_init(void) * * The methodology is similar to that of the buffer cache. */ - if (totalram_pages >= (128 * 1024)) - goal = totalram_pages >> (21 - PAGE_SHIFT); + if (nr_pages >= (128 * 1024)) + goal = nr_pages >> (21 - PAGE_SHIFT); else - goal = totalram_pages >> (23 - PAGE_SHIFT); + goal = nr_pages >> (23 - PAGE_SHIFT); if (thash_entries) goal = (thash_entries * diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e87c21e47efe..5eb990830348 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2248,6 +2248,7 @@ static __always_inline unsigned int total_extension_size(void) int nf_conntrack_init_start(void) { + unsigned long nr_pages = totalram_pages; int max_factor = 8; int ret = -ENOMEM; int i; @@ -2267,11 +2268,11 @@ int nf_conntrack_init_start(void) * >= 4GB machines have 65536 buckets. */ nf_conntrack_htable_size - = (((totalram_pages << PAGE_SHIFT) / 16384) + = (((nr_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) + if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) nf_conntrack_htable_size = 65536; - else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) + else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) nf_conntrack_htable_size = 16384; if (nf_conntrack_htable_size < 32) nf_conntrack_htable_size = 32; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 28e27a32d9b9..88b520ba2abc 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -274,14 +274,15 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, struct xt_hashlimit_htable *hinfo; const struct seq_operations *ops; unsigned int size, i; + unsigned long nr_pages = totalram_pages; int ret; if (cfg->size) { size = cfg->size; } else { - size = (totalram_pages << PAGE_SHIFT) / 16384 / + size = (nr_pages << PAGE_SHIFT) / 16384 / sizeof(struct hlist_head); - if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) + if (nr_pages > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) size = 16; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 9b277bd36d1a..a5b24182b3cc 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1368,6 +1368,7 @@ static __init int sctp_init(void) int status = -EINVAL; unsigned long goal; unsigned long limit; + unsigned long nr_pages = totalram_pages; int max_share; int order; int num_entries; @@ -1426,10 +1427,10 @@ static __init int sctp_init(void) * The methodology is similar to that of the tcp hash tables. * Though not identical. Start by getting a goal size */ - if (totalram_pages >= (128 * 1024)) - goal = totalram_pages >> (22 - PAGE_SHIFT); + if (nr_pages >= (128 * 1024)) + goal = nr_pages >> (22 - PAGE_SHIFT); else - goal = totalram_pages >> (24 - PAGE_SHIFT); + goal = nr_pages >> (24 - PAGE_SHIFT); /* Then compute the page order for said goal */ order = get_order(goal); -- cgit From ca79b0c211af63fa3276f0e3fd7dd9ada2439839 Mon Sep 17 00:00:00 2001 From: Arun KS Date: Fri, 28 Dec 2018 00:34:29 -0800 Subject: mm: convert totalram_pages and totalhigh_pages variables to atomic totalram_pages and totalhigh_pages are made static inline function. Main motivation was that managed_page_count_lock handling was complicating things. It was discussed in length here, https://lore.kernel.org/patchwork/patch/995739/#1181785 So it seemes better to remove the lock and convert variables to atomic, with preventing poteintial store-to-read tearing as a bonus. [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/1542090790-21750-4-git-send-email-arunks@codeaurora.org Signed-off-by: Arun KS Suggested-by: Michal Hocko Suggested-by: Vlastimil Babka Reviewed-by: Konstantin Khlebnikov Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/csky/mm/init.c | 4 ++-- arch/powerpc/platforms/pseries/cmm.c | 10 +++++----- arch/s390/mm/init.c | 2 +- arch/um/kernel/mem.c | 2 +- arch/x86/kernel/cpu/microcode/core.c | 2 +- drivers/char/agp/backend.c | 4 ++-- drivers/gpu/drm/i915/i915_gem.c | 2 +- drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 4 ++-- drivers/hv/hv_balloon.c | 2 +- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-integrity.c | 2 +- drivers/md/dm-stats.c | 2 +- drivers/media/platform/mtk-vpu/mtk_vpu.c | 2 +- drivers/misc/vmw_balloon.c | 2 +- drivers/parisc/ccio-dma.c | 4 ++-- drivers/parisc/sba_iommu.c | 4 ++-- drivers/staging/android/ion/ion_system_heap.c | 2 +- drivers/xen/xen-selfballoon.c | 6 +++--- fs/ceph/super.h | 2 +- fs/file_table.c | 2 +- fs/fuse/inode.c | 2 +- fs/nfs/write.c | 2 +- fs/nfsd/nfscache.c | 2 +- fs/ntfs/malloc.h | 2 +- fs/proc/base.c | 2 +- include/linux/highmem.h | 28 +++++++++++++++++++++++++-- include/linux/mm.h | 27 +++++++++++++++++++++++++- include/linux/swap.h | 1 - kernel/fork.c | 2 +- kernel/kexec_core.c | 2 +- kernel/power/snapshot.c | 2 +- mm/highmem.c | 5 ++--- mm/huge_memory.c | 2 +- mm/kasan/quarantine.c | 2 +- mm/memblock.c | 4 ++-- mm/mm_init.c | 2 +- mm/oom_kill.c | 2 +- mm/page_alloc.c | 20 ++++++++++--------- mm/shmem.c | 9 +++++---- mm/slab.c | 2 +- mm/swap.c | 2 +- mm/util.c | 2 +- mm/vmalloc.c | 4 ++-- mm/workingset.c | 2 +- mm/zswap.c | 4 ++-- net/dccp/proto.c | 2 +- net/decnet/dn_route.c | 2 +- net/ipv4/tcp_metrics.c | 2 +- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/xt_hashlimit.c | 2 +- net/sctp/protocol.c | 2 +- security/integrity/ima/ima_kexec.c | 2 +- 53 files changed, 131 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index dc07c078f9b8..66e597053488 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -71,7 +71,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) ClearPageReserved(virt_to_page(start)); init_page_count(virt_to_page(start)); free_page(start); - totalram_pages++; + totalram_pages_inc(); } } #endif @@ -88,7 +88,7 @@ void free_initmem(void) ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); free_page(addr); - totalram_pages++; + totalram_pages_inc(); addr += PAGE_SIZE; } diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 25427a48feae..e8d63a6a9002 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -208,7 +208,7 @@ static long cmm_alloc_pages(long nr) pa->page[pa->index++] = addr; loaned_pages++; - totalram_pages--; + totalram_pages_dec(); spin_unlock(&cmm_lock); nr--; } @@ -247,7 +247,7 @@ static long cmm_free_pages(long nr) free_page(addr); loaned_pages--; nr--; - totalram_pages++; + totalram_pages_inc(); } spin_unlock(&cmm_lock); cmm_dbg("End request with %ld pages unfulfilled\n", nr); @@ -291,7 +291,7 @@ static void cmm_get_mpp(void) int rc; struct hvcall_mpp_data mpp_data; signed long active_pages_target, page_loan_request, target; - signed long total_pages = totalram_pages + loaned_pages; + signed long total_pages = totalram_pages() + loaned_pages; signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE; rc = h_get_mpp(&mpp_data); @@ -322,7 +322,7 @@ static void cmm_get_mpp(void) cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n", page_loan_request, loaned_pages, loaned_pages_target, - oom_freed_pages, totalram_pages); + oom_freed_pages, totalram_pages()); } static struct notifier_block cmm_oom_nb = { @@ -581,7 +581,7 @@ static int cmm_mem_going_offline(void *arg) free_page(pa_curr->page[idx]); freed++; loaned_pages--; - totalram_pages++; + totalram_pages_inc(); pa_curr->page[idx] = pa_last->page[--pa_last->index]; if (pa_last->index == 0) { if (pa_curr == pa_last) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 76d0708438e9..50388190b393 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -59,7 +59,7 @@ static void __init setup_zero_pages(void) order = 7; /* Limit number of empty zero pages for small memory sizes */ - while (order > 2 && (totalram_pages >> 10) < (1UL << order)) + while (order > 2 && (totalram_pages() >> 10) < (1UL << order)) order--; empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 2da209687a22..8d21a83dd289 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -51,7 +51,7 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ memblock_free_all(); - max_low_pfn = totalram_pages; + max_low_pfn = totalram_pages(); max_pfn = max_low_pfn; mem_init_print_info(NULL); kmalloc_ok = 1; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 168fa272cc3e..97f9ada9ceda 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -434,7 +434,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { ssize_t ret = -EINVAL; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); if ((len >> PAGE_SHIFT) > nr_pages) { pr_err("too much data (max %ld pages)\n", nr_pages); diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 38ffb281df97..004a3ce8ba72 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -115,9 +115,9 @@ static int agp_find_max(void) long memory, index, result; #if PAGE_SHIFT < 20 - memory = totalram_pages >> (20 - PAGE_SHIFT); + memory = totalram_pages() >> (20 - PAGE_SHIFT); #else - memory = totalram_pages << (PAGE_SHIFT - 20); + memory = totalram_pages() << (PAGE_SHIFT - 20); #endif index = 1; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d36a9755ad91..a9de07bb72c8 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2559,7 +2559,7 @@ static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) * If there's no chance of allocating enough pages for the whole * object, bail early. */ - if (page_count > totalram_pages) + if (page_count > totalram_pages()) return -ENOMEM; st = kmalloc(sizeof(*st), GFP_KERNEL); diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c index 69fe86b30fbb..a9ed0ecc94e2 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c @@ -170,7 +170,7 @@ static int igt_ppgtt_alloc(void *arg) * This should ensure that we do not run into the oomkiller during * the test and take down the machine wilfully. */ - limit = totalram_pages << PAGE_SHIFT; + limit = totalram_pages() << PAGE_SHIFT; limit = min(ppgtt->vm.total, limit); /* Check we can allocate the entire range */ @@ -1244,7 +1244,7 @@ static int exercise_mock(struct drm_i915_private *i915, u64 hole_start, u64 hole_end, unsigned long end_time)) { - const u64 limit = totalram_pages << PAGE_SHIFT; + const u64 limit = totalram_pages() << PAGE_SHIFT; struct i915_gem_context *ctx; struct i915_hw_ppgtt *ppgtt; IGT_TIMEOUT(end_time); diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index f3e7da981610..5301fef16c31 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1090,7 +1090,7 @@ static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg) static unsigned long compute_balloon_floor(void) { unsigned long min_pages; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) /* Simple continuous piecewiese linear function: * max MiB -> min MiB gradient diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index dc385b70e4c3..8b0b628e5d1c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1887,7 +1887,7 @@ static int __init dm_bufio_init(void) dm_bufio_allocated_vmalloc = 0; dm_bufio_current_allocated = 0; - mem = (__u64)mult_frac(totalram_pages - totalhigh_pages, + mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(), DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT; if (mem > ULONG_MAX) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index a7195eb5b8d8..a8c32de29e3f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2158,7 +2158,7 @@ static int crypt_wipe_key(struct crypt_config *cc) static void crypt_calculate_pages_per_client(void) { - unsigned long pages = (totalram_pages - totalhigh_pages) * DM_CRYPT_MEMORY_PERCENT / 100; + unsigned long pages = (totalram_pages() - totalhigh_pages()) * DM_CRYPT_MEMORY_PERCENT / 100; if (!dm_crypt_clients_n) return; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index d4ad0bfee251..62baa3214cc7 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2843,7 +2843,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors, PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT); journal_desc_size = journal_pages * sizeof(struct page_list); - if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) { + if (journal_pages >= totalram_pages() - totalhigh_pages() || journal_desc_size > ULONG_MAX) { *error = "Journal doesn't fit into memory"; r = -ENOMEM; goto bad; diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 21de30b4e2a1..45b92a3d9d8e 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -85,7 +85,7 @@ static bool __check_shared_memory(size_t alloc_size) a = shared_memory_amount + alloc_size; if (a < shared_memory_amount) return false; - if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR) + if (a >> PAGE_SHIFT > totalram_pages() / DM_STATS_MEMORY_FACTOR) return false; #ifdef CONFIG_MMU if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR) diff --git a/drivers/media/platform/mtk-vpu/mtk_vpu.c b/drivers/media/platform/mtk-vpu/mtk_vpu.c index 616f78b24a79..b6602490a247 100644 --- a/drivers/media/platform/mtk-vpu/mtk_vpu.c +++ b/drivers/media/platform/mtk-vpu/mtk_vpu.c @@ -855,7 +855,7 @@ static int mtk_vpu_probe(struct platform_device *pdev) /* Set PTCM to 96K and DTCM to 32K */ vpu_cfg_writel(vpu, 0x2, VPU_TCM_CFG); - vpu->enable_4GB = !!(totalram_pages > (SZ_2G >> PAGE_SHIFT)); + vpu->enable_4GB = !!(totalram_pages() > (SZ_2G >> PAGE_SHIFT)); dev_info(dev, "4GB mode %u\n", vpu->enable_4GB); if (vpu->enable_4GB) { diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 9b0b3fa4f836..e6126a4b95d3 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -570,7 +570,7 @@ static int vmballoon_send_get_target(struct vmballoon *b) unsigned long status; unsigned long limit; - limit = totalram_pages; + limit = totalram_pages(); /* Ensure limit fits in 32-bits */ if (limit != (u32)limit) diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 701a7d6a74d5..358e380eb7fa 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -1251,7 +1251,7 @@ ccio_ioc_init(struct ioc *ioc) ** Hot-Plug/Removal of PCI cards. (aka PCI OLARD). */ - iova_space_size = (u32) (totalram_pages / count_parisc_driver(&ccio_driver)); + iova_space_size = (u32) (totalram_pages() / count_parisc_driver(&ccio_driver)); /* limit IOVA space size to 1MB-1GB */ @@ -1290,7 +1290,7 @@ ccio_ioc_init(struct ioc *ioc) DBG_INIT("%s() hpa 0x%p mem %luMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_regs, - (unsigned long) totalram_pages >> (20 - PAGE_SHIFT), + (unsigned long) totalram_pages() >> (20 - PAGE_SHIFT), iova_space_size>>20, iov_order + PAGE_SHIFT); diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index c1e599a429af..e0655949480a 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -1414,7 +1414,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num) ** for DMA hints - ergo only 30 bits max. */ - iova_space_size = (u32) (totalram_pages/global_ioc_cnt); + iova_space_size = (u32) (totalram_pages()/global_ioc_cnt); /* limit IOVA space size to 1MB-1GB */ if (iova_space_size < (1 << (20 - PAGE_SHIFT))) { @@ -1439,7 +1439,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num) DBG_INIT("%s() hpa 0x%lx mem %ldMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_hpa, - (unsigned long) totalram_pages >> (20 - PAGE_SHIFT), + (unsigned long) totalram_pages() >> (20 - PAGE_SHIFT), iova_space_size>>20, iov_order + PAGE_SHIFT); diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c index 548bb02c0ca6..6cb0eebdff89 100644 --- a/drivers/staging/android/ion/ion_system_heap.c +++ b/drivers/staging/android/ion/ion_system_heap.c @@ -110,7 +110,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap, unsigned long size_remaining = PAGE_ALIGN(size); unsigned int max_order = orders[0]; - if (size / PAGE_SIZE > totalram_pages / 2) + if (size / PAGE_SIZE > totalram_pages() / 2) return -ENOMEM; INIT_LIST_HEAD(&pages); diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 5165aa82bf7d..246f6122c9ee 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -189,7 +189,7 @@ static void selfballoon_process(struct work_struct *work) bool reset_timer = false; if (xen_selfballooning_enabled) { - cur_pages = totalram_pages; + cur_pages = totalram_pages(); tgt_pages = cur_pages; /* default is no change */ goal_pages = vm_memory_committed() + totalreserve_pages + @@ -227,7 +227,7 @@ static void selfballoon_process(struct work_struct *work) if (tgt_pages < floor_pages) tgt_pages = floor_pages; balloon_set_new_target(tgt_pages + - balloon_stats.current_pages - totalram_pages); + balloon_stats.current_pages - totalram_pages()); reset_timer = true; } #ifdef CONFIG_FRONTSWAP @@ -569,7 +569,7 @@ int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) * much more reliably and response faster in some cases. */ if (!selfballoon_reserved_mb) { - reserve_pages = totalram_pages / 10; + reserve_pages = totalram_pages() / 10; selfballoon_reserved_mb = PAGES2MB(reserve_pages); } schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 79a265ba9200..dfb64a5211b6 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -810,7 +810,7 @@ static inline int default_congestion_kb(void) * This allows larger machines to have larger/more transfers. * Limit the default to 256M */ - congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); if (congestion_kb > 256*1024) congestion_kb = 256*1024; diff --git a/fs/file_table.c b/fs/file_table.c index b6e9587f05c7..5679e7fcb6b0 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -380,7 +380,7 @@ void __init files_init(void) void __init files_maxfiles_init(void) { unsigned long n; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; memreserve = min(memreserve, nr_pages - 1); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 568abed20eb2..76baaa6be393 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -824,7 +824,7 @@ static const struct super_operations fuse_super_operations = { static void sanitize_global_limit(unsigned *limit) { if (*limit == 0) - *limit = ((totalram_pages << PAGE_SHIFT) >> 13) / + *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / sizeof(struct fuse_req); if (*limit >= 1 << 16) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 586726a590d8..4f15665f0ad1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -2121,7 +2121,7 @@ int __init nfs_init_writepagecache(void) * This allows larger machines to have larger/more transfers. * Limit the default to 256M */ - nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); if (nfs_congestion_kb > 256*1024) nfs_congestion_kb = 256*1024; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index e2fe0e9ce0df..da52b594362a 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -99,7 +99,7 @@ static unsigned int nfsd_cache_size_limit(void) { unsigned int limit; - unsigned long low_pages = totalram_pages - totalhigh_pages; + unsigned long low_pages = totalram_pages() - totalhigh_pages(); limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10); return min_t(unsigned int, limit, 256*1024); diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index ab172e5f51d9..5becc8acc8f4 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); /* return (void *)__get_free_page(gfp_mask); */ } - if (likely((size >> PAGE_SHIFT) < totalram_pages)) + if (likely((size >> PAGE_SHIFT) < totalram_pages())) return __vmalloc(size, gfp_mask, PAGE_KERNEL); return NULL; } diff --git a/fs/proc/base.c b/fs/proc/base.c index ce3465479447..d7fd1ca807d2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -530,7 +530,7 @@ static const struct file_operations proc_lstats_operations = { static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - unsigned long totalpages = totalram_pages + total_swap_pages; + unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; points = oom_badness(task, NULL, NULL, totalpages) * diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 0690679832d4..ea5cdbd8c2c3 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -36,7 +36,31 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); -extern unsigned long totalhigh_pages; +extern atomic_long_t _totalhigh_pages; +static inline unsigned long totalhigh_pages(void) +{ + return (unsigned long)atomic_long_read(&_totalhigh_pages); +} + +static inline void totalhigh_pages_inc(void) +{ + atomic_long_inc(&_totalhigh_pages); +} + +static inline void totalhigh_pages_dec(void) +{ + atomic_long_dec(&_totalhigh_pages); +} + +static inline void totalhigh_pages_add(long count) +{ + atomic_long_add(count, &_totalhigh_pages); +} + +static inline void totalhigh_pages_set(long val) +{ + atomic_long_set(&_totalhigh_pages, val); +} void kmap_flush_unused(void); @@ -51,7 +75,7 @@ static inline struct page *kmap_to_page(void *addr) return virt_to_page(addr); } -#define totalhigh_pages 0UL +static inline unsigned long totalhigh_pages(void) { return 0UL; } #ifndef ARCH_HAS_KMAP static inline void *kmap(struct page *page) diff --git a/include/linux/mm.h b/include/linux/mm.h index b4d01969e700..1d2be4c2d34a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -48,7 +48,32 @@ static inline void set_max_mapnr(unsigned long limit) static inline void set_max_mapnr(unsigned long limit) { } #endif -extern unsigned long totalram_pages; +extern atomic_long_t _totalram_pages; +static inline unsigned long totalram_pages(void) +{ + return (unsigned long)atomic_long_read(&_totalram_pages); +} + +static inline void totalram_pages_inc(void) +{ + atomic_long_inc(&_totalram_pages); +} + +static inline void totalram_pages_dec(void) +{ + atomic_long_dec(&_totalram_pages); +} + +static inline void totalram_pages_add(long count) +{ + atomic_long_add(count, &_totalram_pages); +} + +static inline void totalram_pages_set(long val) +{ + atomic_long_set(&_totalram_pages, val); +} + extern void * high_memory; extern int page_cluster; diff --git a/include/linux/swap.h b/include/linux/swap.h index a8f6d5d89524..77459d695010 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -310,7 +310,6 @@ void workingset_update_node(struct xa_node *node); } while (0) /* linux/mm/page_alloc.c */ -extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; extern unsigned long nr_free_buffer_pages(void); extern unsigned long nr_free_pagecache_pages(void); diff --git a/kernel/fork.c b/kernel/fork.c index 8617a326e9f5..c979605fe806 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -744,7 +744,7 @@ void __init __weak arch_task_cache_init(void) { } static void set_max_threads(unsigned int max_threads_suggested) { u64 threads; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); /* * The number of threads shall be limited such that the thread diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 7e967ca98d92..d7140447be75 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -152,7 +152,7 @@ int sanity_check_segment_list(struct kimage *image) int i; unsigned long nr_segments = image->nr_segments; unsigned long total_pages = 0; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); /* * Verify we have good destination addresses. The caller is diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b0308a2c6000..640b2034edd6 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -105,7 +105,7 @@ unsigned long image_size; void __init hibernate_image_size_init(void) { - image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; + image_size = ((totalram_pages() * 2) / 5) * PAGE_SIZE; } /* diff --git a/mm/highmem.c b/mm/highmem.c index 59db3223a5d6..107b10f9878e 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -105,9 +105,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) } #endif -unsigned long totalhigh_pages __read_mostly; -EXPORT_SYMBOL(totalhigh_pages); - +atomic_long_t _totalhigh_pages __read_mostly; +EXPORT_SYMBOL(_totalhigh_pages); EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e84a10b0d310..da6682bb69aa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -420,7 +420,7 @@ static int __init hugepage_init(void) * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { + if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { transparent_hugepage_flags = 0; return 0; } diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 57334ef2d7ef..978bc4a3eb51 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -237,7 +237,7 @@ void quarantine_reduce(void) * Update quarantine size in case of hotplug. Allocate a fraction of * the installed memory to quarantine minus per-cpu queue limits. */ - total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + total_size = (totalram_pages() << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); new_quarantine_size = (total_size < percpu_quarantines) ? diff --git a/mm/memblock.c b/mm/memblock.c index 0068f87af1e8..a53d8697612c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1576,7 +1576,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) for (; cursor < end; cursor++) { memblock_free_pages(pfn_to_page(cursor), cursor, 0); - totalram_pages++; + totalram_pages_inc(); } } @@ -1978,7 +1978,7 @@ unsigned long __init memblock_free_all(void) reset_all_zones_managed_pages(); pages = free_low_memory_core_early(); - totalram_pages += pages; + totalram_pages_add(pages); return pages; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 6838a530789b..33917105a3a2 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -146,7 +146,7 @@ static void __meminit mm_compute_batch(void) s32 batch = max_t(s32, nr*2, 32); /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ - memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); + memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff); vm_committed_as_batch = max_t(s32, memsized_batch, batch); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 6589f60d5018..21d487749e1d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -269,7 +269,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) } /* Default to all available memory */ - oc->totalpages = totalram_pages + total_swap_pages; + oc->totalpages = totalram_pages() + total_swap_pages; if (!IS_ENABLED(CONFIG_NUMA)) return CONSTRAINT_NONE; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4b5c4ff68f18..eb2027892ef9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -124,7 +125,8 @@ EXPORT_SYMBOL(node_states); /* Protect totalram_pages and zone->managed_pages */ static DEFINE_SPINLOCK(managed_page_count_lock); -unsigned long totalram_pages __read_mostly; +atomic_long_t _totalram_pages __read_mostly; +EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; @@ -4747,11 +4749,11 @@ EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) { - val->totalram = totalram_pages; + val->totalram = totalram_pages(); val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_zone_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); - val->totalhigh = totalhigh_pages; + val->totalhigh = totalhigh_pages(); val->freehigh = nr_free_highpages(); val->mem_unit = PAGE_SIZE; } @@ -7077,10 +7079,10 @@ void adjust_managed_page_count(struct page *page, long count) { spin_lock(&managed_page_count_lock); atomic_long_add(count, &page_zone(page)->managed_pages); - totalram_pages += count; + totalram_pages_add(count); #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) - totalhigh_pages += count; + totalhigh_pages_add(count); #endif spin_unlock(&managed_page_count_lock); } @@ -7123,9 +7125,9 @@ EXPORT_SYMBOL(free_reserved_area); void free_highmem_page(struct page *page) { __free_reserved_page(page); - totalram_pages++; + totalram_pages_inc(); atomic_long_inc(&page_zone(page)->managed_pages); - totalhigh_pages++; + totalhigh_pages_inc(); } #endif @@ -7174,10 +7176,10 @@ void __init mem_init_print_info(const char *str) physpages << (PAGE_SHIFT - 10), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), totalcma_pages << (PAGE_SHIFT - 10), #ifdef CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT - 10), + totalhigh_pages() << (PAGE_SHIFT - 10), #endif str ? ", " : "", str ? str : ""); } diff --git a/mm/shmem.c b/mm/shmem.c index b1f0f54470fb..6ece1e2fe76e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -109,13 +109,14 @@ struct shmem_falloc { #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { - return totalram_pages / 2; + return totalram_pages() / 2; } static unsigned long shmem_default_max_inodes(void) { - unsigned long nr_pages = totalram_pages; - return min(nr_pages - totalhigh_pages, nr_pages / 2); + unsigned long nr_pages = totalram_pages(); + + return min(nr_pages - totalhigh_pages(), nr_pages / 2); } #endif @@ -3302,7 +3303,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, size = memparse(value,&rest); if (*rest == '%') { size <<= PAGE_SHIFT; - size *= totalram_pages; + size *= totalram_pages(); do_div(size, 100); rest++; } diff --git a/mm/slab.c b/mm/slab.c index 01991060714c..73fe23e649c9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1235,7 +1235,7 @@ void __init kmem_cache_init(void) * page orders on machines with more than 32MB of memory if * not overridden on the command line. */ - if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) + if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT) slab_max_order = SLAB_MAX_ORDER_HI; /* Bootstrap is tricky, because several objects are allocated diff --git a/mm/swap.c b/mm/swap.c index 5d786019eab9..4d8a1f1afaab 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1022,7 +1022,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); */ void __init swap_setup(void) { - unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); + unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) diff --git a/mm/util.c b/mm/util.c index 8bf08b5b5760..4df23d64aac7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -593,7 +593,7 @@ unsigned long vm_commit_limit(void) if (sysctl_overcommit_kbytes) allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); else - allowed = ((totalram_pages - hugetlb_total_pages()) + allowed = ((totalram_pages() - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100); allowed += total_swap_pages; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 97d4b25d0373..871e41c55e23 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1634,7 +1634,7 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); - if (count > totalram_pages) + if (count > totalram_pages()) return NULL; size = (unsigned long)count << PAGE_SHIFT; @@ -1739,7 +1739,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long real_size = size; size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > totalram_pages) + if (!size || (size >> PAGE_SHIFT) > totalram_pages()) goto fail; area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | diff --git a/mm/workingset.c b/mm/workingset.c index d46f8c92aa2f..dcb994f2acc2 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -549,7 +549,7 @@ static int __init workingset_init(void) * double the initial memory by using totalram_pages as-is. */ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; - max_order = fls_long(totalram_pages - 1); + max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", diff --git a/mm/zswap.c b/mm/zswap.c index cd91fd9d96b8..a4e4d36ec085 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -219,8 +219,8 @@ static const struct zpool_ops zswap_zpool_ops = { static bool zswap_is_full(void) { - return totalram_pages * zswap_max_pool_percent / 100 < - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); + return totalram_pages() * zswap_max_pool_percent / 100 < + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } static void zswap_update_total_size(void) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index ff727ff61b5b..0e2f71ab8367 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1131,7 +1131,7 @@ EXPORT_SYMBOL_GPL(dccp_debug); static int __init dccp_init(void) { unsigned long goal; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); int ehash_order, bhash_order, i; int rc; diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 1c002c0fb712..950613ee7881 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1866,7 +1866,7 @@ void __init dn_route_init(void) dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; add_timer(&dn_route_timer); - goal = totalram_pages >> (26 - PAGE_SHIFT); + goal = totalram_pages() >> (26 - PAGE_SHIFT); for(order = 0; (1UL << order) < goal; order++) /* NOTHING */; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 03b51cdcc731..b467a7cabf40 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1000,7 +1000,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) slots = tcpmhash_entries; if (!slots) { - if (totalram_pages >= 128 * 1024) + if (totalram_pages() >= 128 * 1024) slots = 16 * 1024; else slots = 8 * 1024; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5eb990830348..741b533148ba 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2248,7 +2248,7 @@ static __always_inline unsigned int total_extension_size(void) int nf_conntrack_init_start(void) { - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); int max_factor = 8; int ret = -ENOMEM; int i; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 88b520ba2abc..8d86e39d6280 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -274,7 +274,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, struct xt_hashlimit_htable *hinfo; const struct seq_operations *ops; unsigned int size, i; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); int ret; if (cfg->size) { diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index a5b24182b3cc..d5878ae55840 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1368,7 +1368,7 @@ static __init int sctp_init(void) int status = -EINVAL; unsigned long goal; unsigned long limit; - unsigned long nr_pages = totalram_pages; + unsigned long nr_pages = totalram_pages(); int max_share; int order; int num_entries; diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c index 16bd18747cfa..d6f32807b347 100644 --- a/security/integrity/ima/ima_kexec.c +++ b/security/integrity/ima/ima_kexec.c @@ -106,7 +106,7 @@ void ima_add_kexec_buffer(struct kimage *image) kexec_segment_size = ALIGN(ima_get_binary_runtime_size() + PAGE_SIZE / 2, PAGE_SIZE); if ((kexec_segment_size == ULONG_MAX) || - ((kexec_segment_size >> PAGE_SHIFT) > totalram_pages / 2)) { + ((kexec_segment_size >> PAGE_SHIFT) > totalram_pages() / 2)) { pr_err("Binary measurement list too large.\n"); return; } -- cgit From 808153e1187fa77ac7d7dad261ff476888dcf398 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 28 Dec 2018 00:34:50 -0800 Subject: mm, devm_memremap_pages: mark devm_memremap_pages() EXPORT_SYMBOL_GPL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit devm_memremap_pages() is a facility that can create struct page entries for any arbitrary range and give drivers the ability to subvert core aspects of page management. Specifically the facility is tightly integrated with the kernel's memory hotplug functionality. It injects an altmap argument deep into the architecture specific vmemmap implementation to allow allocating from specific reserved pages, and it has Linux specific assumptions about page structure reference counting relative to get_user_pages() and get_user_pages_fast(). It was an oversight and a mistake that this was not marked EXPORT_SYMBOL_GPL from the outset. Again, devm_memremap_pagex() exposes and relies upon core kernel internal assumptions and will continue to evolve along with 'struct page', memory hotplug, and support for new memory types / topologies. Only an in-kernel GPL-only driver is expected to keep up with this ongoing evolution. This interface, and functionality derived from this interface, is not suitable for kernel-external drivers. Link: http://lkml.kernel.org/r/154275557457.76910.16923571232582744134.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Christoph Hellwig Acked-by: Michal Hocko Cc: "Jérôme Glisse" Cc: Balbir Singh Cc: Logan Gunthorpe Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 2 +- tools/testing/nvdimm/test/iomap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 9eced2cc9f94..61dbcaa95530 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -233,7 +233,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_array: return ERR_PTR(error); } -EXPORT_SYMBOL(devm_memremap_pages); +EXPORT_SYMBOL_GPL(devm_memremap_pages); unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index ff9d3a5825e1..ed18a0cbc0c8 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -113,7 +113,7 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) return nfit_res->buf + offset - nfit_res->res.start; return devm_memremap_pages(dev, pgmap); } -EXPORT_SYMBOL(__wrap_devm_memremap_pages); +EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages); pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags) { -- cgit From 06489cfbd915ff36c8e36df27f1c2dc60f97ca56 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 28 Dec 2018 00:34:54 -0800 Subject: mm, devm_memremap_pages: kill mapping "System RAM" support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Given the fact that devm_memremap_pages() requires a percpu_ref that is torn down by devm_memremap_pages_release() the current support for mapping RAM is broken. Support for remapping "System RAM" has been broken since the beginning and there is no existing user of this this code path, so just kill the support and make it an explicit error. This cleanup also simplifies a follow-on patch to fix the error path when setting a devm release action for devm_memremap_pages_release() fails. Link: http://lkml.kernel.org/r/154275557997.76910.14689813630968180480.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: "Jérôme Glisse" Reviewed-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Cc: Balbir Singh Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 61dbcaa95530..99d14940acfa 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -167,15 +167,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) is_ram = region_intersects(align_start, align_size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); - if (is_ram == REGION_MIXED) { - WARN_ONCE(1, "%s attempted on mixed region %pr\n", - __func__, res); + if (is_ram != REGION_DISJOINT) { + WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__, + is_ram == REGION_MIXED ? "mixed" : "ram", res); return ERR_PTR(-ENXIO); } - if (is_ram == REGION_INTERSECTS) - return __va(res->start); - if (!pgmap->ref) return ERR_PTR(-EINVAL); -- cgit From a95c90f1e2c253b280385ecf3d4ebfe476926b28 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 28 Dec 2018 00:34:57 -0800 Subject: mm, devm_memremap_pages: fix shutdown handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The last step before devm_memremap_pages() returns success is to allocate a release action, devm_memremap_pages_release(), to tear the entire setup down. However, the result from devm_add_action() is not checked. Checking the error from devm_add_action() is not enough. The api currently relies on the fact that the percpu_ref it is using is killed by the time the devm_memremap_pages_release() is run. Rather than continue this awkward situation, offload the responsibility of killing the percpu_ref to devm_memremap_pages_release() directly. This allows devm_memremap_pages() to do the right thing relative to init failures and shutdown. Without this change we could fail to register the teardown of devm_memremap_pages(). The likelihood of hitting this failure is tiny as small memory allocations almost always succeed. However, the impact of the failure is large given any future reconfiguration, or disable/enable, of an nvdimm namespace will fail forever as subsequent calls to devm_memremap_pages() will fail to setup the pgmap_radix since there will be stale entries for the physical address range. An argument could be made to require that the ->kill() operation be set in the @pgmap arg rather than passed in separately. However, it helps code readability, tracking the lifetime of a given instance, to be able to grep the kill routine directly at the devm_memremap_pages() call site. Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...") Reviewed-by: "Jérôme Glisse" Reported-by: Logan Gunthorpe Reviewed-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Cc: Balbir Singh Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/pmem.c | 14 +++----------- drivers/nvdimm/pmem.c | 13 +++++-------- include/linux/memremap.h | 2 ++ kernel/memremap.c | 30 ++++++++++++++---------------- tools/testing/nvdimm/test/iomap.c | 15 ++++++++++++++- 5 files changed, 38 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 99e2aace8078..2c1f459c0c63 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -48,9 +48,8 @@ static void dax_pmem_percpu_exit(void *data) percpu_ref_exit(ref); } -static void dax_pmem_percpu_kill(void *data) +static void dax_pmem_percpu_kill(struct percpu_ref *ref) { - struct percpu_ref *ref = data; struct dax_pmem *dax_pmem = to_dax_pmem(ref); dev_dbg(dax_pmem->dev, "trace\n"); @@ -112,17 +111,10 @@ static int dax_pmem_probe(struct device *dev) } dax_pmem->pgmap.ref = &dax_pmem->ref; + dax_pmem->pgmap.kill = dax_pmem_percpu_kill; addr = devm_memremap_pages(dev, &dax_pmem->pgmap); - if (IS_ERR(addr)) { - devm_remove_action(dev, dax_pmem_percpu_exit, &dax_pmem->ref); - percpu_ref_exit(&dax_pmem->ref); + if (IS_ERR(addr)) return PTR_ERR(addr); - } - - rc = devm_add_action_or_reset(dev, dax_pmem_percpu_kill, - &dax_pmem->ref); - if (rc) - return rc; /* adjust the dax_region resource to the start of data */ memcpy(&res, &dax_pmem->pgmap.res, sizeof(res)); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 0e39e3d1846f..d28418b05a04 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -309,8 +309,11 @@ static void pmem_release_queue(void *q) blk_cleanup_queue(q); } -static void pmem_freeze_queue(void *q) +static void pmem_freeze_queue(struct percpu_ref *ref) { + struct request_queue *q; + + q = container_of(ref, typeof(*q), q_usage_counter); blk_freeze_queue_start(q); } @@ -402,6 +405,7 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags = PFN_DEV; pmem->pgmap.ref = &q->q_usage_counter; + pmem->pgmap.kill = pmem_freeze_queue; if (is_nd_pfn(dev)) { if (setup_pagemap_fsdax(dev, &pmem->pgmap)) return -ENOMEM; @@ -427,13 +431,6 @@ static int pmem_attach_disk(struct device *dev, memcpy(&bb_res, &nsio->res, sizeof(bb_res)); } - /* - * At release time the queue must be frozen before - * devm_memremap_pages is unwound - */ - if (devm_add_action_or_reset(dev, pmem_freeze_queue, q)) - return -ENOMEM; - if (IS_ERR(addr)) return PTR_ERR(addr); pmem->virt_addr = addr; diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 0ac69ddf5fc4..55db66b3716f 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -111,6 +111,7 @@ typedef void (*dev_page_free_t)(struct page *page, void *data); * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping + * @kill: callback to transition @ref to the dead state * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h @@ -122,6 +123,7 @@ struct dev_pagemap { bool altmap_valid; struct resource res; struct percpu_ref *ref; + void (*kill)(struct percpu_ref *ref); struct device *dev; void *data; enum memory_type type; diff --git a/kernel/memremap.c b/kernel/memremap.c index 99d14940acfa..5e45f0c327a5 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -88,14 +88,10 @@ static void devm_memremap_pages_release(void *data) resource_size_t align_start, align_size; unsigned long pfn; + pgmap->kill(pgmap->ref); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - if (percpu_ref_tryget_live(pgmap->ref)) { - dev_WARN(dev, "%s: page mapping is still live!\n", __func__); - percpu_ref_put(pgmap->ref); - } - /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -116,7 +112,7 @@ static void devm_memremap_pages_release(void *data) /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res - * @pgmap: pointer to a struct dev_pgmap + * @pgmap: pointer to a struct dev_pagemap * * Notes: * 1/ At a minimum the res, ref and type members of @pgmap must be initialized @@ -125,11 +121,8 @@ static void devm_memremap_pages_release(void *data) * 2/ The altmap field may optionally be initialized, in which case altmap_valid * must be set to true * - * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages() - * time (or devm release event). The expected order of events is that ref has - * been through percpu_ref_kill() before devm_memremap_pages_release(). The - * wait for the completion of all references being dropped and - * percpu_ref_exit() must occur after devm_memremap_pages_release(). + * 3/ pgmap->ref must be 'live' on entry and will be killed at + * devm_memremap_pages_release() time, or if this routine fails. * * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -145,6 +138,9 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; + if (!pgmap->ref || !pgmap->kill) + return ERR_PTR(-EINVAL); + align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - align_start; @@ -170,12 +166,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (is_ram != REGION_DISJOINT) { WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__, is_ram == REGION_MIXED ? "mixed" : "ram", res); - return ERR_PTR(-ENXIO); + error = -ENXIO; + goto err_array; } - if (!pgmap->ref) - return ERR_PTR(-EINVAL); - pgmap->dev = dev; error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), @@ -217,7 +211,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) align_size >> PAGE_SHIFT, pgmap); percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); - devm_add_action(dev, devm_memremap_pages_release, pgmap); + error = devm_add_action_or_reset(dev, devm_memremap_pages_release, + pgmap); + if (error) + return ERR_PTR(error); return __va(res->start); @@ -228,6 +225,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: + pgmap->kill(pgmap->ref); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index ed18a0cbc0c8..c6635fee27d8 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -104,13 +104,26 @@ void *__wrap_devm_memremap(struct device *dev, resource_size_t offset, } EXPORT_SYMBOL(__wrap_devm_memremap); +static void nfit_test_kill(void *_pgmap) +{ + struct dev_pagemap *pgmap = _pgmap; + + pgmap->kill(pgmap->ref); +} + void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { resource_size_t offset = pgmap->res.start; struct nfit_test_resource *nfit_res = get_nfit_res(offset); - if (nfit_res) + if (nfit_res) { + int rc; + + rc = devm_add_action_or_reset(dev, nfit_test_kill, pgmap); + if (rc) + return ERR_PTR(rc); return nfit_res->buf + offset - nfit_res->res.start; + } return devm_memremap_pages(dev, pgmap); } EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages); -- cgit From 69324b8f48339de2f90fdf2f774687fc6c47629a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 28 Dec 2018 00:35:01 -0800 Subject: mm, devm_memremap_pages: add MEMORY_DEVICE_PRIVATE support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for consolidating all ZONE_DEVICE enabling via devm_memremap_pages(), teach it how to handle the constraints of MEMORY_DEVICE_PRIVATE ranges. [jglisse@redhat.com: call move_pfn_range_to_zone for MEMORY_DEVICE_PRIVATE] Link: http://lkml.kernel.org/r/154275559036.76910.12434636179931292607.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Jérôme Glisse Acked-by: Christoph Hellwig Reported-by: Logan Gunthorpe Reviewed-by: Logan Gunthorpe Cc: Balbir Singh Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 53 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 5e45f0c327a5..3eef989ef035 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -98,9 +98,15 @@ static void devm_memremap_pages_release(void *data) - align_start; mem_hotplug_begin(); - arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? - &pgmap->altmap : NULL); - kasan_remove_zero_shadow(__va(align_start), align_size); + if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + pfn = align_start >> PAGE_SHIFT; + __remove_pages(page_zone(pfn_to_page(pfn)), pfn, + align_size >> PAGE_SHIFT, NULL); + } else { + arch_remove_memory(align_start, align_size, + pgmap->altmap_valid ? &pgmap->altmap : NULL); + kasan_remove_zero_shadow(__va(align_start), align_size); + } mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); @@ -187,17 +193,40 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_pfn_remap; mem_hotplug_begin(); - error = kasan_add_zero_shadow(__va(align_start), align_size); - if (error) { - mem_hotplug_done(); - goto err_kasan; + + /* + * For device private memory we call add_pages() as we only need to + * allocate and initialize struct page for the device memory. More- + * over the device memory is un-accessible thus we do not want to + * create a linear mapping for the memory like arch_add_memory() + * would do. + * + * For all other device memory types, which are accessible by + * the CPU, we do want the linear mapping and thus use + * arch_add_memory(). + */ + if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + error = add_pages(nid, align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, NULL, false); + } else { + error = kasan_add_zero_shadow(__va(align_start), align_size); + if (error) { + mem_hotplug_done(); + goto err_kasan; + } + + error = arch_add_memory(nid, align_start, align_size, altmap, + false); + } + + if (!error) { + struct zone *zone; + + zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; + move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, altmap); } - error = arch_add_memory(nid, align_start, align_size, altmap, false); - if (!error) - move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], - align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, altmap); mem_hotplug_done(); if (error) goto err_add_memory; -- cgit From 1c30844d2dfe272d58c8fc000960b835d13aa2ac Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 28 Dec 2018 00:35:52 -0800 Subject: mm: reclaim small amounts of memory when an external fragmentation event occurs An external fragmentation event was previously described as When the page allocator fragments memory, it records the event using the mm_page_alloc_extfrag event. If the fallback_order is smaller than a pageblock order (order-9 on 64-bit x86) then it's considered an event that will cause external fragmentation issues in the future. The kernel reduces the probability of such events by increasing the watermark sizes by calling set_recommended_min_free_kbytes early in the lifetime of the system. This works reasonably well in general but if there are enough sparsely populated pageblocks then the problem can still occur as enough memory is free overall and kswapd stays asleep. This patch introduces a watermark_boost_factor sysctl that allows a zone watermark to be temporarily boosted when an external fragmentation causing events occurs. The boosting will stall allocations that would decrease free memory below the boosted low watermark and kswapd is woken if the calling context allows to reclaim an amount of memory relative to the size of the high watermark and the watermark_boost_factor until the boost is cleared. When kswapd finishes, it wakes kcompactd at the pageblock order to clean some of the pageblocks that may have been affected by the fragmentation event. kswapd avoids any writeback, slab shrinkage and swap from reclaim context during this operation to avoid excessive system disruption in the name of fragmentation avoidance. Care is taken so that kswapd will do normal reclaim work if the system is really low on memory. This was evaluated using the same workloads as "mm, page_alloc: Spread allocations across zones before introducing fragmentation". 1-socket Skylake machine config-global-dhp__workload_thpfioscale XFS (no special madvise) 4 fio threads, 1 THP allocating thread -------------------------------------- 4.20-rc3 extfrag events < order 9: 804694 4.20-rc3+patch: 408912 (49% reduction) 4.20-rc3+patch1-4: 18421 (98% reduction) 4.20.0-rc3 4.20.0-rc3 lowzone-v5r8 boost-v5r8 Amean fault-base-1 653.58 ( 0.00%) 652.71 ( 0.13%) Amean fault-huge-1 0.00 ( 0.00%) 178.93 * -99.00%* 4.20.0-rc3 4.20.0-rc3 lowzone-v5r8 boost-v5r8 Percentage huge-1 0.00 ( 0.00%) 5.12 ( 100.00%) Note that external fragmentation causing events are massively reduced by this path whether in comparison to the previous kernel or the vanilla kernel. The fault latency for huge pages appears to be increased but that is only because THP allocations were successful with the patch applied. 1-socket Skylake machine global-dhp__workload_thpfioscale-madvhugepage-xfs (MADV_HUGEPAGE) ----------------------------------------------------------------- 4.20-rc3 extfrag events < order 9: 291392 4.20-rc3+patch: 191187 (34% reduction) 4.20-rc3+patch1-4: 13464 (95% reduction) thpfioscale Fault Latencies 4.20.0-rc3 4.20.0-rc3 lowzone-v5r8 boost-v5r8 Min fault-base-1 912.00 ( 0.00%) 905.00 ( 0.77%) Min fault-huge-1 127.00 ( 0.00%) 135.00 ( -6.30%) Amean fault-base-1 1467.55 ( 0.00%) 1481.67 ( -0.96%) Amean fault-huge-1 1127.11 ( 0.00%) 1063.88 * 5.61%* 4.20.0-rc3 4.20.0-rc3 lowzone-v5r8 boost-v5r8 Percentage huge-1 77.64 ( 0.00%) 83.46 ( 7.49%) As before, massive reduction in external fragmentation events, some jitter on latencies and an increase in THP allocation success rates. 2-socket Haswell machine config-global-dhp__workload_thpfioscale XFS (no special madvise) 4 fio threads, 5 THP allocating threads ---------------------------------------------------------------- 4.20-rc3 extfrag events < order 9: 215698 4.20-rc3+patch: 200210 (7% reduction) 4.20-rc3+patch1-4: 14263 (93% reduction) 4.20.0-rc3 4.20.0-rc3 lowzone-v5r8 boost-v5r8 Amean fault-base-5 1346.45 ( 0.00%) 1306.87 ( 2.94%) Amean fault-huge-5 3418.60 ( 0.00%) 1348.94 ( 60.54%) 4.20.0-rc3 4.20.0-rc3 lowzone-v5r8 boost-v5r8 Percentage huge-5 0.78 ( 0.00%) 7.91 ( 910.64%) There is a 93% reduction in fragmentation causing events, there is a big reduction in the huge page fault latency and allocation success rate is higher. 2-socket Haswell machine global-dhp__workload_thpfioscale-madvhugepage-xfs (MADV_HUGEPAGE) ----------------------------------------------------------------- 4.20-rc3 extfrag events < order 9: 166352 4.20-rc3+patch: 147463 (11% reduction) 4.20-rc3+patch1-4: 11095 (93% reduction) thpfioscale Fault Latencies 4.20.0-rc3 4.20.0-rc3 lowzone-v5r8 boost-v5r8 Amean fault-base-5 6217.43 ( 0.00%) 7419.67 * -19.34%* Amean fault-huge-5 3163.33 ( 0.00%) 3263.80 ( -3.18%) 4.20.0-rc3 4.20.0-rc3 lowzone-v5r8 boost-v5r8 Percentage huge-5 95.14 ( 0.00%) 87.98 ( -7.53%) There is a large reduction in fragmentation events with some jitter around the latencies and success rates. As before, the high THP allocation success rate does mean the system is under a lot of pressure. However, as the fragmentation events are reduced, it would be expected that the long-term allocation success rate would be higher. Link: http://lkml.kernel.org/r/20181123114528.28802-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: David Rientjes Cc: Michal Hocko Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 21 +++++++ include/linux/mm.h | 1 + include/linux/mmzone.h | 11 ++-- kernel/sysctl.c | 8 +++ mm/page_alloc.c | 43 +++++++++++++- mm/vmscan.c | 133 +++++++++++++++++++++++++++++++++++++++++--- 6 files changed, 202 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 7d73882e2c27..187ce4f599a2 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -63,6 +63,7 @@ Currently, these files are in /proc/sys/vm: - swappiness - user_reserve_kbytes - vfs_cache_pressure +- watermark_boost_factor - watermark_scale_factor - zone_reclaim_mode @@ -856,6 +857,26 @@ ten times more freeable objects than there are. ============================================================= +watermark_boost_factor: + +This factor controls the level of reclaim when memory is being fragmented. +It defines the percentage of the high watermark of a zone that will be +reclaimed if pages of different mobility are being mixed within pageblocks. +The intent is that compaction has less work to do in the future and to +increase the success rate of future high-order allocations such as SLUB +allocations, THP and hugetlbfs pages. + +To make it sensible with respect to the watermark_scale_factor parameter, +the unit is in fractions of 10,000. The default value of 15,000 means +that up to 150% of the high watermark will be reclaimed in the event of +a pageblock being mixed due to fragmentation. The level of reclaim is +determined by the number of fragmentation events that occurred in the +recent past. If this value is smaller than a pageblock then a pageblocks +worth of pages will be reclaimed (e.g. 2MB on 64-bit x86). A boost factor +of 0 will disable the feature. + +============================================================= + watermark_scale_factor: This factor controls the aggressiveness of kswapd. It defines the diff --git a/include/linux/mm.h b/include/linux/mm.h index 1d2be4c2d34a..031b2ce983f9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2256,6 +2256,7 @@ extern void zone_pcp_reset(struct zone *zone); /* page_alloc.c */ extern int min_free_kbytes; +extern int watermark_boost_factor; extern int watermark_scale_factor; /* nommu.c */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dcf1b66a96ab..5b4bfb90fb94 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -269,10 +269,10 @@ enum zone_watermarks { NR_WMARK }; -#define min_wmark_pages(z) (z->_watermark[WMARK_MIN]) -#define low_wmark_pages(z) (z->_watermark[WMARK_LOW]) -#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH]) -#define wmark_pages(z, i) (z->_watermark[i]) +#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) +#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) +#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) +#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) struct per_cpu_pages { int count; /* number of pages in the list */ @@ -364,6 +364,7 @@ struct zone { /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; + unsigned long watermark_boost; unsigned long nr_reserved_highatomic; @@ -890,6 +891,8 @@ static inline int is_highmem(struct zone *zone) struct ctl_table; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +int watermark_boost_factor_sysctl_handler(struct ctl_table *, int, + void __user *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5fc724e4e454..1825f712e73b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1462,6 +1462,14 @@ static struct ctl_table vm_table[] = { .proc_handler = min_free_kbytes_sysctl_handler, .extra1 = &zero, }, + { + .procname = "watermark_boost_factor", + .data = &watermark_boost_factor, + .maxlen = sizeof(watermark_boost_factor), + .mode = 0644, + .proc_handler = watermark_boost_factor_sysctl_handler, + .extra1 = &zero, + }, { .procname = "watermark_scale_factor", .data = &watermark_scale_factor, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 32b3e121a388..80373eca453d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -262,6 +262,7 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +int watermark_boost_factor __read_mostly = 15000; int watermark_scale_factor = 10; static unsigned long nr_kernel_pages __meminitdata; @@ -2129,6 +2130,21 @@ static bool can_steal_fallback(unsigned int order, int start_mt) return false; } +static inline void boost_watermark(struct zone *zone) +{ + unsigned long max_boost; + + if (!watermark_boost_factor) + return; + + max_boost = mult_frac(zone->_watermark[WMARK_HIGH], + watermark_boost_factor, 10000); + max_boost = max(pageblock_nr_pages, max_boost); + + zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, + max_boost); +} + /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this @@ -2138,7 +2154,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt) * itself, so pages freed in the future will be put on the correct free list. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, - int start_type, bool whole_block) + unsigned int alloc_flags, int start_type, bool whole_block) { unsigned int current_order = page_order(page); struct free_area *area; @@ -2160,6 +2176,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, goto single_page; } + /* + * Boost watermarks to increase reclaim pressure to reduce the + * likelihood of future fallbacks. Wake kswapd now as the node + * may be balanced overall and kswapd will not wake naturally. + */ + boost_watermark(zone); + if (alloc_flags & ALLOC_KSWAPD) + wakeup_kswapd(zone, 0, 0, zone_idx(zone)); + /* We are not allowed to try stealing from the whole block */ if (!whole_block) goto single_page; @@ -2443,7 +2468,8 @@ do_steal: page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - steal_suitable_fallback(zone, page, start_migratetype, can_steal); + steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, + can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); @@ -7454,6 +7480,7 @@ static void __setup_per_zone_wmarks(void) zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; + zone->watermark_boost = 0; spin_unlock_irqrestore(&zone->lock, flags); } @@ -7554,6 +7581,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } +int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + return 0; +} + int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 24ab1f7394ab..bd8971a29204 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -88,6 +88,9 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; + /* e.g. boosted watermark reclaim leaves slabs alone */ + unsigned int may_shrinkslab:1; + /* * Cgroups are not reclaimed below their configured memory.low, * unless we threaten to OOM. If any cgroups are skipped due to @@ -2756,8 +2759,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) shrink_node_memcg(pgdat, memcg, sc, &lru_pages); node_lru_pages += lru_pages; - shrink_slab(sc->gfp_mask, pgdat->node_id, + if (sc->may_shrinkslab) { + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + } /* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -3239,6 +3244,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, + .may_shrinkslab = 1, }; /* @@ -3283,6 +3289,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, .may_unmap = 1, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, + .may_shrinkslab = 1, }; unsigned long lru_pages; @@ -3329,6 +3336,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = may_swap, + .may_shrinkslab = 1, }; /* @@ -3379,6 +3387,30 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } +static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) +{ + int i; + struct zone *zone; + + /* + * Check for watermark boosts top-down as the higher zones + * are more likely to be boosted. Both watermarks and boosts + * should not be checked at the time time as reclaim would + * start prematurely when there is no boosting and a lower + * zone is balanced. + */ + for (i = classzone_idx; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; + + if (zone->watermark_boost) + return true; + } + + return false; +} + /* * Returns true if there is an eligible zone balanced for the request order * and classzone_idx @@ -3389,6 +3421,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) unsigned long mark = -1; struct zone *zone; + /* + * Check watermarks bottom-up as lower zones are more likely to + * meet watermarks. + */ for (i = 0; i <= classzone_idx; i++) { zone = pgdat->node_zones + i; @@ -3517,14 +3553,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long pflags; + unsigned long nr_boost_reclaim; + unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; + bool boosted; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .order = order, - .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, .may_unmap = 1, - .may_swap = 1, }; psi_memstall_enter(&pflags); @@ -3532,9 +3568,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) count_vm_event(PAGEOUTRUN); + /* + * Account for the reclaim boost. Note that the zone boost is left in + * place so that parallel allocations that are near the watermark will + * stall or direct reclaim until kswapd is finished. + */ + nr_boost_reclaim = 0; + for (i = 0; i <= classzone_idx; i++) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; + + nr_boost_reclaim += zone->watermark_boost; + zone_boosts[i] = zone->watermark_boost; + } + boosted = nr_boost_reclaim; + +restart: + sc.priority = DEF_PRIORITY; do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; + bool balanced; bool ret; sc.reclaim_idx = classzone_idx; @@ -3561,13 +3616,40 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) } /* - * Only reclaim if there are no eligible zones. Note that - * sc.reclaim_idx is not used as buffer_heads_over_limit may - * have adjusted it. + * If the pgdat is imbalanced then ignore boosting and preserve + * the watermarks for a later time and restart. Note that the + * zone watermarks will be still reset at the end of balancing + * on the grounds that the normal reclaim should be enough to + * re-evaluate if boosting is required when kswapd next wakes. + */ + balanced = pgdat_balanced(pgdat, sc.order, classzone_idx); + if (!balanced && nr_boost_reclaim) { + nr_boost_reclaim = 0; + goto restart; + } + + /* + * If boosting is not active then only reclaim if there are no + * eligible zones. Note that sc.reclaim_idx is not used as + * buffer_heads_over_limit may have adjusted it. */ - if (pgdat_balanced(pgdat, sc.order, classzone_idx)) + if (!nr_boost_reclaim && balanced) goto out; + /* Limit the priority of boosting to avoid reclaim writeback */ + if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) + raise_priority = false; + + /* + * Do not writeback or swap pages for boosted reclaim. The + * intent is to relieve pressure not issue sub-optimal IO + * from reclaim context. If no pages are reclaimed, the + * reclaim will be aborted. + */ + sc.may_writepage = !laptop_mode && !nr_boost_reclaim; + sc.may_swap = !nr_boost_reclaim; + sc.may_shrinkslab = !nr_boost_reclaim; + /* * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. All @@ -3619,6 +3701,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * progress in reclaiming pages */ nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); + + /* + * If reclaim made no progress for a boost, stop reclaim as + * IO cannot be queued and it could be an infinite loop in + * extreme circumstances. + */ + if (nr_boost_reclaim && !nr_reclaimed) + break; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); @@ -3627,6 +3719,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) pgdat->kswapd_failures++; out: + /* If reclaim was boosted, account for the reclaim done in this pass */ + if (boosted) { + unsigned long flags; + + for (i = 0; i <= classzone_idx; i++) { + if (!zone_boosts[i]) + continue; + + /* Increments are under the zone lock */ + zone = pgdat->node_zones + i; + spin_lock_irqsave(&zone->lock, flags); + zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); + spin_unlock_irqrestore(&zone->lock, flags); + } + + /* + * As there is now likely space, wakeup kcompact to defragment + * pageblocks. + */ + wakeup_kcompactd(pgdat, pageblock_order, classzone_idx); + } + snapshot_refaults(NULL, pgdat); __fs_reclaim_release(); psi_memstall_leave(&pflags); @@ -3855,7 +3969,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, /* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || - pgdat_balanced(pgdat, order, classzone_idx)) { + (pgdat_balanced(pgdat, order, classzone_idx) && + !pgdat_watermark_boosted(pgdat, classzone_idx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd -- cgit From ef8444ea01d7442652f8e1b8a8b94278cb57eafd Mon Sep 17 00:00:00 2001 From: yuzhoujian Date: Fri, 28 Dec 2018 00:36:07 -0800 Subject: mm, oom: reorganize the oom report in dump_header OOM report contains several sections. The first one is the allocation context that has triggered the OOM. Then we have cpuset context followed by the stack trace of the OOM path. The tird one is the OOM memory information. Followed by the current memory state of all system tasks. At last, we will show oom eligible tasks and the information about the chosen oom victim. One thing that makes parsing more awkward than necessary is that we do not have a single and easily parsable line about the oom context. This patch is reorganizing the oom report to 1) who invoked oom and what was the allocation request [ 515.902945] tuned invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0 2) OOM stack trace [ 515.904273] CPU: 24 PID: 1809 Comm: tuned Not tainted 4.20.0-rc3+ #3 [ 515.905518] Hardware name: Inspur SA5212M4/YZMB-00370-107, BIOS 4.1.10 11/14/2016 [ 515.906821] Call Trace: [ 515.908062] dump_stack+0x5a/0x73 [ 515.909311] dump_header+0x55/0x28c [ 515.914260] oom_kill_process+0x2d8/0x300 [ 515.916708] out_of_memory+0x145/0x4a0 [ 515.917932] __alloc_pages_slowpath+0x7d2/0xa16 [ 515.919157] __alloc_pages_nodemask+0x277/0x290 [ 515.920367] filemap_fault+0x3d0/0x6c0 [ 515.921529] ? filemap_map_pages+0x2b8/0x420 [ 515.922709] ext4_filemap_fault+0x2c/0x40 [ext4] [ 515.923884] __do_fault+0x20/0x80 [ 515.925032] __handle_mm_fault+0xbc0/0xe80 [ 515.926195] handle_mm_fault+0xfa/0x210 [ 515.927357] __do_page_fault+0x233/0x4c0 [ 515.928506] do_page_fault+0x32/0x140 [ 515.929646] ? page_fault+0x8/0x30 [ 515.930770] page_fault+0x1e/0x30 3) OOM memory information [ 515.958093] Mem-Info: [ 515.959647] active_anon:26501758 inactive_anon:1179809 isolated_anon:0 active_file:4402672 inactive_file:483963 isolated_file:1344 unevictable:0 dirty:4886753 writeback:0 unstable:0 slab_reclaimable:148442 slab_unreclaimable:18741 mapped:1347 shmem:1347 pagetables:58669 bounce:0 free:88663 free_pcp:0 free_cma:0 ... 4) current memory state of all system tasks [ 516.079544] [ 744] 0 744 9211 1345 114688 82 0 systemd-journal [ 516.082034] [ 787] 0 787 31764 0 143360 92 0 lvmetad [ 516.084465] [ 792] 0 792 10930 1 110592 208 -1000 systemd-udevd [ 516.086865] [ 1199] 0 1199 13866 0 131072 112 -1000 auditd [ 516.089190] [ 1222] 0 1222 31990 1 110592 157 0 smartd [ 516.091477] [ 1225] 0 1225 4864 85 81920 43 0 irqbalance [ 516.093712] [ 1226] 0 1226 52612 0 258048 426 0 abrtd [ 516.112128] [ 1280] 0 1280 109774 55 299008 400 0 NetworkManager [ 516.113998] [ 1295] 0 1295 28817 37 69632 24 0 ksmtuned [ 516.144596] [ 10718] 0 10718 2622484 1721372 15998976 267219 0 panic [ 516.145792] [ 10719] 0 10719 2622484 1164767 9818112 53576 0 panic [ 516.146977] [ 10720] 0 10720 2622484 1174361 9904128 53709 0 panic [ 516.148163] [ 10721] 0 10721 2622484 1209070 10194944 54824 0 panic [ 516.149329] [ 10722] 0 10722 2622484 1745799 14774272 91138 0 panic 5) oom context (contrains and the chosen victim). oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0-1,task=panic,pid=10737,uid=0 An admin can easily get the full oom context at a single line which makes parsing much easier. Link: http://lkml.kernel.org/r/1542799799-36184-1-git-send-email-ufo19890607@gmail.com Signed-off-by: yuzhoujian Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: David Rientjes Cc: "Kirill A . Shutemov" Cc: Roman Gushchin Cc: Tetsuo Handa Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 10 ++++++++++ kernel/cgroup/cpuset.c | 4 ++-- mm/oom_kill.c | 29 ++++++++++++++++++++--------- mm/page_alloc.c | 4 ++-- 4 files changed, 34 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/oom.h b/include/linux/oom.h index 69864a547663..d07992009265 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -15,6 +15,13 @@ struct notifier_block; struct mem_cgroup; struct task_struct; +enum oom_constraint { + CONSTRAINT_NONE, + CONSTRAINT_CPUSET, + CONSTRAINT_MEMORY_POLICY, + CONSTRAINT_MEMCG, +}; + /* * Details of the page allocation that triggered the oom killer that are used to * determine what should be killed. @@ -42,6 +49,9 @@ struct oom_control { unsigned long totalpages; struct task_struct *chosen; unsigned long chosen_points; + + /* Used to print the constraint info. */ + enum oom_constraint constraint; }; extern struct mutex oom_lock; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 266f10cb7222..9510a5b32eaf 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2666,9 +2666,9 @@ void cpuset_print_current_mems_allowed(void) rcu_read_lock(); cgrp = task_cs(current)->css.cgroup; - pr_info("%s cpuset=", current->comm); + pr_cont(",cpuset="); pr_cont_cgroup_name(cgrp); - pr_cont(" mems_allowed=%*pbl\n", + pr_cont(",mems_allowed=%*pbl", nodemask_pr_args(¤t->mems_allowed)); rcu_read_unlock(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 21d487749e1d..d90253f1ff93 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -245,11 +245,11 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, return points > 0 ? points : 1; } -enum oom_constraint { - CONSTRAINT_NONE, - CONSTRAINT_CPUSET, - CONSTRAINT_MEMORY_POLICY, - CONSTRAINT_MEMCG, +static const char * const oom_constraint_text[] = { + [CONSTRAINT_NONE] = "CONSTRAINT_NONE", + [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET", + [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY", + [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG", }; /* @@ -428,16 +428,25 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) rcu_read_unlock(); } +static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) +{ + /* one line summary of the oom killer context. */ + pr_info("oom-kill:constraint=%s,nodemask=%*pbl", + oom_constraint_text[oc->constraint], + nodemask_pr_args(oc->nodemask)); + cpuset_print_current_mems_allowed(); + pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid, + from_kuid(&init_user_ns, task_uid(victim))); +} + static void dump_header(struct oom_control *oc, struct task_struct *p) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, &oc->gfp_mask, - nodemask_pr_args(oc->nodemask), oc->order, + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n"); - cpuset_print_current_mems_allowed(); dump_stack(); if (is_memcg_oom(oc)) mem_cgroup_print_oom_info(oc->memcg, p); @@ -448,6 +457,8 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) } if (sysctl_oom_dump_tasks) dump_tasks(oc->memcg, oc->nodemask); + if (p) + dump_oom_summary(oc, p); } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e97ebaf5ba26..a48db99da7b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3517,13 +3517,13 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", + pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", current->comm, &vaf, gfp_mask, &gfp_mask, nodemask_pr_args(nodemask)); va_end(args); cpuset_print_current_mems_allowed(); - + pr_cont("\n"); dump_stack(); warn_alloc_show_mem(gfp_mask, nodemask); } -- cgit From 2c2a5af6fed20cf74401c9d64319c76c5ff81309 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 28 Dec 2018 00:36:22 -0800 Subject: mm, memory_hotplug: add nid parameter to arch_remove_memory Patch series "Do not touch pages in hot-remove path", v2. This patchset aims for two things: 1) A better definition about offline and hot-remove stage 2) Solving bugs where we can access non-initialized pages during hot-remove operations [2] [3]. This is achieved by moving all page/zone handling to the offline stage, so we do not need to access pages when hot-removing memory. [1] https://patchwork.kernel.org/cover/10691415/ [2] https://patchwork.kernel.org/patch/10547445/ [3] https://www.spinics.net/lists/linux-mm/msg161316.html This patch (of 5): This is a preparation for the following-up patches. The idea of passing the nid is that it will allow us to get rid of the zone parameter afterwards. Link: http://lkml.kernel.org/r/20181127162005.15833-2-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Reviewed-by: Pavel Tatashin Cc: Michal Hocko Cc: Dan Williams Cc: Jerome Glisse Cc: Jonathan Cameron Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 2 +- arch/powerpc/mm/mem.c | 3 ++- arch/s390/mm/init.c | 2 +- arch/sh/mm/init.c | 2 +- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 3 ++- include/linux/memory_hotplug.h | 4 ++-- kernel/memremap.c | 5 ++++- mm/memory_hotplug.c | 2 +- 9 files changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d5e12ff1d73c..904fe55e10fc 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -661,7 +661,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 20394e52fe27..33cc6f676fa6 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -139,7 +139,8 @@ int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap * } #ifdef CONFIG_MEMORY_HOTREMOVE -int __meminit arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +int __meminit arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 50388190b393..3e82f66d5c61 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -242,7 +242,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { /* * There is no hardware or firmware interface which could trigger a diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index c8c13c777162..a8e5c0e00fca 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -443,7 +443,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 49ecf5ecf6d3..85c94f9a87f8 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -860,7 +860,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 484c1b92f078..bccff68e3267 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1141,7 +1141,8 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end) remove_pagetable(start, end, true, NULL); } -int __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +int __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7383a7a76d69..9e4d9b9b93ea 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -107,8 +107,8 @@ static inline bool movable_node_is_enabled(void) } #ifdef CONFIG_MEMORY_HOTREMOVE -extern int arch_remove_memory(u64 start, u64 size, - struct vmem_altmap *altmap); +extern int arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap); extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/kernel/memremap.c b/kernel/memremap.c index 3eef989ef035..0d5603d76c37 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -87,6 +87,7 @@ static void devm_memremap_pages_release(void *data) struct resource *res = &pgmap->res; resource_size_t align_start, align_size; unsigned long pfn; + int nid; pgmap->kill(pgmap->ref); for_each_device_pfn(pfn, pgmap) @@ -97,13 +98,15 @@ static void devm_memremap_pages_release(void *data) align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - align_start; + nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT)); + mem_hotplug_begin(); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { pfn = align_start >> PAGE_SHIFT; __remove_pages(page_zone(pfn_to_page(pfn)), pfn, align_size >> PAGE_SHIFT, NULL); } else { - arch_remove_memory(align_start, align_size, + arch_remove_memory(nid, align_start, align_size, pgmap->altmap_valid ? &pgmap->altmap : NULL); kasan_remove_zero_shadow(__va(align_start), align_size); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6258e0e923cc..0718cf7427b2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1841,7 +1841,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size) memblock_free(start, size); memblock_remove(start, size); - arch_remove_memory(start, size, NULL); + arch_remove_memory(nid, start, size, NULL); try_offline_node(nid); -- cgit From 65c78784135f847e49eb98e6b976e453e71100c3 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 28 Dec 2018 00:36:26 -0800 Subject: kernel, resource: check for IORESOURCE_SYSRAM in release_mem_region_adjustable This is a preparation for the next patch. Currently, we only call release_mem_region_adjustable() in __remove_pages if the zone is not ZONE_DEVICE, because resources that belong to HMM/devm are being released by themselves with devm_release_mem_region. Since we do not want to touch any zone/page stuff during the removing of the memory (but during the offlining), we do not want to check for the zone here. So we need another way to tell release_mem_region_adjustable() to not realease the resource in case it belongs to HMM/devm. HMM/devm acquires/releases a resource through devm_request_mem_region/devm_release_mem_region. These resources have the flag IORESOURCE_MEM, while resources acquired by hot-add memory path (register_memory_resource()) contain IORESOURCE_SYSTEM_RAM. So, we can check for this flag in release_mem_region_adjustable, and if the resource does not contain such flag, we know that we are dealing with a HMM/devm resource, so we can back off. Link: http://lkml.kernel.org/r/20181127162005.15833-3-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Reviewed-by: Pavel Tatashin Cc: Dan Williams Cc: Jerome Glisse Cc: Jonathan Cameron Cc: Michal Hocko Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index b0fbf685c77a..915c02e8e5dd 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1256,6 +1256,21 @@ int release_mem_region_adjustable(struct resource *parent, continue; } + /* + * All memory regions added from memory-hotplug path have the + * flag IORESOURCE_SYSTEM_RAM. If the resource does not have + * this flag, we know that we are dealing with a resource coming + * from HMM/devm. HMM/devm use another mechanism to add/release + * a resource. This goes via devm_request_mem_region and + * devm_release_mem_region. + * HMM/devm take care to release their resources when they want, + * so if we are dealing with them, let us just back off here. + */ + if (!(res->flags & IORESOURCE_SYSRAM)) { + ret = 0; + break; + } + if (!(res->flags & IORESOURCE_MEM)) break; -- cgit From ac46d4f3c43241ffa23d5bf36153a0830c0e02cc Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 28 Dec 2018 00:38:09 -0800 Subject: mm/mmu_notifier: use structure for invalidate_range_start/end calls v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid having to change many call sites everytime we want to add a parameter use a structure to group all parameters for the mmu_notifier invalidate_range_start/end cakks. No functional changes with this patch. [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20181205053628.3210-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Acked-by: Christian König Acked-by: Jan Kara Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Felix Kuehling Cc: Ralph Campbell Cc: John Hubbard From: Jérôme Glisse Subject: mm/mmu_notifier: use structure for invalidate_range_start/end calls v3 fix build warning in migrate.c when CONFIG_MMU_NOTIFIER=n Link: http://lkml.kernel.org/r/20181213171330.8489-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 8 ++-- fs/proc/task_mmu.c | 7 +++- include/linux/mm.h | 6 ++- include/linux/mmu_notifier.h | 87 +++++++++++++++++++++++++------------- kernel/events/uprobes.c | 10 ++--- mm/huge_memory.c | 54 +++++++++++------------- mm/hugetlb.c | 52 +++++++++++------------ mm/khugepaged.c | 10 ++--- mm/ksm.c | 21 ++++------ mm/madvise.c | 21 +++++----- mm/memory.c | 99 ++++++++++++++++++++++---------------------- mm/migrate.c | 28 ++++++------- mm/mmu_notifier.c | 37 ++++------------- mm/mprotect.c | 15 +++---- mm/mremap.c | 10 ++--- mm/oom_kill.c | 17 ++++---- mm/rmap.c | 30 ++++++++------ 17 files changed, 262 insertions(+), 250 deletions(-) (limited to 'kernel') diff --git a/fs/dax.c b/fs/dax.c index 48132eca3761..262e14f29933 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -779,7 +779,8 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { - unsigned long address, start, end; + struct mmu_notifier_range range; + unsigned long address; cond_resched(); @@ -793,7 +794,8 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, * call mmu_notifier_invalidate_range_start() on our behalf * before taking any lock. */ - if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) + if (follow_pte_pmd(vma->vm_mm, address, &range, + &ptep, &pmdp, &ptl)) continue; /* @@ -835,7 +837,7 @@ unlock_pte: pte_unmap_unlock(ptep, ptl); } - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + mmu_notifier_invalidate_range_end(&range); } i_mmap_unlock_read(mapping); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 47c3764c469b..b3ddceb003bc 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1096,6 +1096,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, }; @@ -1139,11 +1140,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, downgrade_write(&mm->mmap_sem); break; } - mmu_notifier_invalidate_range_start(mm, 0, -1); + + mmu_notifier_range_init(&range, mm, 0, -1UL); + mmu_notifier_invalidate_range_start(&range); } walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); if (type == CLEAR_REFS_SOFT_DIRTY) - mmu_notifier_invalidate_range_end(mm, 0, -1); + mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb, 0, -1); up_read(&mm->mmap_sem); out_mm: diff --git a/include/linux/mm.h b/include/linux/mm.h index 3c39b9dc7a90..ea1f12d15365 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1451,6 +1451,8 @@ struct mm_walk { void *private; }; +struct mmu_notifier_range; + int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk); int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); @@ -1459,8 +1461,8 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, - unsigned long *start, unsigned long *end, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); + struct mmu_notifier_range *range, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 3d377805b29c..4050ec1c3b45 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -220,11 +220,8 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); -extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end, - bool blockable); -extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, unsigned long end, +extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); +extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); @@ -268,33 +265,37 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm, __mmu_notifier_change_pte(mm, address, pte); } -static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline void +mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { - if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_start(mm, start, end, true); + if (mm_has_notifiers(range->mm)) { + range->blockable = true; + __mmu_notifier_invalidate_range_start(range); + } } -static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline int +mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { - if (mm_has_notifiers(mm)) - return __mmu_notifier_invalidate_range_start(mm, start, end, false); + if (mm_has_notifiers(range->mm)) { + range->blockable = false; + return __mmu_notifier_invalidate_range_start(range); + } return 0; } -static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline void +mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { - if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_end(mm, start, end, false); + if (mm_has_notifiers(range->mm)) + __mmu_notifier_invalidate_range_end(range, false); } -static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline void +mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range) { - if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_end(mm, start, end, true); + if (mm_has_notifiers(range->mm)) + __mmu_notifier_invalidate_range_end(range, true); } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, @@ -315,6 +316,17 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __mmu_notifier_mm_destroy(mm); } + +static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + range->mm = mm; + range->start = start; + range->end = end; +} + #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ @@ -427,6 +439,23 @@ extern void mmu_notifier_call_srcu(struct rcu_head *rcu, #else /* CONFIG_MMU_NOTIFIER */ +struct mmu_notifier_range { + unsigned long start; + unsigned long end; +}; + +static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, + unsigned long start, + unsigned long end) +{ + range->start = start; + range->end = end; +} + +#define mmu_notifier_range_init(range, mm, start, end) \ + _mmu_notifier_range_init(range, start, end) + + static inline int mm_has_notifiers(struct mm_struct *mm) { return 0; @@ -454,24 +483,24 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm, { } -static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline void +mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { } -static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline int +mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { return 0; } -static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline +void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } -static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline void +mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range) { } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index abbd8da9ac21..8aef47ee7bfa 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -171,11 +171,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, .address = addr, }; int err; - /* For mmu_notifiers */ - const unsigned long mmun_start = addr; - const unsigned long mmun_end = addr + PAGE_SIZE; + struct mmu_notifier_range range; struct mem_cgroup *memcg; + mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, @@ -186,7 +186,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(old_page); - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; if (!page_vma_mapped_walk(&pvmw)) { mem_cgroup_cancel_charge(new_page, memcg, false); @@ -220,7 +220,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); unlock_page(old_page); return err; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0c0e18409fde..05136ad0f325 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1134,8 +1134,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, int i; vm_fault_t ret = 0; struct page **pages; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), GFP_KERNEL); @@ -1173,9 +1172,9 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) @@ -1220,8 +1219,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, * No need to double call mmu_notifier->invalidate_range() callback as * the above pmdp_huge_clear_flush_notify() did already call it. */ - mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, - mmun_end); + mmu_notifier_invalidate_range_only_end(&range); ret |= VM_FAULT_WRITE; put_page(page); @@ -1231,7 +1229,7 @@ out: out_free_pages: spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); @@ -1248,8 +1246,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) struct page *page = NULL, *new_page; struct mem_cgroup *memcg; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; gfp_t huge_gfp; /* for allocation and charge */ vm_fault_t ret = 0; @@ -1338,9 +1335,9 @@ alloc: vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); spin_lock(vmf->ptl); if (page) @@ -1375,8 +1372,7 @@ out_mn: * No need to double call mmu_notifier->invalidate_range() callback as * the above pmdp_huge_clear_flush_notify() did already call it. */ - mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, - mmun_end); + mmu_notifier_invalidate_range_only_end(&range); out: return ret; out_unlock: @@ -2015,14 +2011,15 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address) { spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & HPAGE_PUD_MASK; + struct mmu_notifier_range range; - mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE); - ptl = pud_lock(mm, pud); + mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK, + (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); + mmu_notifier_invalidate_range_start(&range); + ptl = pud_lock(vma->vm_mm, pud); if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) goto out; - __split_huge_pud_locked(vma, pud, haddr); + __split_huge_pud_locked(vma, pud, range.start); out: spin_unlock(ptl); @@ -2030,8 +2027,7 @@ out: * No need to double call mmu_notifier->invalidate_range() callback as * the above pudp_huge_clear_flush_notify() did already call it. */ - mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + - HPAGE_PUD_SIZE); + mmu_notifier_invalidate_range_only_end(&range); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -2233,11 +2229,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page) { spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & HPAGE_PMD_MASK; + struct mmu_notifier_range range; - mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); - ptl = pmd_lock(mm, pmd); + mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK, + (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + ptl = pmd_lock(vma->vm_mm, pmd); /* * If caller asks to setup a migration entries, we need a page to check @@ -2253,7 +2250,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; - __split_huge_pmd_locked(vma, pmd, haddr, freeze); + __split_huge_pmd_locked(vma, pmd, range.start, freeze); out: spin_unlock(ptl); /* @@ -2269,8 +2266,7 @@ out: * any further changes to individual pte will notify. So no need * to call mmu_notifier->invalidate_range() */ - mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + - HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_only_end(&range); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a80832487981..12000ba5c868 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3240,16 +3240,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, int cow; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; int ret = 0; cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - mmun_start = vma->vm_start; - mmun_end = vma->vm_end; - if (cow) - mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); + if (cow) { + mmu_notifier_range_init(&range, src, vma->vm_start, + vma->vm_end); + mmu_notifier_invalidate_range_start(&range); + } for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; @@ -3325,7 +3325,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } if (cow) - mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); return ret; } @@ -3342,8 +3342,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - unsigned long mmun_start = start; /* For mmu_notifiers */ - unsigned long mmun_end = end; /* For mmu_notifiers */ + struct mmu_notifier_range range; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); @@ -3359,8 +3358,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, /* * If sharing possible, alert mmu notifiers of worst case. */ - adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end); - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, start, end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + mmu_notifier_invalidate_range_start(&range); address = start; for (; address < end; address += sz) { ptep = huge_pte_offset(mm, address, sz); @@ -3428,7 +3428,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, if (ref_page) break; } - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); tlb_end_vma(tlb, vma); } @@ -3546,9 +3546,8 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, struct page *old_page, *new_page; int outside_reserve = 0; vm_fault_t ret = 0; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ unsigned long haddr = address & huge_page_mask(h); + struct mmu_notifier_range range; pte = huge_ptep_get(ptep); old_page = pte_page(pte); @@ -3627,9 +3626,8 @@ retry_avoidcopy: __SetPageUptodate(new_page); set_page_huge_active(new_page); - mmun_start = haddr; - mmun_end = mmun_start + huge_page_size(h); - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h)); + mmu_notifier_invalidate_range_start(&range); /* * Retake the page table lock to check for racing updates @@ -3642,7 +3640,7 @@ retry_avoidcopy: /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); - mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range(mm, range.start, range.end); set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); @@ -3651,7 +3649,7 @@ retry_avoidcopy: new_page = old_page; } spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); out_release_all: restore_reserve_on_error(h, vma, haddr, new_page); put_page(new_page); @@ -4340,21 +4338,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t pte; struct hstate *h = hstate_vma(vma); unsigned long pages = 0; - unsigned long f_start = start; - unsigned long f_end = end; bool shared_pmd = false; + struct mmu_notifier_range range; /* * In the case of shared PMDs, the area to flush could be beyond - * start/end. Set f_start/f_end to cover the maximum possible + * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. */ - adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end); + mmu_notifier_range_init(&range, mm, start, end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); - flush_cache_range(vma, f_start, f_end); + flush_cache_range(vma, range.start, range.end); - mmu_notifier_invalidate_range_start(mm, f_start, f_end); + mmu_notifier_invalidate_range_start(&range); i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; @@ -4405,7 +4403,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * did unshare a page of pmds, flush the range corresponding to the pud. */ if (shared_pmd) - flush_hugetlb_tlb_range(vma, f_start, f_end); + flush_hugetlb_tlb_range(vma, range.start, range.end); else flush_hugetlb_tlb_range(vma, start, end); /* @@ -4415,7 +4413,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); - mmu_notifier_invalidate_range_end(mm, f_start, f_end); + mmu_notifier_invalidate_range_end(&range); return pages << h->order; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 43ce2f4d2551..4f017339ddb2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -944,8 +944,7 @@ static void collapse_huge_page(struct mm_struct *mm, int isolated = 0, result = 0; struct mem_cgroup *memcg; struct vm_area_struct *vma; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; gfp_t gfp; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1017,9 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); pte_ptl = pte_lockptr(mm, pmd); - mmun_start = address; - mmun_end = address + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes @@ -1029,7 +1027,7 @@ static void collapse_huge_page(struct mm_struct *mm, */ _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); diff --git a/mm/ksm.c b/mm/ksm.c index 1a088306ef81..38c0360482fa 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1042,8 +1042,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, }; int swapped; int err = -EFAULT; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; pvmw.address = page_address_in_vma(page, vma); if (pvmw.address == -EFAULT) @@ -1051,9 +1050,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmun_start = pvmw.address; - mmun_end = pvmw.address + PAGE_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, pvmw.address, + pvmw.address + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); if (!page_vma_mapped_walk(&pvmw)) goto out_mn; @@ -1105,7 +1104,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, out_unlock: page_vma_mapped_walk_done(&pvmw); out_mn: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); out: return err; } @@ -1129,8 +1128,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; unsigned long addr; int err = -EFAULT; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; addr = page_address_in_vma(page, vma); if (addr == -EFAULT) @@ -1140,9 +1138,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmun_start = addr; - mmun_end = addr + PAGE_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte_same(*ptep, orig_pte)) { @@ -1188,7 +1185,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pte_unmap_unlock(ptep, ptl); err = 0; out_mn: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); out: return err; } diff --git a/mm/madvise.c b/mm/madvise.c index 6cb1ca93e290..21a7881a2db4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -458,29 +458,30 @@ static void madvise_free_page_range(struct mmu_gather *tlb, static int madvise_free_single_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { - unsigned long start, end; struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; struct mmu_gather tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) return -EINVAL; - start = max(vma->vm_start, start_addr); - if (start >= vma->vm_end) + range.start = max(vma->vm_start, start_addr); + if (range.start >= vma->vm_end) return -EINVAL; - end = min(vma->vm_end, end_addr); - if (end <= vma->vm_start) + range.end = min(vma->vm_end, end_addr); + if (range.end <= vma->vm_start) return -EINVAL; + mmu_notifier_range_init(&range, mm, range.start, range.end); lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start, end); + tlb_gather_mmu(&tlb, mm, range.start, range.end); update_hiwater_rss(mm); - mmu_notifier_invalidate_range_start(mm, start, end); - madvise_free_page_range(&tlb, vma, start, end); - mmu_notifier_invalidate_range_end(mm, start, end); - tlb_finish_mmu(&tlb, start, end); + mmu_notifier_invalidate_range_start(&range); + madvise_free_page_range(&tlb, vma, range.start, range.end); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb, range.start, range.end); return 0; } diff --git a/mm/memory.c b/mm/memory.c index 4ad2d293ddc2..b7a8bfe5f5ec 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -973,8 +973,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; bool is_cow; int ret; @@ -1008,11 +1007,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * is_cow_mapping() returns true. */ is_cow = is_cow_mapping(vma->vm_flags); - mmun_start = addr; - mmun_end = end; - if (is_cow) - mmu_notifier_invalidate_range_start(src_mm, mmun_start, - mmun_end); + + if (is_cow) { + mmu_notifier_range_init(&range, src_mm, addr, end); + mmu_notifier_invalidate_range_start(&range); + } ret = 0; dst_pgd = pgd_offset(dst_mm, addr); @@ -1029,7 +1028,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, } while (dst_pgd++, src_pgd++, addr = next, addr != end); if (is_cow) - mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); return ret; } @@ -1332,12 +1331,13 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { - struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; - mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); + mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr); + mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); - mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); + mmu_notifier_invalidate_range_end(&range); } /** @@ -1351,18 +1351,18 @@ void unmap_vmas(struct mmu_gather *tlb, void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long size) { - struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; struct mmu_gather tlb; - unsigned long end = start + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start, end); - update_hiwater_rss(mm); - mmu_notifier_invalidate_range_start(mm, start, end); - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) - unmap_single_vma(&tlb, vma, start, end, NULL); - mmu_notifier_invalidate_range_end(mm, start, end); - tlb_finish_mmu(&tlb, start, end); + mmu_notifier_range_init(&range, vma->vm_mm, start, start + size); + tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); + update_hiwater_rss(vma->vm_mm); + mmu_notifier_invalidate_range_start(&range); + for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next) + unmap_single_vma(&tlb, vma, start, range.end, NULL); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb, start, range.end); } /** @@ -1377,17 +1377,17 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { - struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; struct mmu_gather tlb; - unsigned long end = address + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, address, end); - update_hiwater_rss(mm); - mmu_notifier_invalidate_range_start(mm, address, end); - unmap_single_vma(&tlb, vma, address, end, details); - mmu_notifier_invalidate_range_end(mm, address, end); - tlb_finish_mmu(&tlb, address, end); + mmu_notifier_range_init(&range, vma->vm_mm, address, address + size); + tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); + update_hiwater_rss(vma->vm_mm); + mmu_notifier_invalidate_range_start(&range); + unmap_single_vma(&tlb, vma, address, range.end, details); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb, address, range.end); } /** @@ -2247,9 +2247,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) struct page *new_page = NULL; pte_t entry; int page_copied = 0; - const unsigned long mmun_start = vmf->address & PAGE_MASK; - const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; + struct mmu_notifier_range range; if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -2272,7 +2271,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK, + (vmf->address & PAGE_MASK) + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); /* * Re-check the pte - we dropped the lock @@ -2349,7 +2350,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * No need to double call mmu_notifier->invalidate_range() callback as * the above ptep_clear_flush_notify() did already call it. */ - mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_only_end(&range); if (old_page) { /* * Don't let another task, with possibly unlocked vma, @@ -4030,7 +4031,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) #endif /* __PAGETABLE_PMD_FOLDED */ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, - unsigned long *start, unsigned long *end, + struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; @@ -4058,10 +4059,10 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, if (!pmdpp) goto out; - if (start && end) { - *start = address & PMD_MASK; - *end = *start + PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, *start, *end); + if (range) { + mmu_notifier_range_init(range, mm, address & PMD_MASK, + (address & PMD_MASK) + PMD_SIZE); + mmu_notifier_invalidate_range_start(range); } *ptlp = pmd_lock(mm, pmd); if (pmd_huge(*pmd)) { @@ -4069,17 +4070,17 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, return 0; } spin_unlock(*ptlp); - if (start && end) - mmu_notifier_invalidate_range_end(mm, *start, *end); + if (range) + mmu_notifier_invalidate_range_end(range); } if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; - if (start && end) { - *start = address & PAGE_MASK; - *end = *start + PAGE_SIZE; - mmu_notifier_invalidate_range_start(mm, *start, *end); + if (range) { + range->start = address & PAGE_MASK; + range->end = range->start + PAGE_SIZE; + mmu_notifier_invalidate_range_start(range); } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!pte_present(*ptep)) @@ -4088,8 +4089,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, return 0; unlock: pte_unmap_unlock(ptep, *ptlp); - if (start && end) - mmu_notifier_invalidate_range_end(mm, *start, *end); + if (range) + mmu_notifier_invalidate_range_end(range); out: return -EINVAL; } @@ -4101,20 +4102,20 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address, /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, NULL, NULL, + !(res = __follow_pte_pmd(mm, address, NULL, ptepp, NULL, ptlp))); return res; } int follow_pte_pmd(struct mm_struct *mm, unsigned long address, - unsigned long *start, unsigned long *end, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) + struct mmu_notifier_range *range, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { int res; /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, start, end, + !(res = __follow_pte_pmd(mm, address, range, ptepp, pmdpp, ptlp))); return res; } diff --git a/mm/migrate.c b/mm/migrate.c index acda06f99754..462163f5f278 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2299,6 +2299,7 @@ next: */ static void migrate_vma_collect(struct migrate_vma *migrate) { + struct mmu_notifier_range range; struct mm_walk mm_walk; mm_walk.pmd_entry = migrate_vma_collect_pmd; @@ -2310,13 +2311,11 @@ static void migrate_vma_collect(struct migrate_vma *migrate) mm_walk.mm = migrate->vma->vm_mm; mm_walk.private = migrate; - mmu_notifier_invalidate_range_start(mm_walk.mm, - migrate->start, - migrate->end); + mmu_notifier_range_init(&range, mm_walk.mm, migrate->start, + migrate->end); + mmu_notifier_invalidate_range_start(&range); walk_page_range(migrate->start, migrate->end, &mm_walk); - mmu_notifier_invalidate_range_end(mm_walk.mm, - migrate->start, - migrate->end); + mmu_notifier_invalidate_range_end(&range); migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); } @@ -2697,9 +2696,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; - struct vm_area_struct *vma = migrate->vma; - struct mm_struct *mm = vma->vm_mm; - unsigned long addr, i, mmu_start; + struct mmu_notifier_range range; + unsigned long addr, i; bool notified = false; for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { @@ -2718,11 +2716,12 @@ static void migrate_vma_pages(struct migrate_vma *migrate) continue; } if (!notified) { - mmu_start = addr; notified = true; - mmu_notifier_invalidate_range_start(mm, - mmu_start, - migrate->end); + + mmu_notifier_range_init(&range, + migrate->vma->vm_mm, + addr, migrate->end); + mmu_notifier_invalidate_range_start(&range); } migrate_vma_insert_page(migrate, addr, newpage, &migrate->src[i], @@ -2763,8 +2762,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) * did already call it. */ if (notified) - mmu_notifier_invalidate_range_only_end(mm, mmu_start, - migrate->end); + mmu_notifier_invalidate_range_only_end(&range); } /* diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 74a7dc3d11c8..9c884abc7850 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -167,28 +167,20 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end, - bool blockable) +int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { - struct mmu_notifier_range _range, *range = &_range; struct mmu_notifier *mn; int ret = 0; int id; - range->blockable = blockable; - range->start = start; - range->end = end; - range->mm = mm; - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) { int _ret = mn->ops->invalidate_range_start(mn, range); if (_ret) { pr_info("%pS callback failed with %d in %sblockable context.\n", - mn->ops->invalidate_range_start, _ret, - !blockable ? "non-" : ""); + mn->ops->invalidate_range_start, _ret, + !range->blockable ? "non-" : ""); ret = _ret; } } @@ -199,27 +191,14 @@ int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); -void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, - unsigned long end, +void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, bool only_end) { - struct mmu_notifier_range _range, *range = &_range; struct mmu_notifier *mn; int id; - /* - * The end call back will never be call if the start refused to go - * through because of blockable was false so here assume that we - * can block. - */ - range->blockable = true; - range->start = start; - range->end = end; - range->mm = mm; - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { /* * Call invalidate_range here too to avoid the need for the * subsystem of having to register an invalidate_range_end @@ -234,7 +213,9 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, * already happen under page table lock. */ if (!only_end && mn->ops->invalidate_range) - mn->ops->invalidate_range(mn, mm, start, end); + mn->ops->invalidate_range(mn, range->mm, + range->start, + range->end); if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, range); } diff --git a/mm/mprotect.c b/mm/mprotect.c index 6d331620b9e5..36cb358db170 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -167,11 +167,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; - struct mm_struct *mm = vma->vm_mm; unsigned long next; unsigned long pages = 0; unsigned long nr_huge_updates = 0; - unsigned long mni_start = 0; + struct mmu_notifier_range range; + + range.start = 0; pmd = pmd_offset(pud, addr); do { @@ -183,9 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, goto next; /* invoke the mmu notifier if the pmd is populated */ - if (!mni_start) { - mni_start = addr; - mmu_notifier_invalidate_range_start(mm, mni_start, end); + if (!range.start) { + mmu_notifier_range_init(&range, vma->vm_mm, addr, end); + mmu_notifier_invalidate_range_start(&range); } if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { @@ -214,8 +215,8 @@ next: cond_resched(); } while (pmd++, addr = next, addr != end); - if (mni_start) - mmu_notifier_invalidate_range_end(mm, mni_start, end); + if (range.start) + mmu_notifier_invalidate_range_end(&range); if (nr_huge_updates) count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); diff --git a/mm/mremap.c b/mm/mremap.c index 7f9f9180e401..def01d86e36f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -197,16 +197,14 @@ unsigned long move_page_tables(struct vm_area_struct *vma, bool need_rmap_locks) { unsigned long extent, next, old_end; + struct mmu_notifier_range range; pmd_t *old_pmd, *new_pmd; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmun_start = old_addr; - mmun_end = old_end; - mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end); + mmu_notifier_invalidate_range_start(&range); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); @@ -247,7 +245,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_pmd, new_addr, need_rmap_locks); } - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); return len + old_addr - old_end; /* how much done */ } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5442cb12e4ed..f0e8cd9edb1a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -528,19 +528,20 @@ bool __oom_reap_task_mm(struct mm_struct *mm) * count elevated without a good reason. */ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { - const unsigned long start = vma->vm_start; - const unsigned long end = vma->vm_end; + struct mmu_notifier_range range; struct mmu_gather tlb; - tlb_gather_mmu(&tlb, mm, start, end); - if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) { - tlb_finish_mmu(&tlb, start, end); + mmu_notifier_range_init(&range, mm, vma->vm_start, + vma->vm_end); + tlb_gather_mmu(&tlb, mm, range.start, range.end); + if (mmu_notifier_invalidate_range_start_nonblock(&range)) { + tlb_finish_mmu(&tlb, range.start, range.end); ret = false; continue; } - unmap_page_range(&tlb, vma, start, end, NULL); - mmu_notifier_invalidate_range_end(mm, start, end); - tlb_finish_mmu(&tlb, start, end); + unmap_page_range(&tlb, vma, range.start, range.end, NULL); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb, range.start, range.end); } } diff --git a/mm/rmap.c b/mm/rmap.c index 85b7f9423352..c75f72f6fe0e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -889,15 +889,17 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, .address = address, .flags = PVMW_SYNC, }; - unsigned long start = address, end; + struct mmu_notifier_range range; int *cleaned = arg; /* * We have to assume the worse case ie pmd for invalidation. Note that * the page can not be free from this function. */ - end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); - mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); + mmu_notifier_range_init(&range, vma->vm_mm, address, + min(vma->vm_end, address + + (PAGE_SIZE << compound_order(page)))); + mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { unsigned long cstart; @@ -949,7 +951,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, (*cleaned)++; } - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + mmu_notifier_invalidate_range_end(&range); return true; } @@ -1345,7 +1347,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t pteval; struct page *subpage; bool ret = true; - unsigned long start = address, end; + struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ @@ -1369,15 +1371,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start, + min(vma->vm_end, vma->vm_start + + (PAGE_SIZE << compound_order(page)))); if (PageHuge(page)) { /* * If sharing is possible, start and end will be adjusted * accordingly. */ - adjust_range_if_pmd_sharing_possible(vma, &start, &end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, + &range.end); } - mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); + mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION @@ -1428,9 +1433,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * we must flush them all. start/end were * already adjusted above to cover this range. */ - flush_cache_range(vma, start, end); - flush_tlb_range(vma, start, end); - mmu_notifier_invalidate_range(mm, start, end); + flush_cache_range(vma, range.start, range.end); + flush_tlb_range(vma, range.start, range.end); + mmu_notifier_invalidate_range(mm, range.start, + range.end); /* * The ref count of the PMD page was dropped @@ -1650,7 +1656,7 @@ discard: put_page(page); } - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + mmu_notifier_invalidate_range_end(&range); return ret; } -- cgit From 063a7d1d3623db31ca5d2309cab6030ebf93b72f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 28 Dec 2018 00:39:46 -0800 Subject: mm/hmm: fix memremap.h, move dev_page_fault_t callback to hmm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kbuild robot reported the following on a development branch that used memremap.h in a new path: In file included from arch/m68k/include/asm/pgtable_mm.h:148:0, from arch/m68k/include/asm/pgtable.h:5, from include/linux/memremap.h:7, from drivers//dax/bus.c:3: arch/m68k/include/asm/motorola_pgtable.h: In function 'pgd_offset': >> arch/m68k/include/asm/motorola_pgtable.h:199:11: error: dereferencing pointer to incomplete type 'const struct mm_struct' return mm->pgd + pgd_index(address); ^~ The ->page_fault() callback is specific to HMM. Move it to 'struct hmm_devmem' where the unusual asm/pgtable.h dependency can be contained in include/linux/hmm.h. Longer term refactoring this dependency out of HMM is recommended, but in the meantime memremap.h remains generic. Link: http://lkml.kernel.org/r/154534090899.3120190.6652620807617715272.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: 5042db43cc26 ("mm/ZONE_DEVICE: new type of ZONE_DEVICE memory...") Signed-off-by: Dan Williams Reviewed-by: "Jérôme Glisse" Cc: Logan Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 24 ++++++++++++++++++++++++ include/linux/memremap.h | 32 -------------------------------- kernel/memremap.c | 6 +++++- mm/hmm.c | 4 ++-- 4 files changed, 31 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ed89fbc525d2..66f9ebbb1df3 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -69,6 +69,7 @@ #define LINUX_HMM_H #include +#include #if IS_ENABLED(CONFIG_HMM) @@ -486,6 +487,7 @@ struct hmm_devmem_ops { * @device: device to bind resource to * @ops: memory operations callback * @ref: per CPU refcount + * @page_fault: callback when CPU fault on an unaddressable device page * * This an helper structure for device drivers that do not wish to implement * the gory details related to hotplugging new memoy and allocating struct @@ -493,7 +495,28 @@ struct hmm_devmem_ops { * * Device drivers can directly use ZONE_DEVICE memory on their own if they * wish to do so. + * + * The page_fault() callback must migrate page back, from device memory to + * system memory, so that the CPU can access it. This might fail for various + * reasons (device issues, device have been unplugged, ...). When such error + * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and + * set the CPU page table entry to "poisoned". + * + * Note that because memory cgroup charges are transferred to the device memory, + * this should never fail due to memory restrictions. However, allocation + * of a regular system page might still fail because we are out of memory. If + * that happens, the page_fault() callback must return VM_FAULT_OOM. + * + * The page_fault() callback can also try to migrate back multiple pages in one + * chunk, as an optimization. It must, however, prioritize the faulting address + * over all the others. */ +typedef int (*dev_page_fault_t)(struct vm_area_struct *vma, + unsigned long addr, + const struct page *page, + unsigned int flags, + pmd_t *pmdp); + struct hmm_devmem { struct completion completion; unsigned long pfn_first; @@ -503,6 +526,7 @@ struct hmm_devmem { struct dev_pagemap pagemap; const struct hmm_devmem_ops *ops; struct percpu_ref ref; + dev_page_fault_t page_fault; }; /* diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 55db66b3716f..f0628660d541 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -4,8 +4,6 @@ #include #include -#include - struct resource; struct device; @@ -66,47 +64,18 @@ enum memory_type { }; /* - * For MEMORY_DEVICE_PRIVATE we use ZONE_DEVICE and extend it with two - * callbacks: - * page_fault() - * page_free() - * * Additional notes about MEMORY_DEVICE_PRIVATE may be found in * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief * explanation in include/linux/memory_hotplug.h. * - * The page_fault() callback must migrate page back, from device memory to - * system memory, so that the CPU can access it. This might fail for various - * reasons (device issues, device have been unplugged, ...). When such error - * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and - * set the CPU page table entry to "poisoned". - * - * Note that because memory cgroup charges are transferred to the device memory, - * this should never fail due to memory restrictions. However, allocation - * of a regular system page might still fail because we are out of memory. If - * that happens, the page_fault() callback must return VM_FAULT_OOM. - * - * The page_fault() callback can also try to migrate back multiple pages in one - * chunk, as an optimization. It must, however, prioritize the faulting address - * over all the others. - * - * * The page_free() callback is called once the page refcount reaches 1 * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug. * This allows the device driver to implement its own memory management.) - * - * For MEMORY_DEVICE_PUBLIC only the page_free() callback matter. */ -typedef int (*dev_page_fault_t)(struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp); typedef void (*dev_page_free_t)(struct page *page, void *data); /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings - * @page_fault: callback when CPU fault on an unaddressable device page * @page_free: free page callback when page refcount reaches 1 * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref @@ -117,7 +86,6 @@ typedef void (*dev_page_free_t)(struct page *page, void *data); * @type: memory type: see MEMORY_* in memory_hotplug.h */ struct dev_pagemap { - dev_page_fault_t page_fault; dev_page_free_t page_free; struct vmem_altmap altmap; bool altmap_valid; diff --git a/kernel/memremap.c b/kernel/memremap.c index 0d5603d76c37..a856cb5ff192 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,6 +11,7 @@ #include #include #include +#include static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) @@ -24,6 +25,9 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, pmd_t *pmdp) { struct page *page = device_private_entry_to_page(entry); + struct hmm_devmem *devmem; + + devmem = container_of(page->pgmap, typeof(*devmem), pagemap); /* * The page_fault() callback must migrate page back to system memory @@ -39,7 +43,7 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, * There is a more in-depth description of what that callback can and * cannot do, in include/linux/memremap.h */ - return page->pgmap->page_fault(vma, addr, page, flags, pmdp); + return devmem->page_fault(vma, addr, page, flags, pmdp); } EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ diff --git a/mm/hmm.c b/mm/hmm.c index 789587731217..a04e4b810610 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1087,10 +1087,10 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; devmem->pfn_last = devmem->pfn_first + (resource_size(devmem->resource) >> PAGE_SHIFT); + devmem->page_fault = hmm_devmem_fault; devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_fault = hmm_devmem_fault; devmem->pagemap.page_free = hmm_devmem_free; devmem->pagemap.altmap_valid = false; devmem->pagemap.ref = &devmem->ref; @@ -1141,10 +1141,10 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; devmem->pfn_last = devmem->pfn_first + (resource_size(devmem->resource) >> PAGE_SHIFT); + devmem->page_fault = hmm_devmem_fault; devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_fault = hmm_devmem_fault; devmem->pagemap.page_free = hmm_devmem_free; devmem->pagemap.altmap_valid = false; devmem->pagemap.ref = &devmem->ref; -- cgit From 0f4991e8fd48987ae476a92cdee6bfec4aff31b8 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 28 Dec 2018 00:40:00 -0800 Subject: kernel/fork.c: mark 'stack_vm_area' with __maybe_unused Fixes gcc '-Wunused-but-set-variable' warning when CONFIG_VMAP_STACK is not set: kernel/fork.c: In function 'dup_task_struct': kernel/fork.c:843:20: warning: variable 'stack_vm_area' set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1545965190-2381-1-git-send-email-yuehaibing@huawei.com Signed-off-by: YueHaibing Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c979605fe806..d439c48ecf18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -841,7 +841,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; unsigned long *stack; - struct vm_struct *stack_vm_area; + struct vm_struct *stack_vm_area __maybe_unused; int err; if (node == NUMA_NO_NODE) -- cgit From 1a80dade010c7a7f4885a4c4c2a7ac22cc7b34df Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 28 Dec 2018 07:22:26 -0800 Subject: Fix failure path in alloc_pid() The failure path removes the allocated PIDs from the wrong namespace. This could lead to us inadvertently reusing PIDs in the leaf namespace and leaking PIDs in parent namespaces. Fixes: 95846ecf9dac ("pid: replace pid bitmap implementation with IDR API") Cc: Signed-off-by: Matthew Wilcox Acked-by: "Eric W. Biederman" Reviewed-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/pid.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index b2f6c506035d..20881598bdfa 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -233,8 +233,10 @@ out_unlock: out_free: spin_lock_irq(&pidmap_lock); - while (++i <= ns->level) - idr_remove(&ns->idr, (pid->numbers + i)->nr); + while (++i <= ns->level) { + upid = pid->numbers + i; + idr_remove(&upid->ns->idr, upid->nr); + } /* On failure to allocate the first pid, reset the state */ if (ns->pid_allocated == PIDNS_ADDING) -- cgit From 9ef7fa507d6b53a96de4da3298c5f01bde603c0a Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 4 Dec 2018 19:38:25 -0800 Subject: kgdb: Remove irq flags from roundup The function kgdb_roundup_cpus() was passed a parameter that was documented as: > the flags that will be used when restoring the interrupts. There is > local_irq_save() call before kgdb_roundup_cpus(). Nobody used those flags. Anyone who wanted to temporarily turn on interrupts just did local_irq_enable() and local_irq_disable() without looking at them. So we can definitely remove the flags. Signed-off-by: Douglas Anderson Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Richard Kuo Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- arch/arc/kernel/kgdb.c | 2 +- arch/arm/kernel/kgdb.c | 2 +- arch/arm64/kernel/kgdb.c | 2 +- arch/hexagon/kernel/kgdb.c | 9 ++------- arch/mips/kernel/kgdb.c | 2 +- arch/powerpc/kernel/kgdb.c | 2 +- arch/sh/kernel/kgdb.c | 2 +- arch/sparc/kernel/smp_64.c | 2 +- arch/x86/kernel/kgdb.c | 9 ++------- include/linux/kgdb.h | 9 ++------- kernel/debug/debug_core.c | 2 +- 11 files changed, 14 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c index 9a3c34af2ae8..0932851028e0 100644 --- a/arch/arc/kernel/kgdb.c +++ b/arch/arc/kernel/kgdb.c @@ -197,7 +197,7 @@ static void kgdb_call_nmi_hook(void *ignored) kgdb_nmicallback(raw_smp_processor_id(), NULL); } -void kgdb_roundup_cpus(unsigned long flags) +void kgdb_roundup_cpus(void) { local_irq_enable(); smp_call_function(kgdb_call_nmi_hook, NULL, 0); diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c index caa0dbe3dc61..f21077b077be 100644 --- a/arch/arm/kernel/kgdb.c +++ b/arch/arm/kernel/kgdb.c @@ -175,7 +175,7 @@ static void kgdb_call_nmi_hook(void *ignored) kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); } -void kgdb_roundup_cpus(unsigned long flags) +void kgdb_roundup_cpus(void) { local_irq_enable(); smp_call_function(kgdb_call_nmi_hook, NULL, 0); diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index a20de58061a8..12c339ff6e75 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -289,7 +289,7 @@ static void kgdb_call_nmi_hook(void *ignored) kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); } -void kgdb_roundup_cpus(unsigned long flags) +void kgdb_roundup_cpus(void) { local_irq_enable(); smp_call_function(kgdb_call_nmi_hook, NULL, 0); diff --git a/arch/hexagon/kernel/kgdb.c b/arch/hexagon/kernel/kgdb.c index 16c24b22d0b2..012e0e230ac2 100644 --- a/arch/hexagon/kernel/kgdb.c +++ b/arch/hexagon/kernel/kgdb.c @@ -119,17 +119,12 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc) /** * kgdb_roundup_cpus - Get other CPUs into a holding pattern - * @flags: Current IRQ state * * On SMP systems, we need to get the attention of the other CPUs * and get them be in a known state. This should do what is needed * to get the other CPUs to call kgdb_wait(). Note that on some arches, * the NMI approach is not used for rounding up all the CPUs. For example, - * in case of MIPS, smp_call_function() is used to roundup CPUs. In - * this case, we have to make sure that interrupts are enabled before - * calling smp_call_function(). The argument to this function is - * the flags that will be used when restoring the interrupts. There is - * local_irq_save() call before kgdb_roundup_cpus(). + * in case of MIPS, smp_call_function() is used to roundup CPUs. * * On non-SMP systems, this is not called. */ @@ -139,7 +134,7 @@ static void hexagon_kgdb_nmi_hook(void *ignored) kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); } -void kgdb_roundup_cpus(unsigned long flags) +void kgdb_roundup_cpus(void) { local_irq_enable(); smp_call_function(hexagon_kgdb_nmi_hook, NULL, 0); diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c index eb6c0d582626..2b05effc17b4 100644 --- a/arch/mips/kernel/kgdb.c +++ b/arch/mips/kernel/kgdb.c @@ -219,7 +219,7 @@ static void kgdb_call_nmi_hook(void *ignored) set_fs(old_fs); } -void kgdb_roundup_cpus(unsigned long flags) +void kgdb_roundup_cpus(void) { local_irq_enable(); smp_call_function(kgdb_call_nmi_hook, NULL, 0); diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index 59c578f865aa..b0e804844be0 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -124,7 +124,7 @@ static int kgdb_call_nmi_hook(struct pt_regs *regs) } #ifdef CONFIG_SMP -void kgdb_roundup_cpus(unsigned long flags) +void kgdb_roundup_cpus(void) { smp_send_debugger_break(); } diff --git a/arch/sh/kernel/kgdb.c b/arch/sh/kernel/kgdb.c index 4f04c6638a4d..cc57630f6bf2 100644 --- a/arch/sh/kernel/kgdb.c +++ b/arch/sh/kernel/kgdb.c @@ -319,7 +319,7 @@ static void kgdb_call_nmi_hook(void *ignored) kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); } -void kgdb_roundup_cpus(unsigned long flags) +void kgdb_roundup_cpus(void) { local_irq_enable(); smp_call_function(kgdb_call_nmi_hook, NULL, 0); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 4792e08ad36b..f45d876983f1 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1014,7 +1014,7 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) } #ifdef CONFIG_KGDB -void kgdb_roundup_cpus(unsigned long flags) +void kgdb_roundup_cpus(void) { smp_cross_call(&xcall_kgdb_capture, 0, 0, 0); } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8e36f249646e..ac6291a4178d 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -422,21 +422,16 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs) #ifdef CONFIG_SMP /** * kgdb_roundup_cpus - Get other CPUs into a holding pattern - * @flags: Current IRQ state * * On SMP systems, we need to get the attention of the other CPUs * and get them be in a known state. This should do what is needed * to get the other CPUs to call kgdb_wait(). Note that on some arches, * the NMI approach is not used for rounding up all the CPUs. For example, - * in case of MIPS, smp_call_function() is used to roundup CPUs. In - * this case, we have to make sure that interrupts are enabled before - * calling smp_call_function(). The argument to this function is - * the flags that will be used when restoring the interrupts. There is - * local_irq_save() call before kgdb_roundup_cpus(). + * in case of MIPS, smp_call_function() is used to roundup CPUs. * * On non-SMP systems, this is not called. */ -void kgdb_roundup_cpus(unsigned long flags) +void kgdb_roundup_cpus(void) { apic->send_IPI_allbutself(APIC_DM_NMI); } diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index e465bb15912d..05e5b2eb0d32 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -178,21 +178,16 @@ kgdb_arch_handle_exception(int vector, int signo, int err_code, /** * kgdb_roundup_cpus - Get other CPUs into a holding pattern - * @flags: Current IRQ state * * On SMP systems, we need to get the attention of the other CPUs * and get them into a known state. This should do what is needed * to get the other CPUs to call kgdb_wait(). Note that on some arches, * the NMI approach is not used for rounding up all the CPUs. For example, - * in case of MIPS, smp_call_function() is used to roundup CPUs. In - * this case, we have to make sure that interrupts are enabled before - * calling smp_call_function(). The argument to this function is - * the flags that will be used when restoring the interrupts. There is - * local_irq_save() call before kgdb_roundup_cpus(). + * in case of MIPS, smp_call_function() is used to roundup CPUs. * * On non-SMP systems, this is not called. */ -extern void kgdb_roundup_cpus(unsigned long flags); +extern void kgdb_roundup_cpus(void); /** * kgdb_arch_set_pc - Generic call back to the program counter diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 65c0f1363788..f3cadda45f07 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -593,7 +593,7 @@ return_normal: /* Signal the other CPUs to enter kgdb_wait() */ else if ((!kgdb_single_step) && kgdb_do_roundup) - kgdb_roundup_cpus(flags); + kgdb_roundup_cpus(); #endif /* -- cgit From 3cd99ac3559855f69afbc1d5080e17eaa12394ff Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 4 Dec 2018 19:38:26 -0800 Subject: kgdb: Fix kgdb_roundup_cpus() for arches who used smp_call_function() When I had lockdep turned on and dropped into kgdb I got a nice splat on my system. Specifically it hit: DEBUG_LOCKS_WARN_ON(current->hardirq_context) Specifically it looked like this: sysrq: SysRq : DEBUG ------------[ cut here ]------------ DEBUG_LOCKS_WARN_ON(current->hardirq_context) WARNING: CPU: 0 PID: 0 at .../kernel/locking/lockdep.c:2875 lockdep_hardirqs_on+0xf0/0x160 CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.19.0 #27 pstate: 604003c9 (nZCv DAIF +PAN -UAO) pc : lockdep_hardirqs_on+0xf0/0x160 ... Call trace: lockdep_hardirqs_on+0xf0/0x160 trace_hardirqs_on+0x188/0x1ac kgdb_roundup_cpus+0x14/0x3c kgdb_cpu_enter+0x53c/0x5cc kgdb_handle_exception+0x180/0x1d4 kgdb_compiled_brk_fn+0x30/0x3c brk_handler+0x134/0x178 do_debug_exception+0xfc/0x178 el1_dbg+0x18/0x78 kgdb_breakpoint+0x34/0x58 sysrq_handle_dbg+0x54/0x5c __handle_sysrq+0x114/0x21c handle_sysrq+0x30/0x3c qcom_geni_serial_isr+0x2dc/0x30c ... ... irq event stamp: ...45 hardirqs last enabled at (...44): [...] __do_softirq+0xd8/0x4e4 hardirqs last disabled at (...45): [...] el1_irq+0x74/0x130 softirqs last enabled at (...42): [...] _local_bh_enable+0x2c/0x34 softirqs last disabled at (...43): [...] irq_exit+0xa8/0x100 ---[ end trace adf21f830c46e638 ]--- Looking closely at it, it seems like a really bad idea to be calling local_irq_enable() in kgdb_roundup_cpus(). If nothing else that seems like it could violate spinlock semantics and cause a deadlock. Instead, let's use a private csd alongside smp_call_function_single_async() to round up the other CPUs. Using smp_call_function_single_async() doesn't require interrupts to be enabled so we can remove the offending bit of code. In order to avoid duplicating this across all the architectures that use the default kgdb_roundup_cpus(), we'll add a "weak" implementation to debug_core.c. Looking at all the people who previously had copies of this code, there were a few variants. I've attempted to keep the variants working like they used to. Specifically: * For arch/arc we passed NULL to kgdb_nmicallback() instead of get_irq_regs(). * For arch/mips there was a bit of extra code around kgdb_nmicallback() NOTE: In this patch we will still get into trouble if we try to round up a CPU that failed to round up before. We'll try to round it up again and potentially hang when we try to grab the csd lock. That's not new behavior but we'll still try to do better in a future patch. Suggested-by: Daniel Thompson Signed-off-by: Douglas Anderson Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Richard Kuo Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- arch/arc/kernel/kgdb.c | 10 ++-------- arch/arm/kernel/kgdb.c | 12 ------------ arch/arm64/kernel/kgdb.c | 12 ------------ arch/hexagon/kernel/kgdb.c | 27 --------------------------- arch/mips/kernel/kgdb.c | 9 +-------- arch/powerpc/kernel/kgdb.c | 4 ++-- arch/sh/kernel/kgdb.c | 12 ------------ include/linux/kgdb.h | 15 +++++++++++++-- kernel/debug/debug_core.c | 41 +++++++++++++++++++++++++++++++++++++++++ 9 files changed, 59 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c index 0932851028e0..68d9fe4b5aa7 100644 --- a/arch/arc/kernel/kgdb.c +++ b/arch/arc/kernel/kgdb.c @@ -192,18 +192,12 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) instruction_pointer(regs) = ip; } -static void kgdb_call_nmi_hook(void *ignored) +void kgdb_call_nmi_hook(void *ignored) { + /* Default implementation passes get_irq_regs() but we don't */ kgdb_nmicallback(raw_smp_processor_id(), NULL); } -void kgdb_roundup_cpus(void) -{ - local_irq_enable(); - smp_call_function(kgdb_call_nmi_hook, NULL, 0); - local_irq_disable(); -} - struct kgdb_arch arch_kgdb_ops = { /* breakpoint instruction: TRAP_S 0x3 */ #ifdef CONFIG_CPU_BIG_ENDIAN diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c index f21077b077be..d9a69e941463 100644 --- a/arch/arm/kernel/kgdb.c +++ b/arch/arm/kernel/kgdb.c @@ -170,18 +170,6 @@ static struct undef_hook kgdb_compiled_brkpt_hook = { .fn = kgdb_compiled_brk_fn }; -static void kgdb_call_nmi_hook(void *ignored) -{ - kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); -} - -void kgdb_roundup_cpus(void) -{ - local_irq_enable(); - smp_call_function(kgdb_call_nmi_hook, NULL, 0); - local_irq_disable(); -} - static int __kgdb_notify(struct die_args *args, unsigned long cmd) { struct pt_regs *regs = args->regs; diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 12c339ff6e75..da880247c734 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -284,18 +284,6 @@ static struct step_hook kgdb_step_hook = { .fn = kgdb_step_brk_fn }; -static void kgdb_call_nmi_hook(void *ignored) -{ - kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); -} - -void kgdb_roundup_cpus(void) -{ - local_irq_enable(); - smp_call_function(kgdb_call_nmi_hook, NULL, 0); - local_irq_disable(); -} - static int __kgdb_notify(struct die_args *args, unsigned long cmd) { struct pt_regs *regs = args->regs; diff --git a/arch/hexagon/kernel/kgdb.c b/arch/hexagon/kernel/kgdb.c index 012e0e230ac2..b95d12038a4e 100644 --- a/arch/hexagon/kernel/kgdb.c +++ b/arch/hexagon/kernel/kgdb.c @@ -115,33 +115,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc) instruction_pointer(regs) = pc; } -#ifdef CONFIG_SMP - -/** - * kgdb_roundup_cpus - Get other CPUs into a holding pattern - * - * On SMP systems, we need to get the attention of the other CPUs - * and get them be in a known state. This should do what is needed - * to get the other CPUs to call kgdb_wait(). Note that on some arches, - * the NMI approach is not used for rounding up all the CPUs. For example, - * in case of MIPS, smp_call_function() is used to roundup CPUs. - * - * On non-SMP systems, this is not called. - */ - -static void hexagon_kgdb_nmi_hook(void *ignored) -{ - kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); -} - -void kgdb_roundup_cpus(void) -{ - local_irq_enable(); - smp_call_function(hexagon_kgdb_nmi_hook, NULL, 0); - local_irq_disable(); -} -#endif - /* Not yet working */ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c index 2b05effc17b4..42f057a6c215 100644 --- a/arch/mips/kernel/kgdb.c +++ b/arch/mips/kernel/kgdb.c @@ -207,7 +207,7 @@ void arch_kgdb_breakpoint(void) ".set\treorder"); } -static void kgdb_call_nmi_hook(void *ignored) +void kgdb_call_nmi_hook(void *ignored) { mm_segment_t old_fs; @@ -219,13 +219,6 @@ static void kgdb_call_nmi_hook(void *ignored) set_fs(old_fs); } -void kgdb_roundup_cpus(void) -{ - local_irq_enable(); - smp_call_function(kgdb_call_nmi_hook, NULL, 0); - local_irq_disable(); -} - static int compute_signal(int tt) { struct hard_trap_info *ht; diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index b0e804844be0..b4ce54d73337 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -117,7 +117,7 @@ int kgdb_skipexception(int exception, struct pt_regs *regs) return kgdb_isremovedbreak(regs->nip); } -static int kgdb_call_nmi_hook(struct pt_regs *regs) +static int kgdb_debugger_ipi(struct pt_regs *regs) { kgdb_nmicallback(raw_smp_processor_id(), regs); return 0; @@ -502,7 +502,7 @@ int kgdb_arch_init(void) old__debugger_break_match = __debugger_break_match; old__debugger_fault_handler = __debugger_fault_handler; - __debugger_ipi = kgdb_call_nmi_hook; + __debugger_ipi = kgdb_debugger_ipi; __debugger = kgdb_debugger; __debugger_bpt = kgdb_handle_breakpoint; __debugger_sstep = kgdb_singlestep; diff --git a/arch/sh/kernel/kgdb.c b/arch/sh/kernel/kgdb.c index cc57630f6bf2..14e012ad7c57 100644 --- a/arch/sh/kernel/kgdb.c +++ b/arch/sh/kernel/kgdb.c @@ -314,18 +314,6 @@ BUILD_TRAP_HANDLER(singlestep) local_irq_restore(flags); } -static void kgdb_call_nmi_hook(void *ignored) -{ - kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); -} - -void kgdb_roundup_cpus(void) -{ - local_irq_enable(); - smp_call_function(kgdb_call_nmi_hook, NULL, 0); - local_irq_disable(); -} - static int __kgdb_notify(struct die_args *args, unsigned long cmd) { int ret; diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 05e5b2eb0d32..24422865cd18 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -176,14 +176,25 @@ kgdb_arch_handle_exception(int vector, int signo, int err_code, char *remcom_out_buffer, struct pt_regs *regs); +/** + * kgdb_call_nmi_hook - Call kgdb_nmicallback() on the current CPU + * @ignored: This parameter is only here to match the prototype. + * + * If you're using the default implementation of kgdb_roundup_cpus() + * this function will be called per CPU. If you don't implement + * kgdb_call_nmi_hook() a default will be used. + */ + +extern void kgdb_call_nmi_hook(void *ignored); + /** * kgdb_roundup_cpus - Get other CPUs into a holding pattern * * On SMP systems, we need to get the attention of the other CPUs * and get them into a known state. This should do what is needed * to get the other CPUs to call kgdb_wait(). Note that on some arches, - * the NMI approach is not used for rounding up all the CPUs. For example, - * in case of MIPS, smp_call_function() is used to roundup CPUs. + * the NMI approach is not used for rounding up all the CPUs. Normally + * those architectures can just not implement this and get the default. * * On non-SMP systems, this is not called. */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index f3cadda45f07..10db2833a423 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -220,6 +221,46 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs) return 0; } +#ifdef CONFIG_SMP + +/* + * Default (weak) implementation for kgdb_roundup_cpus + */ + +static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd); + +void __weak kgdb_call_nmi_hook(void *ignored) +{ + /* + * NOTE: get_irq_regs() is supposed to get the registers from + * before the IPI interrupt happened and so is supposed to + * show where the processor was. In some situations it's + * possible we might be called without an IPI, so it might be + * safer to figure out how to make kgdb_breakpoint() work + * properly here. + */ + kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); +} + +void __weak kgdb_roundup_cpus(void) +{ + call_single_data_t *csd; + int this_cpu = raw_smp_processor_id(); + int cpu; + + for_each_online_cpu(cpu) { + /* No need to roundup ourselves */ + if (cpu == this_cpu) + continue; + + csd = &per_cpu(kgdb_roundup_csd, cpu); + csd->func = kgdb_call_nmi_hook; + smp_call_function_single_async(cpu, csd); + } +} + +#endif + /* * Some architectures need cache flushes when we set/clear a * breakpoint: -- cgit From 87b095928584da7d5cb3149016f00b0b139c2292 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 4 Dec 2018 19:38:27 -0800 Subject: kgdb: Don't round up a CPU that failed rounding up before If we're using the default implementation of kgdb_roundup_cpus() that uses smp_call_function_single_async() we can end up hanging kgdb_roundup_cpus() if we try to round up a CPU that failed to round up before. Specifically smp_call_function_single_async() will try to wait on the csd lock for the CPU that we're trying to round up. If the previous round up never finished then that lock could still be held and we'll just sit there hanging. There's not a lot of use trying to round up a CPU that failed to round up before. Let's keep a flag that indicates whether the CPU started but didn't finish to round up before. If we see that flag set then we'll skip the next round up. In general we have a few goals here: - We never want to end up calling smp_call_function_single_async() when the csd is still locked. This is accomplished because flush_smp_call_function_queue() unlocks the csd _before_ invoking the callback. That means that when kgdb_nmicallback() runs we know for sure the the csd is no longer locked. Thus when we set "rounding_up = false" we know for sure that the csd is unlocked. - If there are no timeouts rounding up we should never skip a round up. NOTE #1: In general trying to continue running after failing to round up CPUs doesn't appear to be supported in the debugger. When I simulate this I find that kdb reports "Catastrophic error detected" when I try to continue. I can overrule and continue anyway, but it should be noted that we may be entering the land of dragons here. Possibly the "Catastrophic error detected" was added _because_ of the future failure to round up, but even so this is an area of the code that hasn't been strongly tested. NOTE #2: I did a bit of testing before and after this change. I introduced a 10 second hang in the kernel while holding a spinlock that I could invoke on a certain CPU with 'taskset -c 3 cat /sys/...". Before this change if I did: - Invoke hang - Enter debugger - g (which warns about Catastrophic error, g again to go anyway) - g - Enter debugger ...I'd hang the rest of the 10 seconds without getting a debugger prompt. After this change I end up in the debugger the 2nd time after only 1 second with the standard warning about 'Timed out waiting for secondary CPUs.' I'll also note that once the CPU finished waiting I could actually debug it (aka "btc" worked) I won't promise that everything works perfectly if the errant CPU comes back at just the wrong time (like as we're entering or exiting the debugger) but it certainly seems like an improvement. NOTE #3: setting 'kgdb_info[cpu].rounding_up = false' is in kgdb_nmicallback() instead of kgdb_call_nmi_hook() because some implementations override kgdb_call_nmi_hook(). It shouldn't hurt to have it in kgdb_nmicallback() in any case. NOTE #4: this logic is really only needed because there is no API call like "smp_try_call_function_single_async()" or "smp_csd_is_locked()". If such an API existed then we'd use it instead, but it seemed a bit much to add an API like this just for kgdb. Signed-off-by: Douglas Anderson Acked-by: Daniel Thompson Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 20 +++++++++++++++++++- kernel/debug/debug_core.h | 1 + 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 10db2833a423..1fb8b239e567 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -247,6 +247,7 @@ void __weak kgdb_roundup_cpus(void) call_single_data_t *csd; int this_cpu = raw_smp_processor_id(); int cpu; + int ret; for_each_online_cpu(cpu) { /* No need to roundup ourselves */ @@ -254,8 +255,23 @@ void __weak kgdb_roundup_cpus(void) continue; csd = &per_cpu(kgdb_roundup_csd, cpu); + + /* + * If it didn't round up last time, don't try again + * since smp_call_function_single_async() will block. + * + * If rounding_up is false then we know that the + * previous call must have at least started and that + * means smp_call_function_single_async() won't block. + */ + if (kgdb_info[cpu].rounding_up) + continue; + kgdb_info[cpu].rounding_up = true; + csd->func = kgdb_call_nmi_hook; - smp_call_function_single_async(cpu, csd); + ret = smp_call_function_single_async(cpu, csd); + if (ret) + kgdb_info[cpu].rounding_up = false; } } @@ -788,6 +804,8 @@ int kgdb_nmicallback(int cpu, void *regs) struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; + kgdb_info[cpu].rounding_up = false; + memset(ks, 0, sizeof(struct kgdb_state)); ks->cpu = cpu; ks->linux_regs = regs; diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 127d9bc49fb4..b4a7c326d546 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -42,6 +42,7 @@ struct debuggerinfo_struct { int ret_state; int irq_depth; int enter_kgdb; + bool rounding_up; }; extern struct debuggerinfo_struct kgdb_info[]; -- cgit From 162bc7f5afd75b72acbe3c5f3488ef7e64a3fe36 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 4 Dec 2018 19:38:28 -0800 Subject: kdb: Don't back trace on a cpu that didn't round up If you have a CPU that fails to round up and then run 'btc' you'll end up crashing in kdb becaue we dereferenced NULL. Let's add a check. It's wise to also set the task to NULL when leaving the debugger so that if we fail to round up on a later entry into the debugger we won't backtrace a stale task. Signed-off-by: Douglas Anderson Acked-by: Daniel Thompson Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 4 ++++ kernel/debug/kdb/kdb_bt.c | 11 ++++++++++- kernel/debug/kdb/kdb_debugger.c | 7 ------- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1fb8b239e567..5cc608de6883 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -592,6 +592,8 @@ return_normal: arch_kgdb_ops.correct_hw_break(); if (trace_on) tracing_on(); + kgdb_info[cpu].debuggerinfo = NULL; + kgdb_info[cpu].task = NULL; kgdb_info[cpu].exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); kgdb_info[cpu].enter_kgdb--; @@ -724,6 +726,8 @@ kgdb_restore: if (trace_on) tracing_on(); + kgdb_info[cpu].debuggerinfo = NULL; + kgdb_info[cpu].task = NULL; kgdb_info[cpu].exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); kgdb_info[cpu].enter_kgdb--; diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 7921ae4fca8d..7e2379aa0a1e 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -186,7 +186,16 @@ kdb_bt(int argc, const char **argv) kdb_printf("btc: cpu status: "); kdb_parse("cpu\n"); for_each_online_cpu(cpu) { - sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); + void *kdb_tsk = KDB_TSK(cpu); + + /* If a CPU failed to round up we could be here */ + if (!kdb_tsk) { + kdb_printf("WARNING: no task for cpu %ld\n", + cpu); + continue; + } + + sprintf(buf, "btt 0x%px\n", kdb_tsk); kdb_parse(buf); touch_nmi_watchdog(); } diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 15e1a7af5dd0..53a0df6e4d92 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -118,13 +118,6 @@ int kdb_stub(struct kgdb_state *ks) kdb_bp_remove(); KDB_STATE_CLEAR(DOING_SS); KDB_STATE_SET(PAGER); - /* zero out any offline cpu data */ - for_each_present_cpu(i) { - if (!cpu_online(i)) { - kgdb_info[i].debuggerinfo = NULL; - kgdb_info[i].task = NULL; - } - } if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) { ks->pass_exception = 1; KDB_FLAG_SET(CATASTROPHIC); -- cgit From 7faedcd4de4356dfcda30841a5ffef3bf35fa06c Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Fri, 20 Jul 2018 11:23:37 +0200 Subject: kdb: use bool for binary state indicators defcmd_in_progress is the state trace for command group processing - within a command group or not - usable is an indicator if a command set is valid (allocated/non-empty) - so use a bool for those binary indication here. Signed-off-by: Nicholas Mc Guire Reviewed-by: Daniel Thompson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index d72b32c66f7d..82a3b32a7cfc 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -658,7 +658,7 @@ static void kdb_cmderror(int diag) */ struct defcmd_set { int count; - int usable; + bool usable; char *name; char *usage; char *help; @@ -666,7 +666,7 @@ struct defcmd_set { }; static struct defcmd_set *defcmd_set; static int defcmd_set_count; -static int defcmd_in_progress; +static bool defcmd_in_progress; /* Forward references */ static int kdb_exec_defcmd(int argc, const char **argv); @@ -676,9 +676,9 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) struct defcmd_set *s = defcmd_set + defcmd_set_count - 1; char **save_command = s->command; if (strcmp(argv0, "endefcmd") == 0) { - defcmd_in_progress = 0; + defcmd_in_progress = false; if (!s->count) - s->usable = 0; + s->usable = false; if (s->usable) /* macros are always safe because when executed each * internal command re-enters kdb_parse() and is @@ -695,7 +695,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) if (!s->command) { kdb_printf("Could not allocate new kdb_defcmd table for %s\n", cmdstr); - s->usable = 0; + s->usable = false; return KDB_NOTIMP; } memcpy(s->command, save_command, s->count * sizeof(*(s->command))); @@ -737,7 +737,7 @@ static int kdb_defcmd(int argc, const char **argv) defcmd_set_count * sizeof(*defcmd_set)); s = defcmd_set + defcmd_set_count; memset(s, 0, sizeof(*s)); - s->usable = 1; + s->usable = true; s->name = kdb_strdup(argv[1], GFP_KDB); if (!s->name) goto fail_name; @@ -756,7 +756,7 @@ static int kdb_defcmd(int argc, const char **argv) s->help[strlen(s->help)-1] = '\0'; } ++defcmd_set_count; - defcmd_in_progress = 1; + defcmd_in_progress = true; kfree(save_defcmd_set); return 0; fail_help: -- cgit From c40f7d74c741a907cfaeb73a7697081881c497d0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 27 Dec 2018 13:46:17 -0800 Subject: sched/fair: Fix infinite loop in update_blocked_averages() by reverting a9e7f6544b9c Zhipeng Xie, Xie XiuQi and Sargun Dhillon reported lockups in the scheduler under high loads, starting at around the v4.18 time frame, and Zhipeng Xie tracked it down to bugs in the rq->leaf_cfs_rq_list manipulation. Do a (manual) revert of: a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path") It turns out that the list_del_leaf_cfs_rq() introduced by this commit is a surprising property that was not considered in followup commits such as: 9c2791f936ef ("sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list") As Vincent Guittot explains: "I think that there is a bigger problem with commit a9e7f6544b9c and cfs_rq throttling: Let take the example of the following topology TG2 --> TG1 --> root: 1) The 1st time a task is enqueued, we will add TG2 cfs_rq then TG1 cfs_rq to leaf_cfs_rq_list and we are sure to do the whole branch in one path because it has never been used and can't be throttled so tmp_alone_branch will point to leaf_cfs_rq_list at the end. 2) Then TG1 is throttled 3) and we add TG3 as a new child of TG1. 4) The 1st enqueue of a task on TG3 will add TG3 cfs_rq just before TG1 cfs_rq and tmp_alone_branch will stay on rq->leaf_cfs_rq_list. With commit a9e7f6544b9c, we can del a cfs_rq from rq->leaf_cfs_rq_list. So if the load of TG1 cfs_rq becomes NULL before step 2) above, TG1 cfs_rq is removed from the list. Then at step 4), TG3 cfs_rq is added at the beginning of rq->leaf_cfs_rq_list but tmp_alone_branch still points to TG3 cfs_rq because its throttled parent can't be enqueued when the lock is released. tmp_alone_branch doesn't point to rq->leaf_cfs_rq_list whereas it should. So if TG3 cfs_rq is removed or destroyed before tmp_alone_branch points on another TG cfs_rq, the next TG cfs_rq that will be added, will be linked outside rq->leaf_cfs_rq_list - which is bad. In addition, we can break the ordering of the cfs_rq in rq->leaf_cfs_rq_list but this ordering is used to update and propagate the update from leaf down to root." Instead of trying to work through all these cases and trying to reproduce the very high loads that produced the lockup to begin with, simplify the code temporarily by reverting a9e7f6544b9c - which change was clearly not thought through completely. This (hopefully) gives us a kernel that doesn't lock up so people can continue to enjoy their holidays without worrying about regressions. ;-) [ mingo: Wrote changelog, fixed weird spelling in code comment while at it. ] Analyzed-by: Xie XiuQi Analyzed-by: Vincent Guittot Reported-by: Zhipeng Xie Reported-by: Sargun Dhillon Reported-by: Xie XiuQi Tested-by: Zhipeng Xie Tested-by: Sargun Dhillon Signed-off-by: Linus Torvalds Acked-by: Vincent Guittot Cc: # v4.13+ Cc: Bin Li Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Fixes: a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path") Link: http://lkml.kernel.org/r/1545879866-27809-1-git-send-email-xiexiuqi@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 43 +++++++++---------------------------------- 1 file changed, 9 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d1907506318a..6483834f1278 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -352,10 +352,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) } } -/* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ - list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ - leaf_cfs_rq_list) +/* Iterate through all leaf cfs_rq's on a runqueue: */ +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline struct cfs_rq * @@ -447,8 +446,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ - for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) static inline struct sched_entity *parent_entity(struct sched_entity *se) { @@ -7647,27 +7646,10 @@ static inline bool others_have_blocked(struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load.weight) - return false; - - if (cfs_rq->avg.load_sum) - return false; - - if (cfs_rq->avg.util_sum) - return false; - - if (cfs_rq->avg.runnable_load_sum) - return false; - - return true; -} - static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq, *pos; + struct cfs_rq *cfs_rq; const struct sched_class *curr_class; struct rq_flags rf; bool done = true; @@ -7679,7 +7661,7 @@ static void update_blocked_averages(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { + for_each_leaf_cfs_rq(rq, cfs_rq) { struct sched_entity *se; /* throttled entities do not contribute to load */ @@ -7694,13 +7676,6 @@ static void update_blocked_averages(int cpu) if (se && !skip_blocked_update(se)) update_load_avg(cfs_rq_of(se), se, 0); - /* - * There can be a lot of idle CPU cgroups. Don't let fully - * decayed cfs_rqs linger on the list. - */ - if (cfs_rq_is_decayed(cfs_rq)) - list_del_leaf_cfs_rq(cfs_rq); - /* Don't need periodic decay once load/util_avg are null */ if (cfs_rq_has_blocked(cfs_rq)) done = false; @@ -10570,10 +10545,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { - struct cfs_rq *cfs_rq, *pos; + struct cfs_rq *cfs_rq; rcu_read_lock(); - for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) + for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } -- cgit From c08435ec7f2bc8f4109401f696fd55159b4b40cb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:27 +0100 Subject: bpf: move {prev_,}insn_idx into verifier env Move prev_insn_idx and insn_idx from the do_check() function into the verifier environment, so they can be read inside the various helper functions for handling the instructions. It's easier to put this into the environment rather than changing all call-sites only to pass it along. insn_idx is useful in particular since this later on allows to hold state in env->insn_aux_data[env->insn_idx]. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/verifier.c | 76 ++++++++++++++++++++++---------------------- 2 files changed, 40 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c233efc106c6..3f84f3e87704 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -212,6 +212,8 @@ struct bpf_subprog_info { * one verifier_env per bpf_check() call */ struct bpf_verifier_env { + u32 insn_idx; + u32 prev_insn_idx; struct bpf_prog *prog; /* eBPF program being verified */ const struct bpf_verifier_ops *ops; struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 71d86e3024ae..afa8515bbb34 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5650,7 +5650,6 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; int insn_cnt = env->prog->len, i; - int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; @@ -5670,19 +5669,19 @@ static int do_check(struct bpf_verifier_env *env) BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, 0 /* subprogno, zero == main subprog */); - insn_idx = 0; + for (;;) { struct bpf_insn *insn; u8 class; int err; - if (insn_idx >= insn_cnt) { + if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", - insn_idx, insn_cnt); + env->insn_idx, insn_cnt); return -EFAULT; } - insn = &insns[insn_idx]; + insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { @@ -5692,7 +5691,7 @@ static int do_check(struct bpf_verifier_env *env) return -E2BIG; } - err = is_state_visited(env, insn_idx); + err = is_state_visited(env, env->insn_idx); if (err < 0) return err; if (err == 1) { @@ -5700,9 +5699,9 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level) { if (do_print_state) verbose(env, "\nfrom %d to %d: safe\n", - prev_insn_idx, insn_idx); + env->prev_insn_idx, env->insn_idx); else - verbose(env, "%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", env->insn_idx); } goto process_bpf_exit; } @@ -5715,10 +5714,10 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level > 1 || (env->log.level && do_print_state)) { if (env->log.level > 1) - verbose(env, "%d:", insn_idx); + verbose(env, "%d:", env->insn_idx); else verbose(env, "\nfrom %d to %d:", - prev_insn_idx, insn_idx); + env->prev_insn_idx, env->insn_idx); print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } @@ -5729,20 +5728,20 @@ static int do_check(struct bpf_verifier_env *env) .private_data = env, }; - verbose_linfo(env, insn_idx, "; "); - verbose(env, "%d: ", insn_idx); + verbose_linfo(env, env->insn_idx, "; "); + verbose(env, "%d: ", env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } if (bpf_prog_is_dev_bound(env->prog->aux)) { - err = bpf_prog_offload_verify_insn(env, insn_idx, - prev_insn_idx); + err = bpf_prog_offload_verify_insn(env, env->insn_idx, + env->prev_insn_idx); if (err) return err; } regs = cur_regs(env); - env->insn_aux_data[insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); @@ -5768,13 +5767,13 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, - insn->dst_reg, false); + err = check_mem_access(env, env->insn_idx, insn->src_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, insn->dst_reg, false); if (err) return err; - prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { /* saw a valid insn @@ -5799,10 +5798,10 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn_idx, insn); + err = check_xadd(env, env->insn_idx, insn); if (err) return err; - insn_idx++; + env->insn_idx++; continue; } @@ -5818,13 +5817,13 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - insn->src_reg, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, insn->src_reg, false); if (err) return err; - prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_dst_type == NOT_INIT) { *prev_dst_type = dst_reg_type; @@ -5852,9 +5851,9 @@ static int do_check(struct bpf_verifier_env *env) } /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - -1, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, -1, false); if (err) return err; @@ -5872,9 +5871,9 @@ static int do_check(struct bpf_verifier_env *env) } if (insn->src_reg == BPF_PSEUDO_CALL) - err = check_func_call(env, insn, &insn_idx); + err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, insn_idx); + err = check_helper_call(env, insn->imm, env->insn_idx); if (err) return err; @@ -5887,7 +5886,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - insn_idx += insn->off + 1; + env->insn_idx += insn->off + 1; continue; } else if (opcode == BPF_EXIT) { @@ -5901,8 +5900,8 @@ static int do_check(struct bpf_verifier_env *env) if (state->curframe) { /* exit from nested function */ - prev_insn_idx = insn_idx; - err = prepare_func_exit(env, &insn_idx); + env->prev_insn_idx = env->insn_idx; + err = prepare_func_exit(env, &env->insn_idx); if (err) return err; do_print_state = true; @@ -5932,7 +5931,8 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - err = pop_stack(env, &prev_insn_idx, &insn_idx); + err = pop_stack(env, &env->prev_insn_idx, + &env->insn_idx); if (err < 0) { if (err != -ENOENT) return err; @@ -5942,7 +5942,7 @@ process_bpf_exit: continue; } } else { - err = check_cond_jmp_op(env, insn, &insn_idx); + err = check_cond_jmp_op(env, insn, &env->insn_idx); if (err) return err; } @@ -5959,8 +5959,8 @@ process_bpf_exit: if (err) return err; - insn_idx++; - env->insn_aux_data[insn_idx].seen = true; + env->insn_idx++; + env->insn_aux_data[env->insn_idx].seen = true; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -5970,7 +5970,7 @@ process_bpf_exit: return -EINVAL; } - insn_idx++; + env->insn_idx++; } verbose(env, "processed %d insns (limit %d), stack depth ", -- cgit From 144cd91c4c2bced6eb8a7e25e590f6618a11e854 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:28 +0100 Subject: bpf: move tmp variable into ax register in interpreter This change moves the on-stack 64 bit tmp variable in ___bpf_prog_run() into the hidden ax register. The latter is currently only used in JITs for constant blinding as a temporary scratch register, meaning the BPF interpreter will never see the use of ax. Therefore it is safe to use it for the cases where tmp has been used earlier. This is needed to later on allow restricted hidden use of ax in both interpreter and JITs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 ++- kernel/bpf/core.c | 34 +++++++++++++++++----------------- 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 8c8544b375eb..84a6a98f8328 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -60,7 +60,8 @@ struct sock_reuseport; * constants. See JIT pre-step in bpf_jit_blind_constants(). */ #define BPF_REG_AX MAX_BPF_REG -#define MAX_BPF_JIT_REG (MAX_BPF_REG + 1) +#define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) +#define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 38de580abcc2..a34312a5eea2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -54,6 +54,7 @@ #define DST regs[insn->dst_reg] #define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] +#define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] #define IMM insn->imm @@ -1188,7 +1189,6 @@ bool bpf_opcode_in_insntable(u8 code) */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { - u64 tmp; #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { @@ -1268,36 +1268,36 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - div64_u64_rem(DST, SRC, &tmp); - DST = tmp; + div64_u64_rem(DST, SRC, &AX); + DST = AX; CONT; ALU_MOD_X: - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); + AX = (u32) DST; + DST = do_div(AX, (u32) SRC); CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &tmp); - DST = tmp; + div64_u64_rem(DST, IMM, &AX); + DST = AX; CONT; ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); + AX = (u32) DST; + DST = do_div(AX, (u32) IMM); CONT; ALU64_DIV_X: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) SRC); + DST = (u32) AX; CONT; ALU64_DIV_K: DST = div64_u64(DST, IMM); CONT; ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) IMM); + DST = (u32) AX; CONT; ALU_END_TO_BE: switch (IMM) { @@ -1553,7 +1553,7 @@ STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ @@ -1566,7 +1566,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ -- cgit From 9b73bfdd08e73231d6a90ae6db4b46b3fbf56c30 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:29 +0100 Subject: bpf: enable access to ax register also from verifier rewrite Right now we are using BPF ax register in JIT for constant blinding as well as in interpreter as temporary variable. Verifier will not be able to use it simply because its use will get overridden from the former in bpf_jit_blind_insn(). However, it can be made to work in that blinding will be skipped if there is prior use in either source or destination register on the instruction. Taking constraints of ax into account, the verifier is then open to use it in rewrites under some constraints. Note, ax register already has mappings in every eBPF JIT. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 7 +------ kernel/bpf/core.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 84a6a98f8328..ad106d845b22 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -53,12 +53,7 @@ struct sock_reuseport; #define BPF_REG_D BPF_REG_8 /* data, callee-saved */ #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ -/* Kernel hidden auxiliary/helper register for hardening step. - * Only used by eBPF JITs. It's nothing more than a temporary - * register that JITs use internally, only that here it's part - * of eBPF instructions that have been rewritten for blinding - * constants. See JIT pre-step in bpf_jit_blind_constants(). - */ +/* Kernel hidden auxiliary/helper register. */ #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a34312a5eea2..f908b9356025 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -858,6 +858,26 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + /* Constraints on AX register: + * + * AX register is inaccessible from user space. It is mapped in + * all JITs, and used here for constant blinding rewrites. It is + * typically "stateless" meaning its contents are only valid within + * the executed instruction, but not across several instructions. + * There are a few exceptions however which are further detailed + * below. + * + * Constant blinding is only used by JITs, not in the interpreter. + * The interpreter uses AX in some occasions as a local temporary + * register e.g. in DIV or MOD instructions. + * + * In restricted circumstances, the verifier can also use the AX + * register for rewrites as long as they do not interfere with + * the above cases! + */ + if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) + goto out; + if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { -- cgit From 0d6303db7970e6f56ae700fa07e11eb510cda125 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:30 +0100 Subject: bpf: restrict map value pointer arithmetic for unprivileged Restrict map value pointer arithmetic for unprivileged users in that arithmetic itself must not go out of bounds as opposed to the actual access later on. Therefore after each adjust_ptr_min_max_vals() with a map value pointer as a destination it will simulate a check_map_access() of 1 byte on the destination and once that fails the program is rejected for unprivileged program loads. We use this later on for masking any pointer arithmetic with the remainder of the map value space. The likelihood of breaking any existing real-world unprivileged eBPF program is very small for this corner case. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index afa8515bbb34..4da8c73e168f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3249,6 +3249,17 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); + + /* For unprivileged we require that resulting offset must be in bounds + * in order to be able to sanitize access later on. + */ + if (!env->allow_ptr_leaks && dst_reg->type == PTR_TO_MAP_VALUE && + check_map_access(env, dst, dst_reg->off, 1, false)) { + verbose(env, "R%d pointer arithmetic of map value goes out of range, prohibited for !root\n", + dst); + return -EACCES; + } + return 0; } -- cgit From e4298d25830a866cc0f427d4bccb858e76715859 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:31 +0100 Subject: bpf: restrict stack pointer arithmetic for unprivileged Restrict stack pointer arithmetic for unprivileged users in that arithmetic itself must not go out of bounds as opposed to the actual access later on. Therefore after each adjust_ptr_min_max_vals() with a stack pointer as a destination we simulate a check_stack_access() of 1 byte on the destination and once that fails the program is rejected for unprivileged program loads. This is analog to map value pointer arithmetic and needed for masking later on. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 63 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4da8c73e168f..9ac205d1b8b7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1387,6 +1387,31 @@ static int check_stack_read(struct bpf_verifier_env *env, } } +static int check_stack_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int off, int size) +{ + /* Stack accesses must be at a fixed offset, so that we + * can determine what type of data were returned. See + * check_stack_read(). + */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable stack access var_off=%s off=%d size=%d", + tn_buf, off, size); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose(env, "invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -1954,24 +1979,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (reg->type == PTR_TO_STACK) { - /* stack accesses must be at a fixed offset, so that we can - * determine what type of data were returned. - * See check_stack_read(). - */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d", - tn_buf, off, size); - return -EACCES; - } off += reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK) { - verbose(env, "invalid stack off=%d size=%d\n", off, - size); - return -EACCES; - } + err = check_stack_access(env, reg, off, size); + if (err) + return err; state = func(env, reg); err = update_stack_depth(env, state, off); @@ -3253,11 +3264,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. */ - if (!env->allow_ptr_leaks && dst_reg->type == PTR_TO_MAP_VALUE && - check_map_access(env, dst, dst_reg->off, 1, false)) { - verbose(env, "R%d pointer arithmetic of map value goes out of range, prohibited for !root\n", - dst); - return -EACCES; + if (!env->allow_ptr_leaks) { + if (dst_reg->type == PTR_TO_MAP_VALUE && + check_map_access(env, dst, dst_reg->off, 1, false)) { + verbose(env, "R%d pointer arithmetic of map value goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } else if (dst_reg->type == PTR_TO_STACK && + check_stack_access(env, dst_reg, dst_reg->off + + dst_reg->var_off.value, 1)) { + verbose(env, "R%d stack pointer arithmetic goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } } return 0; -- cgit From 9d7eceede769f90b66cfa06ad5b357140d5141ed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:32 +0100 Subject: bpf: restrict unknown scalars of mixed signed bounds for unprivileged For unknown scalars of mixed signed bounds, meaning their smin_value is negative and their smax_value is positive, we need to reject arithmetic with pointer to map value. For unprivileged the goal is to mask every map pointer arithmetic and this cannot reliably be done when it is unknown at verification time whether the scalar value is negative or positive. Given this is a corner case, the likelihood of breaking should be very small. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9ac205d1b8b7..eebbc03e5af2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3081,8 +3081,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; + u32 dst = insn->dst_reg, src = insn->src_reg; u8 opcode = BPF_OP(insn->code); - u32 dst = insn->dst_reg; dst_reg = ®s[dst]; @@ -3115,6 +3115,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; + case PTR_TO_MAP_VALUE: + if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { + verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", + off_reg == dst_reg ? dst : src); + return -EACCES; + } + /* fall-through */ default: break; } -- cgit From b7137c4eab85c1cf3d46acdde90ce1163b28c873 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:33 +0100 Subject: bpf: fix check_map_access smin_value test when pointer contains offset In check_map_access() we probe actual bounds through __check_map_access() with offset of reg->smin_value + off for lower bound and offset of reg->umax_value + off for the upper bound. However, even though the reg->smin_value could have a negative value, the final result of the sum with off could be positive when pointer arithmetic with known and unknown scalars is combined. In this case we reject the program with an error such as "R min value is negative, either use unsigned index or do a if (index >=0) check." even though the access itself would be fine. Therefore extend the check to probe whether the actual resulting reg->smin_value + off is less than zero. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eebbc03e5af2..8e5da1ce5da4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1443,13 +1443,17 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, */ if (env->log.level) print_verifier_state(env, state); + /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if (reg->smin_value < 0) { + if (reg->smin_value < 0 && + (reg->smin_value == S64_MIN || + (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || + reg->smin_value + off < 0)) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; -- cgit From 979d63d50c0c0f7bc537bf821e056cc9fe5abd38 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:34 +0100 Subject: bpf: prevent out of bounds speculation on pointer arithmetic Jann reported that the original commit back in b2157399cc98 ("bpf: prevent out-of-bounds speculation") was not sufficient to stop CPU from speculating out of bounds memory access: While b2157399cc98 only focussed on masking array map access for unprivileged users for tail calls and data access such that the user provided index gets sanitized from BPF program and syscall side, there is still a more generic form affected from BPF programs that applies to most maps that hold user data in relation to dynamic map access when dealing with unknown scalars or "slow" known scalars as access offset, for example: - Load a map value pointer into R6 - Load an index into R7 - Do a slow computation (e.g. with a memory dependency) that loads a limit into R8 (e.g. load the limit from a map for high latency, then mask it to make the verifier happy) - Exit if R7 >= R8 (mispredicted branch) - Load R0 = R6[R7] - Load R0 = R6[R0] For unknown scalars there are two options in the BPF verifier where we could derive knowledge from in order to guarantee safe access to the memory: i) While /<=/>= variants won't allow to derive any lower or upper bounds from the unknown scalar where it would be safe to add it to the map value pointer, it is possible through ==/!= test however. ii) another option is to transform the unknown scalar into a known scalar, for example, through ALU ops combination such as R &= followed by R |= or any similar combination where the original information from the unknown scalar would be destroyed entirely leaving R with a constant. The initial slow load still precedes the latter ALU ops on that register, so the CPU executes speculatively from that point. Once we have the known scalar, any compare operation would work then. A third option only involving registers with known scalars could be crafted as described in [0] where a CPU port (e.g. Slow Int unit) would be filled with many dependent computations such that the subsequent condition depending on its outcome has to wait for evaluation on its execution port and thereby executing speculatively if the speculated code can be scheduled on a different execution port, or any other form of mistraining as described in [1], for example. Given this is not limited to only unknown scalars, not only map but also stack access is affected since both is accessible for unprivileged users and could potentially be used for out of bounds access under speculation. In order to prevent any of these cases, the verifier is now sanitizing pointer arithmetic on the offset such that any out of bounds speculation would be masked in a way where the pointer arithmetic result in the destination register will stay unchanged, meaning offset masked into zero similar as in array_index_nospec() case. With regards to implementation, there are three options that were considered: i) new insn for sanitation, ii) push/pop insn and sanitation as inlined BPF, iii) reuse of ax register and sanitation as inlined BPF. Option i) has the downside that we end up using from reserved bits in the opcode space, but also that we would require each JIT to emit masking as native arch opcodes meaning mitigation would have slow adoption till everyone implements it eventually which is counter-productive. Option ii) and iii) have both in common that a temporary register is needed in order to implement the sanitation as inlined BPF since we are not allowed to modify the source register. While a push / pop insn in ii) would be useful to have in any case, it requires once again that every JIT needs to implement it first. While possible, amount of changes needed would also be unsuitable for a -stable patch. Therefore, the path which has fewer changes, less BPF instructions for the mitigation and does not require anything to be changed in the JITs is option iii) which this work is pursuing. The ax register is already mapped to a register in all JITs (modulo arm32 where it's mapped to stack as various other BPF registers there) and used in constant blinding for JITs-only so far. It can be reused for verifier rewrites under certain constraints. The interpreter's tmp "register" has therefore been remapped into extending the register set with hidden ax register and reusing that for a number of instructions that needed the prior temporary variable internally (e.g. div, mod). This allows for zero increase in stack space usage in the interpreter, and enables (restricted) generic use in rewrites otherwise as long as such a patchlet does not make use of these instructions. The sanitation mask is dynamic and relative to the offset the map value or stack pointer currently holds. There are various cases that need to be taken under consideration for the masking, e.g. such operation could look as follows: ptr += val or val += ptr or ptr -= val. Thus, the value to be sanitized could reside either in source or in destination register, and the limit is different depending on whether the ALU op is addition or subtraction and depending on the current known and bounded offset. The limit is derived as follows: limit := max_value_size - (smin_value + off). For subtraction: limit := umax_value + off. This holds because we do not allow any pointer arithmetic that would temporarily go out of bounds or would have an unknown value with mixed signed bounds where it is unclear at verification time whether the actual runtime value would be either negative or positive. For example, we have a derived map pointer value with constant offset and bounded one, so limit based on smin_value works because the verifier requires that statically analyzed arithmetic on the pointer must be in bounds, and thus it checks if resulting smin_value + off and umax_value + off is still within map value bounds at time of arithmetic in addition to time of access. Similarly, for the case of stack access we derive the limit as follows: MAX_BPF_STACK + off for subtraction and -off for the case of addition where off := ptr_reg->off + ptr_reg->var_off.value. Subtraction is a special case for the masking which can be in form of ptr += -val, ptr -= -val, or ptr -= val. In the first two cases where we know that the value is negative, we need to temporarily negate the value in order to do the sanitation on a positive value where we later swap the ALU op, and restore original source register if the value was in source. The sanitation of pointer arithmetic alone is still not fully sufficient as is, since a scenario like the following could happen ... PTR += 0x1000 (e.g. K-based imm) PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON PTR += 0x1000 PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON [...] ... which under speculation could end up as ... PTR += 0x1000 PTR -= 0 [ truncated by mitigation ] PTR += 0x1000 PTR -= 0 [ truncated by mitigation ] [...] ... and therefore still access out of bounds. To prevent such case, the verifier is also analyzing safety for potential out of bounds access under speculative execution. Meaning, it is also simulating pointer access under truncation. We therefore "branch off" and push the current verification state after the ALU operation with known 0 to the verification stack for later analysis. Given the current path analysis succeeded it is likely that the one under speculation can be pruned. In any case, it is also subject to existing complexity limits and therefore anything beyond this point will be rejected. In terms of pruning, it needs to be ensured that the verification state from speculative execution simulation must never prune a non-speculative execution path, therefore, we mark verifier state accordingly at the time of push_stack(). If verifier detects out of bounds access under speculative execution from one of the possible paths that includes a truncation, it will reject such program. Given we mask every reg-based pointer arithmetic for unprivileged programs, we've been looking into how it could affect real-world programs in terms of size increase. As the majority of programs are targeted for privileged-only use case, we've unconditionally enabled masking (with its alu restrictions on top of it) for privileged programs for the sake of testing in order to check i) whether they get rejected in its current form, and ii) by how much the number of instructions and size will increase. We've tested this by using Katran, Cilium and test_l4lb from the kernel selftests. For Katran we've evaluated balancer_kern.o, Cilium bpf_lxc.o and an older test object bpf_lxc_opt_-DUNKNOWN.o and l4lb we've used test_l4lb.o as well as test_l4lb_noinline.o. We found that none of the programs got rejected by the verifier with this change, and that impact is rather minimal to none. balancer_kern.o had 13,904 bytes (1,738 insns) xlated and 7,797 bytes JITed before and after the change. Most complex program in bpf_lxc.o had 30,544 bytes (3,817 insns) xlated and 18,538 bytes JITed before and after and none of the other tail call programs in bpf_lxc.o had any changes either. For the older bpf_lxc_opt_-DUNKNOWN.o object we found a small increase from 20,616 bytes (2,576 insns) and 12,536 bytes JITed before to 20,664 bytes (2,582 insns) and 12,558 bytes JITed after the change. Other programs from that object file had similar small increase. Both test_l4lb.o had no change and remained at 6,544 bytes (817 insns) xlated and 3,401 bytes JITed and for test_l4lb_noinline.o constant at 5,080 bytes (634 insns) xlated and 3,313 bytes JITed. This can be explained in that LLVM typically optimizes stack based pointer arithmetic by using K-based operations and that use of dynamic map access is not overly frequent. However, in future we may decide to optimize the algorithm further under known guarantees from branch and value speculation. Latter seems also unclear in terms of prediction heuristics that today's CPUs apply as well as whether there could be collisions in e.g. the predictor's Value History/Pattern Table for triggering out of bounds access, thus masking is performed unconditionally at this point but could be subject to relaxation later on. We were generally also brainstorming various other approaches for mitigation, but the blocker was always lack of available registers at runtime and/or overhead for runtime tracking of limits belonging to a specific pointer. Thus, we found this to be minimally intrusive under given constraints. With that in place, a simple example with sanitized access on unprivileged load at post-verification time looks as follows: # bpftool prog dump xlated id 282 [...] 28: (79) r1 = *(u64 *)(r7 +0) 29: (79) r2 = *(u64 *)(r7 +8) 30: (57) r1 &= 15 31: (79) r3 = *(u64 *)(r0 +4608) 32: (57) r3 &= 1 33: (47) r3 |= 1 34: (2d) if r2 > r3 goto pc+19 35: (b4) (u32) r11 = (u32) 20479 | 36: (1f) r11 -= r2 | Dynamic sanitation for pointer 37: (4f) r11 |= r2 | arithmetic with registers 38: (87) r11 = -r11 | containing bounded or known 39: (c7) r11 s>>= 63 | scalars in order to prevent 40: (5f) r11 &= r2 | out of bounds speculation. 41: (0f) r4 += r11 | 42: (71) r4 = *(u8 *)(r4 +0) 43: (6f) r4 <<= r1 [...] For the case where the scalar sits in the destination register as opposed to the source register, the following code is emitted for the above example: [...] 16: (b4) (u32) r11 = (u32) 20479 17: (1f) r11 -= r2 18: (4f) r11 |= r2 19: (87) r11 = -r11 20: (c7) r11 s>>= 63 21: (5f) r2 &= r11 22: (0f) r2 += r0 23: (61) r0 = *(u32 *)(r2 +0) [...] JIT blinding example with non-conflicting use of r10: [...] d5: je 0x0000000000000106 _ d7: mov 0x0(%rax),%edi | da: mov $0xf153246,%r10d | Index load from map value and e0: xor $0xf153259,%r10 | (const blinded) mask with 0x1f. e7: and %r10,%rdi |_ ea: mov $0x2f,%r10d | f0: sub %rdi,%r10 | Sanitized addition. Both use r10 f3: or %rdi,%r10 | but do not interfere with each f6: neg %r10 | other. (Neither do these instructions f9: sar $0x3f,%r10 | interfere with the use of ax as temp fd: and %r10,%rdi | in interpreter.) 100: add %rax,%rdi |_ 103: mov 0x0(%rdi),%eax [...] Tested that it fixes Jann's reproducer, and also checked that test_verifier and test_progs suite with interpreter, JIT and JIT with hardening enabled on x86-64 and arm64 runs successfully. [0] Speculose: Analyzing the Security Implications of Speculative Execution in CPUs, Giorgi Maisuradze and Christian Rossow, https://arxiv.org/pdf/1801.04084.pdf [1] A Systematic Evaluation of Transient Execution Attacks and Defenses, Claudio Canella, Jo Van Bulck, Michael Schwarz, Moritz Lipp, Benjamin von Berg, Philipp Ortner, Frank Piessens, Dmitry Evtyushkin, Daniel Gruss, https://arxiv.org/pdf/1811.05441.pdf Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 10 +++ kernel/bpf/verifier.c | 185 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 189 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3f84f3e87704..27b74947cd2b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -148,6 +148,7 @@ struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; u32 curframe; + bool speculative; }; #define bpf_get_spilled_reg(slot, frame) \ @@ -167,15 +168,24 @@ struct bpf_verifier_state_list { struct bpf_verifier_state_list *next; }; +/* Possible states for alu_state member. */ +#define BPF_ALU_SANITIZE_SRC 1U +#define BPF_ALU_SANITIZE_DST 2U +#define BPF_ALU_NEG_VALUE (1U << 2) +#define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ + BPF_ALU_SANITIZE_DST) + struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ unsigned long map_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ + u32 alu_limit; /* limit for add/sub register with pointer */ }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ + u8 alu_state; /* used in combination with alu_limit */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e5da1ce5da4..f6bc62a9ee8e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -710,6 +710,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, free_func_state(dst_state->frame[i]); dst_state->frame[i] = NULL; } + dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; @@ -754,7 +755,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) + int insn_idx, int prev_insn_idx, + bool speculative) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; @@ -772,6 +774,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, err = copy_verifier_state(&elem->st, cur); if (err) goto err; + elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose(env, "BPF program is too complex\n"); goto err; @@ -3067,6 +3070,102 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, + u32 *ptr_limit, u8 opcode, bool off_is_neg) +{ + bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || + (opcode == BPF_SUB && !off_is_neg); + u32 off; + + switch (ptr_reg->type) { + case PTR_TO_STACK: + off = ptr_reg->off + ptr_reg->var_off.value; + if (mask_to_left) + *ptr_limit = MAX_BPF_STACK + off; + else + *ptr_limit = -off; + return 0; + case PTR_TO_MAP_VALUE: + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->value_size - off; + } + return 0; + default: + return -EINVAL; + } +} + +static int sanitize_ptr_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn, + const struct bpf_reg_state *ptr_reg, + struct bpf_reg_state *dst_reg, + bool off_is_neg) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_insn_aux_data *aux = cur_aux(env); + bool ptr_is_dst_reg = ptr_reg == dst_reg; + u8 opcode = BPF_OP(insn->code); + u32 alu_state, alu_limit; + struct bpf_reg_state tmp; + bool ret; + + if (env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K) + return 0; + + /* We already marked aux for masking from non-speculative + * paths, thus we got here in the first place. We only care + * to explore bad access from here. + */ + if (vstate->speculative) + goto do_sim; + + alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= ptr_is_dst_reg ? + BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) + return 0; + + /* If we arrived here from different branches with different + * limits to sanitize, then this won't work. + */ + if (aux->alu_state && + (aux->alu_state != alu_state || + aux->alu_limit != alu_limit)) + return -EACCES; + + /* Corresponding fixup done in fixup_bpf_calls(). */ + aux->alu_state = alu_state; + aux->alu_limit = alu_limit; + +do_sim: + /* Simulate and find potential out-of-bounds access under + * speculative execution from truncation as a result of + * masking when off was not within expected range. If off + * sits in dst, then we temporarily need to move ptr there + * to simulate dst (== 0) +/-= ptr. Needed, for example, + * for cases where we use K-based arithmetic in one direction + * and truncated reg-based in the other in order to explore + * bad access. + */ + if (!ptr_is_dst_reg) { + tmp = *dst_reg; + *dst_reg = *ptr_reg; + } + ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + if (!ptr_is_dst_reg) + *dst_reg = tmp; + return !ret ? -EFAULT : 0; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -3087,6 +3186,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; u32 dst = insn->dst_reg, src = insn->src_reg; u8 opcode = BPF_OP(insn->code); + int ret; dst_reg = ®s[dst]; @@ -3142,6 +3242,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose(env, "R%d tried to add from different maps or paths\n", dst); + return ret; + } /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ @@ -3192,6 +3297,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } break; case BPF_SUB: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose(env, "R%d tried to sub from different maps or paths\n", dst); + return ret; + } if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ verbose(env, "R%d tried to subtract pointer from scalar\n", @@ -4389,7 +4499,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, + false); if (!other_branch) return -EFAULT; other_branch_regs = other_branch->frame[other_branch->curframe]->regs; @@ -5499,6 +5610,12 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; + /* Verification state from speculative execution simulation + * must never prune a non-speculative execution one. + */ + if (old->speculative && !cur->speculative) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -5700,6 +5817,7 @@ static int do_check(struct bpf_verifier_env *env) if (!state) return -ENOMEM; state->curframe = 0; + state->speculative = false; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); @@ -5739,8 +5857,10 @@ static int do_check(struct bpf_verifier_env *env) /* found equivalent state, can prune the search */ if (env->log.level) { if (do_print_state) - verbose(env, "\nfrom %d to %d: safe\n", - env->prev_insn_idx, env->insn_idx); + verbose(env, "\nfrom %d to %d%s: safe\n", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); else verbose(env, "%d: safe\n", env->insn_idx); } @@ -5757,8 +5877,10 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level > 1) verbose(env, "%d:", env->insn_idx); else - verbose(env, "\nfrom %d to %d:", - env->prev_insn_idx, env->insn_idx); + verbose(env, "\nfrom %d to %d%s:", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } @@ -6750,6 +6872,57 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { + const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; + const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; + struct bpf_insn insn_buf[16]; + struct bpf_insn *patch = &insn_buf[0]; + bool issrc, isneg; + u32 off_reg; + + aux = &env->insn_aux_data[i + delta]; + if (!aux->alu_state) + continue; + + isneg = aux->alu_state & BPF_ALU_NEG_VALUE; + issrc = (aux->alu_state & BPF_ALU_SANITIZE) == + BPF_ALU_SANITIZE_SRC; + + off_reg = issrc ? insn->src_reg : insn->dst_reg; + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + if (issrc) { + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, + off_reg); + insn->src_reg = BPF_REG_AX; + } else { + *patch++ = BPF_ALU64_REG(BPF_AND, off_reg, + BPF_REG_AX); + } + if (isneg) + insn->code = insn->code == code_add ? + code_sub : code_add; + *patch++ = *insn; + if (issrc && isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + cnt = patch - insn_buf; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; if (insn->src_reg == BPF_PSEUDO_CALL) -- cgit From 96d4f267e40f9509e8a66e2b39e8b95655617693 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 3 Jan 2019 18:57:57 -0800 Subject: Remove 'type' argument from access_ok() function Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument of the user address range verification function since we got rid of the old racy i386-only code to walk page tables by hand. It existed because the original 80386 would not honor the write protect bit when in kernel mode, so you had to do COW by hand before doing any user access. But we haven't supported that in a long time, and these days the 'type' argument is a purely historical artifact. A discussion about extending 'user_access_begin()' to do the range checking resulted this patch, because there is no way we're going to move the old VERIFY_xyz interface to that model. And it's best done at the end of the merge window when I've done most of my merges, so let's just get this done once and for all. This patch was mostly done with a sed-script, with manual fix-ups for the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form. There were a couple of notable cases: - csky still had the old "verify_area()" name as an alias. - the iter_iov code had magical hardcoded knowledge of the actual values of VERIFY_{READ,WRITE} (not that they mattered, since nothing really used it) - microblaze used the type argument for a debug printout but other than those oddities this should be a total no-op patch. I tried to fix up all architectures, did fairly extensive grepping for access_ok() uses, and the changes are trivial, but I may have missed something. Any missed conversion should be trivially fixable, though. Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/futex.h | 2 +- arch/alpha/include/asm/uaccess.h | 2 +- arch/alpha/kernel/signal.c | 12 +-- arch/alpha/lib/csum_partial_copy.c | 2 +- arch/arc/include/asm/futex.h | 2 +- arch/arc/kernel/process.c | 2 +- arch/arc/kernel/signal.c | 4 +- arch/arm/include/asm/futex.h | 4 +- arch/arm/include/asm/uaccess.h | 4 +- arch/arm/kernel/perf_callchain.c | 2 +- arch/arm/kernel/signal.c | 6 +- arch/arm/kernel/swp_emulate.c | 2 +- arch/arm/kernel/sys_oabi-compat.c | 4 +- arch/arm/kernel/traps.c | 2 +- arch/arm/oprofile/common.c | 2 +- arch/arm64/include/asm/futex.h | 2 +- arch/arm64/include/asm/uaccess.h | 8 +- arch/arm64/kernel/armv8_deprecated.c | 2 +- arch/arm64/kernel/perf_callchain.c | 4 +- arch/arm64/kernel/signal.c | 6 +- arch/arm64/kernel/signal32.c | 6 +- arch/arm64/kernel/sys_compat.c | 2 +- arch/c6x/kernel/signal.c | 4 +- arch/csky/abiv1/alignment.c | 4 +- arch/csky/include/asm/uaccess.h | 16 +--- arch/csky/kernel/signal.c | 2 +- arch/csky/lib/usercopy.c | 8 +- arch/h8300/kernel/signal.c | 4 +- arch/hexagon/include/asm/futex.h | 2 +- arch/hexagon/include/asm/uaccess.h | 3 - arch/hexagon/kernel/signal.c | 4 +- arch/hexagon/mm/uaccess.c | 2 +- arch/ia64/include/asm/futex.h | 2 +- arch/ia64/include/asm/uaccess.h | 2 +- arch/ia64/kernel/ptrace.c | 4 +- arch/ia64/kernel/signal.c | 4 +- arch/m68k/include/asm/uaccess_mm.h | 2 +- arch/m68k/include/asm/uaccess_no.h | 2 +- arch/m68k/kernel/signal.c | 4 +- arch/microblaze/include/asm/futex.h | 2 +- arch/microblaze/include/asm/uaccess.h | 23 +++--- arch/microblaze/kernel/signal.c | 4 +- arch/mips/include/asm/checksum.h | 4 +- arch/mips/include/asm/futex.h | 2 +- arch/mips/include/asm/termios.h | 4 +- arch/mips/include/asm/uaccess.h | 12 +-- arch/mips/kernel/mips-r2-to-r6-emul.c | 24 +++--- arch/mips/kernel/ptrace.c | 12 +-- arch/mips/kernel/signal.c | 12 +-- arch/mips/kernel/signal32.c | 4 +- arch/mips/kernel/signal_n32.c | 4 +- arch/mips/kernel/signal_o32.c | 8 +- arch/mips/kernel/syscall.c | 2 +- arch/mips/kernel/unaligned.c | 98 ++++++++++++------------- arch/mips/math-emu/cp1emu.c | 16 ++-- arch/mips/mm/cache.c | 2 +- arch/mips/mm/gup.c | 3 +- arch/mips/oprofile/backtrace.c | 2 +- arch/mips/sibyte/common/sb_tbprof.c | 2 +- arch/nds32/include/asm/futex.h | 2 +- arch/nds32/include/asm/uaccess.h | 11 +-- arch/nds32/kernel/perf_event_cpu.c | 11 ++- arch/nds32/kernel/signal.c | 4 +- arch/nds32/mm/alignment.c | 8 +- arch/nios2/include/asm/uaccess.h | 8 +- arch/nios2/kernel/signal.c | 2 +- arch/openrisc/include/asm/futex.h | 2 +- arch/openrisc/include/asm/uaccess.h | 8 +- arch/openrisc/kernel/signal.c | 6 +- arch/parisc/include/asm/futex.h | 2 +- arch/parisc/include/asm/uaccess.h | 2 +- arch/powerpc/include/asm/futex.h | 2 +- arch/powerpc/include/asm/uaccess.h | 8 +- arch/powerpc/kernel/align.c | 3 +- arch/powerpc/kernel/rtas_flash.c | 2 +- arch/powerpc/kernel/rtasd.c | 2 +- arch/powerpc/kernel/signal.c | 2 +- arch/powerpc/kernel/signal_32.c | 12 +-- arch/powerpc/kernel/signal_64.c | 13 ++-- arch/powerpc/kernel/syscalls.c | 2 +- arch/powerpc/kernel/traps.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 +- arch/powerpc/lib/checksum_wrappers.c | 4 +- arch/powerpc/mm/fault.c | 2 +- arch/powerpc/mm/subpage-prot.c | 2 +- arch/powerpc/oprofile/backtrace.c | 4 +- arch/powerpc/platforms/cell/spufs/file.c | 16 ++-- arch/powerpc/platforms/powernv/opal-lpc.c | 4 +- arch/powerpc/platforms/pseries/scanlog.c | 2 +- arch/riscv/include/asm/futex.h | 2 +- arch/riscv/include/asm/uaccess.h | 14 +--- arch/riscv/kernel/signal.c | 4 +- arch/s390/include/asm/uaccess.h | 2 +- arch/sh/include/asm/checksum_32.h | 2 +- arch/sh/include/asm/futex.h | 2 +- arch/sh/include/asm/uaccess.h | 9 +-- arch/sh/kernel/signal_32.c | 8 +- arch/sh/kernel/signal_64.c | 8 +- arch/sh/kernel/traps_64.c | 12 +-- arch/sh/mm/gup.c | 3 +- arch/sh/oprofile/backtrace.c | 2 +- arch/sparc/include/asm/checksum_32.h | 2 +- arch/sparc/include/asm/uaccess_32.h | 2 +- arch/sparc/include/asm/uaccess_64.h | 2 +- arch/sparc/kernel/sigutil_32.c | 2 +- arch/sparc/kernel/unaligned_32.c | 7 +- arch/um/kernel/ptrace.c | 4 +- arch/unicore32/kernel/signal.c | 4 +- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- arch/x86/ia32/ia32_aout.c | 4 +- arch/x86/ia32/ia32_signal.c | 8 +- arch/x86/ia32/sys_ia32.c | 2 +- arch/x86/include/asm/checksum_32.h | 2 +- arch/x86/include/asm/pgtable_32.h | 2 +- arch/x86/include/asm/uaccess.h | 7 +- arch/x86/kernel/fpu/signal.c | 4 +- arch/x86/kernel/signal.c | 14 ++-- arch/x86/kernel/stacktrace.c | 2 +- arch/x86/kernel/vm86_32.c | 4 +- arch/x86/lib/csum-wrappers_64.c | 4 +- arch/x86/lib/usercopy_32.c | 2 +- arch/x86/lib/usercopy_64.c | 2 +- arch/x86/math-emu/fpu_system.h | 4 +- arch/x86/math-emu/load_store.c | 6 +- arch/x86/math-emu/reg_ld_str.c | 48 ++++++------ arch/x86/mm/mpx.c | 2 +- arch/x86/um/asm/checksum_32.h | 2 +- arch/x86/um/signal.c | 6 +- arch/xtensa/include/asm/checksum.h | 2 +- arch/xtensa/include/asm/futex.h | 2 +- arch/xtensa/include/asm/uaccess.h | 10 +-- arch/xtensa/kernel/signal.c | 4 +- arch/xtensa/kernel/stacktrace.c | 2 +- drivers/acpi/acpi_dbg.c | 4 +- drivers/char/generic_nvram.c | 4 +- drivers/char/mem.c | 4 +- drivers/char/nwflash.c | 2 +- drivers/char/pcmcia/cm4000_cs.c | 4 +- drivers/crypto/ccp/psp-dev.c | 6 +- drivers/firewire/core-cdev.c | 2 +- drivers/firmware/efi/test/efi_test.c | 8 +- drivers/fpga/dfl-afu-dma-region.c | 2 +- drivers/fpga/dfl-fme-pr.c | 3 +- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 18 ++--- drivers/gpu/drm/armada/armada_gem.c | 2 +- drivers/gpu/drm/drm_file.c | 2 +- drivers/gpu/drm/etnaviv/etnaviv_drv.c | 8 +- drivers/gpu/drm/i915/i915_gem.c | 7 +- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 6 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 3 +- drivers/gpu/drm/i915/i915_ioc32.c | 2 +- drivers/gpu/drm/i915/i915_perf.c | 2 +- drivers/gpu/drm/i915/i915_query.c | 2 +- drivers/gpu/drm/msm/msm_gem_submit.c | 2 +- drivers/gpu/drm/qxl/qxl_ioctl.c | 3 +- drivers/infiniband/core/uverbs_main.c | 3 +- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 2 +- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- drivers/macintosh/ans-lcd.c | 2 +- drivers/macintosh/via-pmu.c | 2 +- drivers/media/pci/ivtv/ivtvfb.c | 2 +- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 46 ++++++------ drivers/misc/vmw_vmci/vmci_host.c | 2 +- drivers/pci/proc.c | 4 +- drivers/platform/goldfish/goldfish_pipe.c | 3 +- drivers/pnp/isapnp/proc.c | 2 +- drivers/scsi/pmcraid.c | 4 +- drivers/scsi/scsi_ioctl.c | 2 +- drivers/scsi/sg.c | 16 ++-- drivers/staging/comedi/comedi_compat32.c | 24 +++--- drivers/tty/n_hdlc.c | 2 +- drivers/usb/core/devices.c | 2 +- drivers/usb/core/devio.c | 7 +- drivers/usb/gadget/function/f_hid.c | 4 +- drivers/usb/gadget/udc/atmel_usba_udc.c | 2 +- drivers/vhost/vhost.c | 16 ++-- drivers/video/fbdev/amifb.c | 4 +- drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c | 2 +- drivers/xen/privcmd.c | 6 +- fs/binfmt_aout.c | 4 +- fs/btrfs/send.c | 2 +- fs/eventpoll.c | 2 +- fs/fat/dir.c | 4 +- fs/ioctl.c | 2 +- fs/namespace.c | 2 +- fs/ocfs2/dlmfs/dlmfs.c | 4 +- fs/pstore/pmsg.c | 2 +- fs/pstore/ram_core.c | 2 +- fs/read_write.c | 13 ++-- fs/readdir.c | 10 +-- fs/select.c | 11 +-- include/asm-generic/uaccess.h | 12 +-- include/linux/regset.h | 4 +- include/linux/uaccess.h | 9 +-- include/net/checksum.h | 4 +- kernel/bpf/syscall.c | 2 +- kernel/compat.c | 16 ++-- kernel/events/core.c | 2 +- kernel/exit.c | 4 +- kernel/futex.c | 35 +++++---- kernel/printk/printk.c | 4 +- kernel/ptrace.c | 4 +- kernel/rseq.c | 6 +- kernel/sched/core.c | 4 +- kernel/signal.c | 8 +- kernel/sys.c | 2 +- kernel/trace/bpf_trace.c | 2 +- lib/bitmap.c | 4 +- lib/iov_iter.c | 8 +- lib/usercopy.c | 4 +- mm/gup.c | 6 +- mm/mincore.c | 4 +- net/batman-adv/icmp_socket.c | 2 +- net/batman-adv/log.c | 2 +- net/compat.c | 30 ++++---- net/sunrpc/sysctl.c | 2 +- security/tomoyo/common.c | 2 +- sound/core/seq/seq_clientmgr.c | 2 +- sound/isa/sb/emu8000_patch.c | 4 +- tools/perf/util/include/asm/uaccess.h | 2 +- virt/kvm/kvm_main.c | 3 +- 221 files changed, 610 insertions(+), 679 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h index ca3322536f72..bfd3c01038f8 100644 --- a/arch/alpha/include/asm/futex.h +++ b/arch/alpha/include/asm/futex.h @@ -68,7 +68,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, int ret = 0, cmp; u32 prev; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; __asm__ __volatile__ ( diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h index 87d8c4f0307d..e69c4e13c328 100644 --- a/arch/alpha/include/asm/uaccess.h +++ b/arch/alpha/include/asm/uaccess.h @@ -36,7 +36,7 @@ #define __access_ok(addr, size) \ ((get_fs().seg & (addr | size | (addr+size))) == 0) -#define access_ok(type, addr, size) \ +#define access_ok(addr, size) \ ({ \ __chk_user_ptr(addr); \ __access_ok(((unsigned long)(addr)), (size)); \ diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 8c0c4ee0be6e..33e904a05881 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -65,7 +65,7 @@ SYSCALL_DEFINE3(osf_sigaction, int, sig, if (act) { old_sigset_t mask; - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + if (!access_ok(act, sizeof(*act)) || __get_user(new_ka.sa.sa_handler, &act->sa_handler) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || __get_user(mask, &act->sa_mask)) @@ -77,7 +77,7 @@ SYSCALL_DEFINE3(osf_sigaction, int, sig, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + if (!access_ok(oact, sizeof(*oact)) || __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) @@ -207,7 +207,7 @@ do_sigreturn(struct sigcontext __user *sc) sigset_t set; /* Verify that it's a good sigcontext before using it */ - if (!access_ok(VERIFY_READ, sc, sizeof(*sc))) + if (!access_ok(sc, sizeof(*sc))) goto give_sigsegv; if (__get_user(set.sig[0], &sc->sc_mask)) goto give_sigsegv; @@ -235,7 +235,7 @@ do_rt_sigreturn(struct rt_sigframe __user *frame) sigset_t set; /* Verify that it's a good ucontext_t before using it */ - if (!access_ok(VERIFY_READ, &frame->uc, sizeof(frame->uc))) + if (!access_ok(&frame->uc, sizeof(frame->uc))) goto give_sigsegv; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto give_sigsegv; @@ -332,7 +332,7 @@ setup_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) oldsp = rdusp(); frame = get_sigframe(ksig, oldsp, sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; err |= setup_sigcontext(&frame->sc, regs, set->sig[0], oldsp); @@ -377,7 +377,7 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) oldsp = rdusp(); frame = get_sigframe(ksig, oldsp, sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; err |= copy_siginfo_to_user(&frame->info, &ksig->info); diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index ddb9c2f376fa..e53f96e8aa6d 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -333,7 +333,7 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len, unsigned long doff = 7 & (unsigned long) dst; if (len) { - if (!access_ok(VERIFY_READ, src, len)) { + if (!access_ok(src, len)) { if (errp) *errp = -EFAULT; memset(dst, 0, len); return sum; diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index eb887dd13e74..c29c3fae6854 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -126,7 +126,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, int ret = 0; u32 existval; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; #ifndef CONFIG_ARC_HAS_LLSC diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index 8ce6e7235915..641c364fc232 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -61,7 +61,7 @@ SYSCALL_DEFINE3(arc_usr_cmpxchg, int *, uaddr, int, expected, int, new) /* Z indicates to userspace if operation succeded */ regs->status32 &= ~STATUS_Z_MASK; - ret = access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr)); + ret = access_ok(uaddr, sizeof(*uaddr)); if (!ret) goto fail; diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index 48685445002e..1bfb7de696bd 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -169,7 +169,7 @@ SYSCALL_DEFINE0(rt_sigreturn) sf = (struct rt_sigframe __force __user *)(regs->sp); - if (!access_ok(VERIFY_READ, sf, sizeof(*sf))) + if (!access_ok(sf, sizeof(*sf))) goto badframe; if (__get_user(magic, &sf->sigret_magic)) @@ -219,7 +219,7 @@ static inline void __user *get_sigframe(struct ksignal *ksig, frame = (void __user *)((sp - framesize) & ~7); /* Check that we can actually write to the signal frame */ - if (!access_ok(VERIFY_WRITE, frame, framesize)) + if (!access_ok(frame, framesize)) frame = NULL; return frame; diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index ffebe7b7a5b7..0a46676b4245 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -50,7 +50,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, int ret; u32 val; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; smp_mb(); @@ -104,7 +104,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, int ret = 0; u32 val; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; preempt_disable(); diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index c136eef8f690..27ed17ec45fe 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -279,7 +279,7 @@ static inline void set_fs(mm_segment_t fs) #endif /* CONFIG_MMU */ -#define access_ok(type, addr, size) (__range_ok(addr, size) == 0) +#define access_ok(addr, size) (__range_ok(addr, size) == 0) #define user_addr_max() \ (uaccess_kernel() ? ~0UL : get_fs()) @@ -560,7 +560,7 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) static inline unsigned long __must_check clear_user(void __user *to, unsigned long n) { - if (access_ok(VERIFY_WRITE, to, n)) + if (access_ok(to, n)) n = __clear_user(to, n); return n; } diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 08e43a32a693..3b69a76d341e 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -37,7 +37,7 @@ user_backtrace(struct frame_tail __user *tail, struct frame_tail buftail; unsigned long err; - if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) + if (!access_ok(tail, sizeof(buftail))) return NULL; pagefault_disable(); diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index b908382b69ff..76bb8de6bf6b 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -241,7 +241,7 @@ asmlinkage int sys_sigreturn(struct pt_regs *regs) frame = (struct sigframe __user *)regs->ARM_sp; - if (!access_ok(VERIFY_READ, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) goto badframe; if (restore_sigframe(regs, frame)) @@ -271,7 +271,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) frame = (struct rt_sigframe __user *)regs->ARM_sp; - if (!access_ok(VERIFY_READ, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) goto badframe; if (restore_sigframe(regs, &frame->sig)) @@ -355,7 +355,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, int framesize) /* * Check that we can actually write to the signal frame. */ - if (!access_ok(VERIFY_WRITE, frame, framesize)) + if (!access_ok(frame, framesize)) frame = NULL; return frame; diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c index a188d5e8ab7f..76f6e6a9736c 100644 --- a/arch/arm/kernel/swp_emulate.c +++ b/arch/arm/kernel/swp_emulate.c @@ -198,7 +198,7 @@ static int swp_handler(struct pt_regs *regs, unsigned int instr) destreg, EXTRACT_REG_NUM(instr, RT2_OFFSET), data); /* Check access in reasonable access range for both SWP and SWPB */ - if (!access_ok(VERIFY_WRITE, (address & ~3), 4)) { + if (!access_ok((address & ~3), 4)) { pr_debug("SWP{B} emulation: access to %p not allowed!\n", (void *)address); res = -EFAULT; diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 40da0872170f..92ab36f38795 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -285,7 +285,7 @@ asmlinkage long sys_oabi_epoll_wait(int epfd, maxevents > (INT_MAX/sizeof(*kbuf)) || maxevents > (INT_MAX/sizeof(*events))) return -EINVAL; - if (!access_ok(VERIFY_WRITE, events, sizeof(*events) * maxevents)) + if (!access_ok(events, sizeof(*events) * maxevents)) return -EFAULT; kbuf = kmalloc_array(maxevents, sizeof(*kbuf), GFP_KERNEL); if (!kbuf) @@ -326,7 +326,7 @@ asmlinkage long sys_oabi_semtimedop(int semid, if (nsops < 1 || nsops > SEMOPM) return -EINVAL; - if (!access_ok(VERIFY_READ, tsops, sizeof(*tsops) * nsops)) + if (!access_ok(tsops, sizeof(*tsops) * nsops)) return -EFAULT; sops = kmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); if (!sops) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 2d668cff8ef4..33af097c454b 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -582,7 +582,7 @@ do_cache_op(unsigned long start, unsigned long end, int flags) if (end < start || flags) return -EINVAL; - if (!access_ok(VERIFY_READ, start, end - start)) + if (!access_ok(start, end - start)) return -EFAULT; return __do_cache_op(start, end); diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c index cc649a1e46da..7cb3e0453fcd 100644 --- a/arch/arm/oprofile/common.c +++ b/arch/arm/oprofile/common.c @@ -88,7 +88,7 @@ static struct frame_tail* user_backtrace(struct frame_tail *tail) struct frame_tail buftail[2]; /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) + if (!access_ok(tail, sizeof(buftail))) return NULL; if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail))) return NULL; diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 07fe2479d310..cccb83ad7fa8 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -96,7 +96,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, u32 val, tmp; u32 __user *uaddr; - if (!access_ok(VERIFY_WRITE, _uaddr, sizeof(u32))) + if (!access_ok(_uaddr, sizeof(u32))) return -EFAULT; uaddr = __uaccess_mask_ptr(_uaddr); diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index ed252435fd92..547d7a0c9d05 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -95,7 +95,7 @@ static inline unsigned long __range_ok(const void __user *addr, unsigned long si return ret; } -#define access_ok(type, addr, size) __range_ok(addr, size) +#define access_ok(addr, size) __range_ok(addr, size) #define user_addr_max get_fs #define _ASM_EXTABLE(from, to) \ @@ -301,7 +301,7 @@ do { \ ({ \ __typeof__(*(ptr)) __user *__p = (ptr); \ might_fault(); \ - if (access_ok(VERIFY_READ, __p, sizeof(*__p))) { \ + if (access_ok(__p, sizeof(*__p))) { \ __p = uaccess_mask_ptr(__p); \ __get_user_err((x), __p, (err)); \ } else { \ @@ -370,7 +370,7 @@ do { \ ({ \ __typeof__(*(ptr)) __user *__p = (ptr); \ might_fault(); \ - if (access_ok(VERIFY_WRITE, __p, sizeof(*__p))) { \ + if (access_ok(__p, sizeof(*__p))) { \ __p = uaccess_mask_ptr(__p); \ __put_user_err((x), __p, (err)); \ } else { \ @@ -418,7 +418,7 @@ extern unsigned long __must_check __arch_copy_in_user(void __user *to, const voi extern unsigned long __must_check __arch_clear_user(void __user *to, unsigned long n); static inline unsigned long __must_check __clear_user(void __user *to, unsigned long n) { - if (access_ok(VERIFY_WRITE, to, n)) + if (access_ok(to, n)) n = __arch_clear_user(__uaccess_mask_ptr(to), n); return n; } diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 92be1d12d590..e52e7280884a 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -402,7 +402,7 @@ static int swp_handler(struct pt_regs *regs, u32 instr) /* Check access in reasonable access range for both SWP and SWPB */ user_ptr = (const void __user *)(unsigned long)(address & ~3); - if (!access_ok(VERIFY_WRITE, user_ptr, 4)) { + if (!access_ok(user_ptr, 4)) { pr_debug("SWP{B} emulation: access to 0x%08x not allowed!\n", address); goto fault; diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index a34c26afacb0..61d983f5756f 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -39,7 +39,7 @@ user_backtrace(struct frame_tail __user *tail, unsigned long lr; /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) + if (!access_ok(tail, sizeof(buftail))) return NULL; pagefault_disable(); @@ -86,7 +86,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail, unsigned long err; /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) + if (!access_ok(tail, sizeof(buftail))) return NULL; pagefault_disable(); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 5dcc942906db..867a7cea70e5 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -470,7 +470,7 @@ static int parse_user_sigframe(struct user_ctxs *user, offset = 0; limit = extra_size; - if (!access_ok(VERIFY_READ, base, limit)) + if (!access_ok(base, limit)) goto invalid; continue; @@ -556,7 +556,7 @@ SYSCALL_DEFINE0(rt_sigreturn) frame = (struct rt_sigframe __user *)regs->sp; - if (!access_ok(VERIFY_READ, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) goto badframe; if (restore_sigframe(regs, frame)) @@ -730,7 +730,7 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, /* * Check that we can actually write to the signal frame. */ - if (!access_ok(VERIFY_WRITE, user->sigframe, sp_top - sp)) + if (!access_ok(user->sigframe, sp_top - sp)) return -EFAULT; return 0; diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 24b09003f821..cb7800acd19f 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -303,7 +303,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) frame = (struct compat_sigframe __user *)regs->compat_sp; - if (!access_ok(VERIFY_READ, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) goto badframe; if (compat_restore_sigframe(regs, frame)) @@ -334,7 +334,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) frame = (struct compat_rt_sigframe __user *)regs->compat_sp; - if (!access_ok(VERIFY_READ, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) goto badframe; if (compat_restore_sigframe(regs, &frame->sig)) @@ -365,7 +365,7 @@ static void __user *compat_get_sigframe(struct ksignal *ksig, /* * Check that we can actually write to the signal frame. */ - if (!access_ok(VERIFY_WRITE, frame, framesize)) + if (!access_ok(frame, framesize)) frame = NULL; return frame; diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index 32653d156747..21005dfe8406 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -58,7 +58,7 @@ do_compat_cache_op(unsigned long start, unsigned long end, int flags) if (end < start || flags) return -EINVAL; - if (!access_ok(VERIFY_READ, (const void __user *)start, end - start)) + if (!access_ok((const void __user *)start, end - start)) return -EFAULT; return __do_compat_cache_op(start, end); diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index 3c4bb5a5c382..33b9f69c38f7 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -80,7 +80,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs) frame = (struct rt_sigframe __user *) ((unsigned long) regs->sp + 8); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; @@ -149,7 +149,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, frame = get_sigframe(ksig, regs, sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; err |= __put_user(&frame->info, &frame->pinfo); diff --git a/arch/csky/abiv1/alignment.c b/arch/csky/abiv1/alignment.c index 60205e98fb87..d789be36eb4f 100644 --- a/arch/csky/abiv1/alignment.c +++ b/arch/csky/abiv1/alignment.c @@ -32,7 +32,7 @@ static int ldb_asm(uint32_t addr, uint32_t *valp) uint32_t val; int err; - if (!access_ok(VERIFY_READ, (void *)addr, 1)) + if (!access_ok((void *)addr, 1)) return 1; asm volatile ( @@ -67,7 +67,7 @@ static int stb_asm(uint32_t addr, uint32_t val) { int err; - if (!access_ok(VERIFY_WRITE, (void *)addr, 1)) + if (!access_ok((void *)addr, 1)) return 1; asm volatile ( diff --git a/arch/csky/include/asm/uaccess.h b/arch/csky/include/asm/uaccess.h index acaf0e210d81..eaa1c3403a42 100644 --- a/arch/csky/include/asm/uaccess.h +++ b/arch/csky/include/asm/uaccess.h @@ -16,10 +16,7 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - -static inline int access_ok(int type, const void *addr, unsigned long size) +static inline int access_ok(const void *addr, unsigned long size) { unsigned long limit = current_thread_info()->addr_limit.seg; @@ -27,12 +24,7 @@ static inline int access_ok(int type, const void *addr, unsigned long size) ((unsigned long)(addr + size) < limit)); } -static inline int verify_area(int type, const void *addr, unsigned long size) -{ - return access_ok(type, addr, size) ? 0 : -EFAULT; -} - -#define __addr_ok(addr) (access_ok(VERIFY_READ, addr, 0)) +#define __addr_ok(addr) (access_ok(addr, 0)) extern int __put_user_bad(void); @@ -91,7 +83,7 @@ extern int __put_user_bad(void); long __pu_err = -EFAULT; \ typeof(*(ptr)) *__pu_addr = (ptr); \ typeof(*(ptr)) __pu_val = (typeof(*(ptr)))(x); \ - if (access_ok(VERIFY_WRITE, __pu_addr, size) && __pu_addr) \ + if (access_ok(__pu_addr, size) && __pu_addr) \ __put_user_size(__pu_val, __pu_addr, (size), __pu_err); \ __pu_err; \ }) @@ -217,7 +209,7 @@ do { \ ({ \ int __gu_err = -EFAULT; \ const __typeof__(*(ptr)) __user *__gu_ptr = (ptr); \ - if (access_ok(VERIFY_READ, __gu_ptr, size) && __gu_ptr) \ + if (access_ok(__gu_ptr, size) && __gu_ptr) \ __get_user_size(x, __gu_ptr, size, __gu_err); \ __gu_err; \ }) diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c index 66e1b729b10b..9967c10eee2b 100644 --- a/arch/csky/kernel/signal.c +++ b/arch/csky/kernel/signal.c @@ -88,7 +88,7 @@ do_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe *frame = (struct rt_sigframe *)(regs->usp); - if (verify_area(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; diff --git a/arch/csky/lib/usercopy.c b/arch/csky/lib/usercopy.c index ac9170e2cbb8..647a23986fb5 100644 --- a/arch/csky/lib/usercopy.c +++ b/arch/csky/lib/usercopy.c @@ -7,7 +7,7 @@ unsigned long raw_copy_from_user(void *to, const void *from, unsigned long n) { - if (access_ok(VERIFY_READ, from, n)) + if (access_ok(from, n)) __copy_user_zeroing(to, from, n); else memset(to, 0, n); @@ -18,7 +18,7 @@ EXPORT_SYMBOL(raw_copy_from_user); unsigned long raw_copy_to_user(void *to, const void *from, unsigned long n) { - if (access_ok(VERIFY_WRITE, to, n)) + if (access_ok(to, n)) __copy_user(to, from, n); return n; } @@ -113,7 +113,7 @@ long strncpy_from_user(char *dst, const char *src, long count) { long res = -EFAULT; - if (access_ok(VERIFY_READ, src, 1)) + if (access_ok(src, 1)) __do_strncpy_from_user(dst, src, count, res); return res; } @@ -236,7 +236,7 @@ do { \ unsigned long clear_user(void __user *to, unsigned long n) { - if (access_ok(VERIFY_WRITE, to, n)) + if (access_ok(to, n)) __do_clear_user(to, n); return n; } diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index 1e8070d08770..e0f2b708e5d9 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -110,7 +110,7 @@ asmlinkage int sys_rt_sigreturn(void) sigset_t set; int er0; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; @@ -165,7 +165,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, frame = get_sigframe(ksig, regs, sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; if (ksig->ka.sa.sa_flags & SA_SIGINFO) diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h index c889f5993ecd..cb635216a732 100644 --- a/arch/hexagon/include/asm/futex.h +++ b/arch/hexagon/include/asm/futex.h @@ -77,7 +77,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, int prev; int ret; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; __asm__ __volatile__ ( diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h index 458b69886b34..a30e58d5f351 100644 --- a/arch/hexagon/include/asm/uaccess.h +++ b/arch/hexagon/include/asm/uaccess.h @@ -29,9 +29,6 @@ /* * access_ok: - Checks if a user space pointer is valid - * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that - * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe - * to write to a block, it is always safe to read from it. * @addr: User space pointer to start of block to check * @size: Size of block to check * diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index 78aa7304a5c9..31e2cf95f189 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -115,7 +115,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe)); - if (!access_ok(VERIFY_WRITE, frame, sizeof(struct rt_sigframe))) + if (!access_ok(frame, sizeof(struct rt_sigframe))) return -EFAULT; if (copy_siginfo_to_user(&frame->info, &ksig->info)) @@ -244,7 +244,7 @@ asmlinkage int sys_rt_sigreturn(void) current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *)pt_psp(regs); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&blocked, &frame->uc.uc_sigmask, sizeof(blocked))) goto badframe; diff --git a/arch/hexagon/mm/uaccess.c b/arch/hexagon/mm/uaccess.c index c599eb126c9e..6f9c4697552c 100644 --- a/arch/hexagon/mm/uaccess.c +++ b/arch/hexagon/mm/uaccess.c @@ -51,7 +51,7 @@ __kernel_size_t __clear_user_hexagon(void __user *dest, unsigned long count) unsigned long clear_user_hexagon(void __user *dest, unsigned long count) { - if (!access_ok(VERIFY_WRITE, dest, count)) + if (!access_ok(dest, count)) return count; else return __clear_user_hexagon(dest, count); diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h index db2dd85918c2..2e106d462196 100644 --- a/arch/ia64/include/asm/futex.h +++ b/arch/ia64/include/asm/futex.h @@ -86,7 +86,7 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; { diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h index a74524f2d625..306d469e43da 100644 --- a/arch/ia64/include/asm/uaccess.h +++ b/arch/ia64/include/asm/uaccess.h @@ -67,7 +67,7 @@ static inline int __access_ok(const void __user *p, unsigned long size) return likely(addr <= seg) && (seg == KERNEL_DS.seg || likely(REGION_OFFSET(addr) < RGN_MAP_LIMIT)); } -#define access_ok(type, addr, size) __access_ok((addr), (size)) +#define access_ok(addr, size) __access_ok((addr), (size)) /* * These are the main single-value transfer routines. They automatically diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 427cd565fd61..6d50ede0ed69 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -836,7 +836,7 @@ ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) char nat = 0; int i; - if (!access_ok(VERIFY_WRITE, ppr, sizeof(struct pt_all_user_regs))) + if (!access_ok(ppr, sizeof(struct pt_all_user_regs))) return -EIO; pt = task_pt_regs(child); @@ -981,7 +981,7 @@ ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) memset(&fpval, 0, sizeof(fpval)); - if (!access_ok(VERIFY_READ, ppr, sizeof(struct pt_all_user_regs))) + if (!access_ok(ppr, sizeof(struct pt_all_user_regs))) return -EIO; pt = task_pt_regs(child); diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 99099f73b207..6062fd14e34e 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -132,7 +132,7 @@ ia64_rt_sigreturn (struct sigscratch *scr) */ retval = (long) &ia64_strace_leave_kernel; - if (!access_ok(VERIFY_READ, sc, sizeof(*sc))) + if (!access_ok(sc, sizeof(*sc))) goto give_sigsegv; if (GET_SIGSET(&set, &sc->sc_mask)) @@ -264,7 +264,7 @@ setup_frame(struct ksignal *ksig, sigset_t *set, struct sigscratch *scr) } frame = (void __user *) ((new_sp - sizeof(*frame)) & -STACK_ALIGN); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) { + if (!access_ok(frame, sizeof(*frame))) { force_sigsegv(ksig->sig, current); return 1; } diff --git a/arch/m68k/include/asm/uaccess_mm.h b/arch/m68k/include/asm/uaccess_mm.h index c4cb889660aa..7e85de984df1 100644 --- a/arch/m68k/include/asm/uaccess_mm.h +++ b/arch/m68k/include/asm/uaccess_mm.h @@ -10,7 +10,7 @@ #include /* We let the MMU do all checking */ -static inline int access_ok(int type, const void __user *addr, +static inline int access_ok(const void __user *addr, unsigned long size) { return 1; diff --git a/arch/m68k/include/asm/uaccess_no.h b/arch/m68k/include/asm/uaccess_no.h index 892efb56beef..0134008bf539 100644 --- a/arch/m68k/include/asm/uaccess_no.h +++ b/arch/m68k/include/asm/uaccess_no.h @@ -10,7 +10,7 @@ #include -#define access_ok(type,addr,size) _access_ok((unsigned long)(addr),(size)) +#define access_ok(addr,size) _access_ok((unsigned long)(addr),(size)) /* * It is not enough to just have access_ok check for a real RAM address. diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 72850b85ecf8..e2a9421c5797 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -787,7 +787,7 @@ asmlinkage int do_sigreturn(struct pt_regs *regs, struct switch_stack *sw) struct sigframe __user *frame = (struct sigframe __user *)(usp - 4); sigset_t set; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.sc_mask) || (_NSIG_WORDS > 1 && @@ -812,7 +812,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(usp - 4); sigset_t set; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h index 2572077b04ea..8c90357e5983 100644 --- a/arch/microblaze/include/asm/futex.h +++ b/arch/microblaze/include/asm/futex.h @@ -71,7 +71,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, int ret = 0, cmp; u32 prev; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; __asm__ __volatile__ ("1: lwx %1, %3, r0; \ diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h index 81f16aadbf9e..dbfea093a7c7 100644 --- a/arch/microblaze/include/asm/uaccess.h +++ b/arch/microblaze/include/asm/uaccess.h @@ -60,26 +60,25 @@ static inline int ___range_ok(unsigned long addr, unsigned long size) #define __range_ok(addr, size) \ ___range_ok((unsigned long)(addr), (unsigned long)(size)) -#define access_ok(type, addr, size) (__range_ok((addr), (size)) == 0) +#define access_ok(addr, size) (__range_ok((addr), (size)) == 0) #else -static inline int access_ok(int type, const void __user *addr, - unsigned long size) +static inline int access_ok(const void __user *addr, unsigned long size) { if (!size) goto ok; if ((get_fs().seg < ((unsigned long)addr)) || (get_fs().seg < ((unsigned long)addr + size - 1))) { - pr_devel("ACCESS fail: %s at 0x%08x (size 0x%x), seg 0x%08x\n", - type ? "WRITE" : "READ ", (__force u32)addr, (u32)size, + pr_devel("ACCESS fail at 0x%08x (size 0x%x), seg 0x%08x\n", + (__force u32)addr, (u32)size, (u32)get_fs().seg); return 0; } ok: - pr_devel("ACCESS OK: %s at 0x%08x (size 0x%x), seg 0x%08x\n", - type ? "WRITE" : "READ ", (__force u32)addr, (u32)size, + pr_devel("ACCESS OK at 0x%08x (size 0x%x), seg 0x%08x\n", + (__force u32)addr, (u32)size, (u32)get_fs().seg); return 1; } @@ -120,7 +119,7 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned long n) { might_fault(); - if (unlikely(!access_ok(VERIFY_WRITE, to, n))) + if (unlikely(!access_ok(to, n))) return n; return __clear_user(to, n); @@ -174,7 +173,7 @@ extern long __user_bad(void); const typeof(*(ptr)) __user *__gu_addr = (ptr); \ int __gu_err = 0; \ \ - if (access_ok(VERIFY_READ, __gu_addr, size)) { \ + if (access_ok(__gu_addr, size)) { \ switch (size) { \ case 1: \ __get_user_asm("lbu", __gu_addr, __gu_val, \ @@ -286,7 +285,7 @@ extern long __user_bad(void); typeof(*(ptr)) __user *__pu_addr = (ptr); \ int __pu_err = 0; \ \ - if (access_ok(VERIFY_WRITE, __pu_addr, size)) { \ + if (access_ok(__pu_addr, size)) { \ switch (size) { \ case 1: \ __put_user_asm("sb", __pu_addr, __pu_val, \ @@ -358,7 +357,7 @@ extern int __strncpy_user(char *to, const char __user *from, int len); static inline long strncpy_from_user(char *dst, const char __user *src, long count) { - if (!access_ok(VERIFY_READ, src, 1)) + if (!access_ok(src, 1)) return -EFAULT; return __strncpy_user(dst, src, count); } @@ -372,7 +371,7 @@ extern int __strnlen_user(const char __user *sstr, int len); static inline long strnlen_user(const char __user *src, long n) { - if (!access_ok(VERIFY_READ, src, 1)) + if (!access_ok(src, 1)) return 0; return __strnlen_user(src, n); } diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 97001524ca2d..0685696349bb 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -91,7 +91,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) @@ -166,7 +166,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, frame = get_sigframe(ksig, regs, sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; if (ksig->ka.sa.sa_flags & SA_SIGINFO) diff --git a/arch/mips/include/asm/checksum.h b/arch/mips/include/asm/checksum.h index e8161e4dfde7..dcebaaf8c862 100644 --- a/arch/mips/include/asm/checksum.h +++ b/arch/mips/include/asm/checksum.h @@ -63,7 +63,7 @@ static inline __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *err_ptr) { - if (access_ok(VERIFY_READ, src, len)) + if (access_ok(src, len)) return csum_partial_copy_from_user(src, dst, len, sum, err_ptr); if (len) @@ -81,7 +81,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum sum, int *err_ptr) { might_fault(); - if (access_ok(VERIFY_WRITE, dst, len)) { + if (access_ok(dst, len)) { if (uaccess_kernel()) return __csum_partial_copy_kernel(src, (__force void *)dst, diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h index 8eff134b3a43..c14d798f3888 100644 --- a/arch/mips/include/asm/futex.h +++ b/arch/mips/include/asm/futex.h @@ -129,7 +129,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, int ret = 0; u32 val; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; if (cpu_has_llsc && R10000_LLSC_WAR) { diff --git a/arch/mips/include/asm/termios.h b/arch/mips/include/asm/termios.h index ce2d72e34274..bc29eeacc55a 100644 --- a/arch/mips/include/asm/termios.h +++ b/arch/mips/include/asm/termios.h @@ -32,7 +32,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios, unsigned short iflag, oflag, cflag, lflag; unsigned int err; - if (!access_ok(VERIFY_READ, termio, sizeof(struct termio))) + if (!access_ok(termio, sizeof(struct termio))) return -EFAULT; err = __get_user(iflag, &termio->c_iflag); @@ -61,7 +61,7 @@ static inline int kernel_termios_to_user_termio(struct termio __user *termio, { int err; - if (!access_ok(VERIFY_WRITE, termio, sizeof(struct termio))) + if (!access_ok(termio, sizeof(struct termio))) return -EFAULT; err = __put_user(termios->c_iflag, &termio->c_iflag); diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index 06629011a434..d43c1dc6ef15 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -109,9 +109,6 @@ static inline bool eva_kernel_access(void) /* * access_ok: - Checks if a user space pointer is valid - * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that - * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe - * to write to a block, it is always safe to read from it. * @addr: User space pointer to start of block to check * @size: Size of block to check * @@ -134,7 +131,7 @@ static inline int __access_ok(const void __user *p, unsigned long size) return (get_fs().seg & (addr | (addr + size) | __ua_size(size))) == 0; } -#define access_ok(type, addr, size) \ +#define access_ok(addr, size) \ likely(__access_ok((addr), (size))) /* @@ -304,7 +301,7 @@ do { \ const __typeof__(*(ptr)) __user * __gu_ptr = (ptr); \ \ might_fault(); \ - if (likely(access_ok(VERIFY_READ, __gu_ptr, size))) { \ + if (likely(access_ok( __gu_ptr, size))) { \ if (eva_kernel_access()) \ __get_kernel_common((x), size, __gu_ptr); \ else \ @@ -446,7 +443,7 @@ do { \ int __pu_err = -EFAULT; \ \ might_fault(); \ - if (likely(access_ok(VERIFY_WRITE, __pu_addr, size))) { \ + if (likely(access_ok( __pu_addr, size))) { \ if (eva_kernel_access()) \ __put_kernel_common(__pu_addr, size); \ else \ @@ -691,8 +688,7 @@ __clear_user(void __user *addr, __kernel_size_t size) ({ \ void __user * __cl_addr = (addr); \ unsigned long __cl_size = (n); \ - if (__cl_size && access_ok(VERIFY_WRITE, \ - __cl_addr, __cl_size)) \ + if (__cl_size && access_ok(__cl_addr, __cl_size)) \ __cl_size = __clear_user(__cl_addr, __cl_size); \ __cl_size; \ }) diff --git a/arch/mips/kernel/mips-r2-to-r6-emul.c b/arch/mips/kernel/mips-r2-to-r6-emul.c index cb22a558431e..c50c89a978f1 100644 --- a/arch/mips/kernel/mips-r2-to-r6-emul.c +++ b/arch/mips/kernel/mips-r2-to-r6-emul.c @@ -1205,7 +1205,7 @@ fpu_emul: case lwl_op: rt = regs->regs[MIPSInst_RT(inst)]; vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst); - if (!access_ok(VERIFY_READ, (void __user *)vaddr, 4)) { + if (!access_ok((void __user *)vaddr, 4)) { current->thread.cp0_baduaddr = vaddr; err = SIGSEGV; break; @@ -1278,7 +1278,7 @@ fpu_emul: case lwr_op: rt = regs->regs[MIPSInst_RT(inst)]; vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst); - if (!access_ok(VERIFY_READ, (void __user *)vaddr, 4)) { + if (!access_ok((void __user *)vaddr, 4)) { current->thread.cp0_baduaddr = vaddr; err = SIGSEGV; break; @@ -1352,7 +1352,7 @@ fpu_emul: case swl_op: rt = regs->regs[MIPSInst_RT(inst)]; vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst); - if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 4)) { + if (!access_ok((void __user *)vaddr, 4)) { current->thread.cp0_baduaddr = vaddr; err = SIGSEGV; break; @@ -1422,7 +1422,7 @@ fpu_emul: case swr_op: rt = regs->regs[MIPSInst_RT(inst)]; vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst); - if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 4)) { + if (!access_ok((void __user *)vaddr, 4)) { current->thread.cp0_baduaddr = vaddr; err = SIGSEGV; break; @@ -1497,7 +1497,7 @@ fpu_emul: rt = regs->regs[MIPSInst_RT(inst)]; vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst); - if (!access_ok(VERIFY_READ, (void __user *)vaddr, 8)) { + if (!access_ok((void __user *)vaddr, 8)) { current->thread.cp0_baduaddr = vaddr; err = SIGSEGV; break; @@ -1616,7 +1616,7 @@ fpu_emul: rt = regs->regs[MIPSInst_RT(inst)]; vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst); - if (!access_ok(VERIFY_READ, (void __user *)vaddr, 8)) { + if (!access_ok((void __user *)vaddr, 8)) { current->thread.cp0_baduaddr = vaddr; err = SIGSEGV; break; @@ -1735,7 +1735,7 @@ fpu_emul: rt = regs->regs[MIPSInst_RT(inst)]; vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst); - if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 8)) { + if (!access_ok((void __user *)vaddr, 8)) { current->thread.cp0_baduaddr = vaddr; err = SIGSEGV; break; @@ -1853,7 +1853,7 @@ fpu_emul: rt = regs->regs[MIPSInst_RT(inst)]; vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst); - if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 8)) { + if (!access_ok((void __user *)vaddr, 8)) { current->thread.cp0_baduaddr = vaddr; err = SIGSEGV; break; @@ -1970,7 +1970,7 @@ fpu_emul: err = SIGBUS; break; } - if (!access_ok(VERIFY_READ, (void __user *)vaddr, 4)) { + if (!access_ok((void __user *)vaddr, 4)) { current->thread.cp0_baduaddr = vaddr; err = SIGBUS; break; @@ -2026,7 +2026,7 @@ fpu_emul: err = SIGBUS; break; } - if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 4)) { + if (!access_ok((void __user *)vaddr, 4)) { current->thread.cp0_baduaddr = vaddr; err = SIGBUS; break; @@ -2089,7 +2089,7 @@ fpu_emul: err = SIGBUS; break; } - if (!access_ok(VERIFY_READ, (void __user *)vaddr, 8)) { + if (!access_ok((void __user *)vaddr, 8)) { current->thread.cp0_baduaddr = vaddr; err = SIGBUS; break; @@ -2150,7 +2150,7 @@ fpu_emul: err = SIGBUS; break; } - if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 8)) { + if (!access_ok((void __user *)vaddr, 8)) { current->thread.cp0_baduaddr = vaddr; err = SIGBUS; break; diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index ea54575255ea..0057c910bc2f 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -71,7 +71,7 @@ int ptrace_getregs(struct task_struct *child, struct user_pt_regs __user *data) struct pt_regs *regs; int i; - if (!access_ok(VERIFY_WRITE, data, 38 * 8)) + if (!access_ok(data, 38 * 8)) return -EIO; regs = task_pt_regs(child); @@ -98,7 +98,7 @@ int ptrace_setregs(struct task_struct *child, struct user_pt_regs __user *data) struct pt_regs *regs; int i; - if (!access_ok(VERIFY_READ, data, 38 * 8)) + if (!access_ok(data, 38 * 8)) return -EIO; regs = task_pt_regs(child); @@ -125,7 +125,7 @@ int ptrace_get_watch_regs(struct task_struct *child, if (!cpu_has_watch || boot_cpu_data.watch_reg_use_cnt == 0) return -EIO; - if (!access_ok(VERIFY_WRITE, addr, sizeof(struct pt_watch_regs))) + if (!access_ok(addr, sizeof(struct pt_watch_regs))) return -EIO; #ifdef CONFIG_32BIT @@ -167,7 +167,7 @@ int ptrace_set_watch_regs(struct task_struct *child, if (!cpu_has_watch || boot_cpu_data.watch_reg_use_cnt == 0) return -EIO; - if (!access_ok(VERIFY_READ, addr, sizeof(struct pt_watch_regs))) + if (!access_ok(addr, sizeof(struct pt_watch_regs))) return -EIO; /* Check the values. */ for (i = 0; i < boot_cpu_data.watch_reg_use_cnt; i++) { @@ -359,7 +359,7 @@ int ptrace_getfpregs(struct task_struct *child, __u32 __user *data) { int i; - if (!access_ok(VERIFY_WRITE, data, 33 * 8)) + if (!access_ok(data, 33 * 8)) return -EIO; if (tsk_used_math(child)) { @@ -385,7 +385,7 @@ int ptrace_setfpregs(struct task_struct *child, __u32 __user *data) u32 value; int i; - if (!access_ok(VERIFY_READ, data, 33 * 8)) + if (!access_ok(data, 33 * 8)) return -EIO; init_fp_ctx(child); diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index d3a23758592c..d75337974ee9 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -590,7 +590,7 @@ SYSCALL_DEFINE3(sigaction, int, sig, const struct sigaction __user *, act, if (act) { old_sigset_t mask; - if (!access_ok(VERIFY_READ, act, sizeof(*act))) + if (!access_ok(act, sizeof(*act))) return -EFAULT; err |= __get_user(new_ka.sa.sa_handler, &act->sa_handler); err |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); @@ -604,7 +604,7 @@ SYSCALL_DEFINE3(sigaction, int, sig, const struct sigaction __user *, act, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact))) + if (!access_ok(oact, sizeof(*oact))) return -EFAULT; err |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); err |= __put_user(old_ka.sa.sa_handler, &oact->sa_handler); @@ -630,7 +630,7 @@ asmlinkage void sys_sigreturn(void) regs = current_pt_regs(); frame = (struct sigframe __user *)regs->regs[29]; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&blocked, &frame->sf_mask, sizeof(blocked))) goto badframe; @@ -667,7 +667,7 @@ asmlinkage void sys_rt_sigreturn(void) regs = current_pt_regs(); frame = (struct rt_sigframe __user *)regs->regs[29]; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->rs_uc.uc_sigmask, sizeof(set))) goto badframe; @@ -705,7 +705,7 @@ static int setup_frame(void *sig_return, struct ksignal *ksig, int err = 0; frame = get_sigframe(ksig, regs, sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) return -EFAULT; err |= setup_sigcontext(regs, &frame->sf_sc); @@ -744,7 +744,7 @@ static int setup_rt_frame(void *sig_return, struct ksignal *ksig, int err = 0; frame = get_sigframe(ksig, regs, sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) return -EFAULT; /* Create siginfo. */ diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index b5d9e1784aff..59b8965433c2 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -46,7 +46,7 @@ SYSCALL_DEFINE3(32_sigaction, long, sig, const struct compat_sigaction __user *, old_sigset_t mask; s32 handler; - if (!access_ok(VERIFY_READ, act, sizeof(*act))) + if (!access_ok(act, sizeof(*act))) return -EFAULT; err |= __get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = (void __user *)(s64)handler; @@ -61,7 +61,7 @@ SYSCALL_DEFINE3(32_sigaction, long, sig, const struct compat_sigaction __user *, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact))) + if (!access_ok(oact, sizeof(*oact))) return -EFAULT; err |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); err |= __put_user((u32)(u64)old_ka.sa.sa_handler, diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c index 8f65aaf9206d..c498b027823e 100644 --- a/arch/mips/kernel/signal_n32.c +++ b/arch/mips/kernel/signal_n32.c @@ -73,7 +73,7 @@ asmlinkage void sysn32_rt_sigreturn(void) regs = current_pt_regs(); frame = (struct rt_sigframe_n32 __user *)regs->regs[29]; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_conv_sigset_from_user(&set, &frame->rs_uc.uc_sigmask)) goto badframe; @@ -110,7 +110,7 @@ static int setup_rt_frame_n32(void *sig_return, struct ksignal *ksig, int err = 0; frame = get_sigframe(ksig, regs, sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) return -EFAULT; /* Create siginfo. */ diff --git a/arch/mips/kernel/signal_o32.c b/arch/mips/kernel/signal_o32.c index b6e3ddef48a0..df259618e834 100644 --- a/arch/mips/kernel/signal_o32.c +++ b/arch/mips/kernel/signal_o32.c @@ -118,7 +118,7 @@ static int setup_frame_32(void *sig_return, struct ksignal *ksig, int err = 0; frame = get_sigframe(ksig, regs, sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) return -EFAULT; err |= setup_sigcontext32(regs, &frame->sf_sc); @@ -160,7 +160,7 @@ asmlinkage void sys32_rt_sigreturn(void) regs = current_pt_regs(); frame = (struct rt_sigframe32 __user *)regs->regs[29]; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_conv_sigset_from_user(&set, &frame->rs_uc.uc_sigmask)) goto badframe; @@ -197,7 +197,7 @@ static int setup_rt_frame_32(void *sig_return, struct ksignal *ksig, int err = 0; frame = get_sigframe(ksig, regs, sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof (*frame))) + if (!access_ok(frame, sizeof (*frame))) return -EFAULT; /* Convert (siginfo_t -> compat_siginfo_t) and copy to user. */ @@ -262,7 +262,7 @@ asmlinkage void sys32_sigreturn(void) regs = current_pt_regs(); frame = (struct sigframe32 __user *)regs->regs[29]; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_conv_sigset_from_user(&blocked, &frame->sf_mask)) goto badframe; diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 41a0db08cd37..b6dc78ad5d8c 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -101,7 +101,7 @@ static inline int mips_atomic_set(unsigned long addr, unsigned long new) if (unlikely(addr & 3)) return -EINVAL; - if (unlikely(!access_ok(VERIFY_WRITE, (const void __user *)addr, 4))) + if (unlikely(!access_ok((const void __user *)addr, 4))) return -EINVAL; if (cpu_has_llsc && R10000_LLSC_WAR) { diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index c60e7719ef77..595ca9c85111 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -936,7 +936,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, if (insn.dsp_format.func == lx_op) { switch (insn.dsp_format.op) { case lwx_op: - if (!access_ok(VERIFY_READ, addr, 4)) + if (!access_ok(addr, 4)) goto sigbus; LoadW(addr, value, res); if (res) @@ -945,7 +945,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, regs->regs[insn.dsp_format.rd] = value; break; case lhx_op: - if (!access_ok(VERIFY_READ, addr, 2)) + if (!access_ok(addr, 2)) goto sigbus; LoadHW(addr, value, res); if (res) @@ -968,7 +968,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, set_fs(USER_DS); switch (insn.spec3_format.func) { case lhe_op: - if (!access_ok(VERIFY_READ, addr, 2)) { + if (!access_ok(addr, 2)) { set_fs(seg); goto sigbus; } @@ -981,7 +981,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, regs->regs[insn.spec3_format.rt] = value; break; case lwe_op: - if (!access_ok(VERIFY_READ, addr, 4)) { + if (!access_ok(addr, 4)) { set_fs(seg); goto sigbus; } @@ -994,7 +994,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, regs->regs[insn.spec3_format.rt] = value; break; case lhue_op: - if (!access_ok(VERIFY_READ, addr, 2)) { + if (!access_ok(addr, 2)) { set_fs(seg); goto sigbus; } @@ -1007,7 +1007,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, regs->regs[insn.spec3_format.rt] = value; break; case she_op: - if (!access_ok(VERIFY_WRITE, addr, 2)) { + if (!access_ok(addr, 2)) { set_fs(seg); goto sigbus; } @@ -1020,7 +1020,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, } break; case swe_op: - if (!access_ok(VERIFY_WRITE, addr, 4)) { + if (!access_ok(addr, 4)) { set_fs(seg); goto sigbus; } @@ -1041,7 +1041,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, #endif break; case lh_op: - if (!access_ok(VERIFY_READ, addr, 2)) + if (!access_ok(addr, 2)) goto sigbus; if (IS_ENABLED(CONFIG_EVA)) { @@ -1060,7 +1060,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case lw_op: - if (!access_ok(VERIFY_READ, addr, 4)) + if (!access_ok(addr, 4)) goto sigbus; if (IS_ENABLED(CONFIG_EVA)) { @@ -1079,7 +1079,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case lhu_op: - if (!access_ok(VERIFY_READ, addr, 2)) + if (!access_ok(addr, 2)) goto sigbus; if (IS_ENABLED(CONFIG_EVA)) { @@ -1106,7 +1106,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, * would blow up, so for now we don't handle unaligned 64-bit * instructions on 32-bit kernels. */ - if (!access_ok(VERIFY_READ, addr, 4)) + if (!access_ok(addr, 4)) goto sigbus; LoadWU(addr, value, res); @@ -1129,7 +1129,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, * would blow up, so for now we don't handle unaligned 64-bit * instructions on 32-bit kernels. */ - if (!access_ok(VERIFY_READ, addr, 8)) + if (!access_ok(addr, 8)) goto sigbus; LoadDW(addr, value, res); @@ -1144,7 +1144,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, goto sigill; case sh_op: - if (!access_ok(VERIFY_WRITE, addr, 2)) + if (!access_ok(addr, 2)) goto sigbus; compute_return_epc(regs); @@ -1164,7 +1164,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case sw_op: - if (!access_ok(VERIFY_WRITE, addr, 4)) + if (!access_ok(addr, 4)) goto sigbus; compute_return_epc(regs); @@ -1192,7 +1192,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, * would blow up, so for now we don't handle unaligned 64-bit * instructions on 32-bit kernels. */ - if (!access_ok(VERIFY_WRITE, addr, 8)) + if (!access_ok(addr, 8)) goto sigbus; compute_return_epc(regs); @@ -1254,7 +1254,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, switch (insn.msa_mi10_format.func) { case msa_ld_op: - if (!access_ok(VERIFY_READ, addr, sizeof(*fpr))) + if (!access_ok(addr, sizeof(*fpr))) goto sigbus; do { @@ -1290,7 +1290,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case msa_st_op: - if (!access_ok(VERIFY_WRITE, addr, sizeof(*fpr))) + if (!access_ok(addr, sizeof(*fpr))) goto sigbus; /* @@ -1463,7 +1463,7 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs, if (reg == 31) goto sigbus; - if (!access_ok(VERIFY_READ, addr, 8)) + if (!access_ok(addr, 8)) goto sigbus; LoadW(addr, value, res); @@ -1482,7 +1482,7 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs, if (reg == 31) goto sigbus; - if (!access_ok(VERIFY_WRITE, addr, 8)) + if (!access_ok(addr, 8)) goto sigbus; value = regs->regs[reg]; @@ -1502,7 +1502,7 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs, if (reg == 31) goto sigbus; - if (!access_ok(VERIFY_READ, addr, 16)) + if (!access_ok(addr, 16)) goto sigbus; LoadDW(addr, value, res); @@ -1525,7 +1525,7 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs, if (reg == 31) goto sigbus; - if (!access_ok(VERIFY_WRITE, addr, 16)) + if (!access_ok(addr, 16)) goto sigbus; value = regs->regs[reg]; @@ -1548,11 +1548,10 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs, if ((rvar > 9) || !reg) goto sigill; if (reg & 0x10) { - if (!access_ok - (VERIFY_READ, addr, 4 * (rvar + 1))) + if (!access_ok(addr, 4 * (rvar + 1))) goto sigbus; } else { - if (!access_ok(VERIFY_READ, addr, 4 * rvar)) + if (!access_ok(addr, 4 * rvar)) goto sigbus; } if (rvar == 9) @@ -1585,11 +1584,10 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs, if ((rvar > 9) || !reg) goto sigill; if (reg & 0x10) { - if (!access_ok - (VERIFY_WRITE, addr, 4 * (rvar + 1))) + if (!access_ok(addr, 4 * (rvar + 1))) goto sigbus; } else { - if (!access_ok(VERIFY_WRITE, addr, 4 * rvar)) + if (!access_ok(addr, 4 * rvar)) goto sigbus; } if (rvar == 9) @@ -1623,11 +1621,10 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs, if ((rvar > 9) || !reg) goto sigill; if (reg & 0x10) { - if (!access_ok - (VERIFY_READ, addr, 8 * (rvar + 1))) + if (!access_ok(addr, 8 * (rvar + 1))) goto sigbus; } else { - if (!access_ok(VERIFY_READ, addr, 8 * rvar)) + if (!access_ok(addr, 8 * rvar)) goto sigbus; } if (rvar == 9) @@ -1665,11 +1662,10 @@ static void emulate_load_store_microMIPS(struct pt_regs *regs, if ((rvar > 9) || !reg) goto sigill; if (reg & 0x10) { - if (!access_ok - (VERIFY_WRITE, addr, 8 * (rvar + 1))) + if (!access_ok(addr, 8 * (rvar + 1))) goto sigbus; } else { - if (!access_ok(VERIFY_WRITE, addr, 8 * rvar)) + if (!access_ok(addr, 8 * rvar)) goto sigbus; } if (rvar == 9) @@ -1788,7 +1784,7 @@ fpu_emul: case mm_lwm16_op: reg = insn.mm16_m_format.rlist; rvar = reg + 1; - if (!access_ok(VERIFY_READ, addr, 4 * rvar)) + if (!access_ok(addr, 4 * rvar)) goto sigbus; for (i = 16; rvar; rvar--, i++) { @@ -1808,7 +1804,7 @@ fpu_emul: case mm_swm16_op: reg = insn.mm16_m_format.rlist; rvar = reg + 1; - if (!access_ok(VERIFY_WRITE, addr, 4 * rvar)) + if (!access_ok(addr, 4 * rvar)) goto sigbus; for (i = 16; rvar; rvar--, i++) { @@ -1862,7 +1858,7 @@ fpu_emul: } loadHW: - if (!access_ok(VERIFY_READ, addr, 2)) + if (!access_ok(addr, 2)) goto sigbus; LoadHW(addr, value, res); @@ -1872,7 +1868,7 @@ loadHW: goto success; loadHWU: - if (!access_ok(VERIFY_READ, addr, 2)) + if (!access_ok(addr, 2)) goto sigbus; LoadHWU(addr, value, res); @@ -1882,7 +1878,7 @@ loadHWU: goto success; loadW: - if (!access_ok(VERIFY_READ, addr, 4)) + if (!access_ok(addr, 4)) goto sigbus; LoadW(addr, value, res); @@ -1900,7 +1896,7 @@ loadWU: * would blow up, so for now we don't handle unaligned 64-bit * instructions on 32-bit kernels. */ - if (!access_ok(VERIFY_READ, addr, 4)) + if (!access_ok(addr, 4)) goto sigbus; LoadWU(addr, value, res); @@ -1922,7 +1918,7 @@ loadDW: * would blow up, so for now we don't handle unaligned 64-bit * instructions on 32-bit kernels. */ - if (!access_ok(VERIFY_READ, addr, 8)) + if (!access_ok(addr, 8)) goto sigbus; LoadDW(addr, value, res); @@ -1936,7 +1932,7 @@ loadDW: goto sigill; storeHW: - if (!access_ok(VERIFY_WRITE, addr, 2)) + if (!access_ok(addr, 2)) goto sigbus; value = regs->regs[reg]; @@ -1946,7 +1942,7 @@ storeHW: goto success; storeW: - if (!access_ok(VERIFY_WRITE, addr, 4)) + if (!access_ok(addr, 4)) goto sigbus; value = regs->regs[reg]; @@ -1964,7 +1960,7 @@ storeDW: * would blow up, so for now we don't handle unaligned 64-bit * instructions on 32-bit kernels. */ - if (!access_ok(VERIFY_WRITE, addr, 8)) + if (!access_ok(addr, 8)) goto sigbus; value = regs->regs[reg]; @@ -2122,7 +2118,7 @@ static void emulate_load_store_MIPS16e(struct pt_regs *regs, void __user * addr) goto sigbus; case MIPS16e_lh_op: - if (!access_ok(VERIFY_READ, addr, 2)) + if (!access_ok(addr, 2)) goto sigbus; LoadHW(addr, value, res); @@ -2133,7 +2129,7 @@ static void emulate_load_store_MIPS16e(struct pt_regs *regs, void __user * addr) break; case MIPS16e_lhu_op: - if (!access_ok(VERIFY_READ, addr, 2)) + if (!access_ok(addr, 2)) goto sigbus; LoadHWU(addr, value, res); @@ -2146,7 +2142,7 @@ static void emulate_load_store_MIPS16e(struct pt_regs *regs, void __user * addr) case MIPS16e_lw_op: case MIPS16e_lwpc_op: case MIPS16e_lwsp_op: - if (!access_ok(VERIFY_READ, addr, 4)) + if (!access_ok(addr, 4)) goto sigbus; LoadW(addr, value, res); @@ -2165,7 +2161,7 @@ static void emulate_load_store_MIPS16e(struct pt_regs *regs, void __user * addr) * would blow up, so for now we don't handle unaligned 64-bit * instructions on 32-bit kernels. */ - if (!access_ok(VERIFY_READ, addr, 4)) + if (!access_ok(addr, 4)) goto sigbus; LoadWU(addr, value, res); @@ -2189,7 +2185,7 @@ loadDW: * would blow up, so for now we don't handle unaligned 64-bit * instructions on 32-bit kernels. */ - if (!access_ok(VERIFY_READ, addr, 8)) + if (!access_ok(addr, 8)) goto sigbus; LoadDW(addr, value, res); @@ -2204,7 +2200,7 @@ loadDW: goto sigill; case MIPS16e_sh_op: - if (!access_ok(VERIFY_WRITE, addr, 2)) + if (!access_ok(addr, 2)) goto sigbus; MIPS16e_compute_return_epc(regs, &oldinst); @@ -2217,7 +2213,7 @@ loadDW: case MIPS16e_sw_op: case MIPS16e_swsp_op: case MIPS16e_i8_op: /* actually - MIPS16e_swrasp_func */ - if (!access_ok(VERIFY_WRITE, addr, 4)) + if (!access_ok(addr, 4)) goto sigbus; MIPS16e_compute_return_epc(regs, &oldinst); @@ -2237,7 +2233,7 @@ writeDW: * would blow up, so for now we don't handle unaligned 64-bit * instructions on 32-bit kernels. */ - if (!access_ok(VERIFY_WRITE, addr, 8)) + if (!access_ok(addr, 8)) goto sigbus; MIPS16e_compute_return_epc(regs, &oldinst); diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index 82e2993c1a2c..e60e29078ef5 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -1063,7 +1063,7 @@ emul: MIPSInst_SIMM(ir)); MIPS_FPU_EMU_INC_STATS(loads); - if (!access_ok(VERIFY_READ, dva, sizeof(u64))) { + if (!access_ok(dva, sizeof(u64))) { MIPS_FPU_EMU_INC_STATS(errors); *fault_addr = dva; return SIGBUS; @@ -1081,7 +1081,7 @@ emul: MIPSInst_SIMM(ir)); MIPS_FPU_EMU_INC_STATS(stores); DIFROMREG(dval, MIPSInst_RT(ir)); - if (!access_ok(VERIFY_WRITE, dva, sizeof(u64))) { + if (!access_ok(dva, sizeof(u64))) { MIPS_FPU_EMU_INC_STATS(errors); *fault_addr = dva; return SIGBUS; @@ -1097,7 +1097,7 @@ emul: wva = (u32 __user *) (xcp->regs[MIPSInst_RS(ir)] + MIPSInst_SIMM(ir)); MIPS_FPU_EMU_INC_STATS(loads); - if (!access_ok(VERIFY_READ, wva, sizeof(u32))) { + if (!access_ok(wva, sizeof(u32))) { MIPS_FPU_EMU_INC_STATS(errors); *fault_addr = wva; return SIGBUS; @@ -1115,7 +1115,7 @@ emul: MIPSInst_SIMM(ir)); MIPS_FPU_EMU_INC_STATS(stores); SIFROMREG(wval, MIPSInst_RT(ir)); - if (!access_ok(VERIFY_WRITE, wva, sizeof(u32))) { + if (!access_ok(wva, sizeof(u32))) { MIPS_FPU_EMU_INC_STATS(errors); *fault_addr = wva; return SIGBUS; @@ -1493,7 +1493,7 @@ static int fpux_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, xcp->regs[MIPSInst_FT(ir)]); MIPS_FPU_EMU_INC_STATS(loads); - if (!access_ok(VERIFY_READ, va, sizeof(u32))) { + if (!access_ok(va, sizeof(u32))) { MIPS_FPU_EMU_INC_STATS(errors); *fault_addr = va; return SIGBUS; @@ -1513,7 +1513,7 @@ static int fpux_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, MIPS_FPU_EMU_INC_STATS(stores); SIFROMREG(val, MIPSInst_FS(ir)); - if (!access_ok(VERIFY_WRITE, va, sizeof(u32))) { + if (!access_ok(va, sizeof(u32))) { MIPS_FPU_EMU_INC_STATS(errors); *fault_addr = va; return SIGBUS; @@ -1590,7 +1590,7 @@ static int fpux_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, xcp->regs[MIPSInst_FT(ir)]); MIPS_FPU_EMU_INC_STATS(loads); - if (!access_ok(VERIFY_READ, va, sizeof(u64))) { + if (!access_ok(va, sizeof(u64))) { MIPS_FPU_EMU_INC_STATS(errors); *fault_addr = va; return SIGBUS; @@ -1609,7 +1609,7 @@ static int fpux_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, MIPS_FPU_EMU_INC_STATS(stores); DIFROMREG(val, MIPSInst_FS(ir)); - if (!access_ok(VERIFY_WRITE, va, sizeof(u64))) { + if (!access_ok(va, sizeof(u64))) { MIPS_FPU_EMU_INC_STATS(errors); *fault_addr = va; return SIGBUS; diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 70a523151ff3..55099fbff4e6 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -76,7 +76,7 @@ SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, bytes, { if (bytes == 0) return 0; - if (!access_ok(VERIFY_WRITE, (void __user *) addr, bytes)) + if (!access_ok((void __user *) addr, bytes)) return -EFAULT; __flush_icache_user_range(addr, addr + bytes); diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 5a4875cac1ec..0d14e0d8eacf 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -195,8 +195,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) + if (unlikely(!access_ok((void __user *)start, len))) return 0; /* diff --git a/arch/mips/oprofile/backtrace.c b/arch/mips/oprofile/backtrace.c index 806fb798091f..07d98ba7f49e 100644 --- a/arch/mips/oprofile/backtrace.c +++ b/arch/mips/oprofile/backtrace.c @@ -19,7 +19,7 @@ struct stackframe { static inline int get_mem(unsigned long addr, unsigned long *result) { unsigned long *address = (unsigned long *) addr; - if (!access_ok(VERIFY_READ, address, sizeof(unsigned long))) + if (!access_ok(address, sizeof(unsigned long))) return -1; if (__copy_from_user_inatomic(result, address, sizeof(unsigned long))) return -3; diff --git a/arch/mips/sibyte/common/sb_tbprof.c b/arch/mips/sibyte/common/sb_tbprof.c index 99c720be72d2..9ff26b0cd3b6 100644 --- a/arch/mips/sibyte/common/sb_tbprof.c +++ b/arch/mips/sibyte/common/sb_tbprof.c @@ -458,7 +458,7 @@ static ssize_t sbprof_tb_read(struct file *filp, char *buf, char *dest = buf; long cur_off = *offp; - if (!access_ok(VERIFY_WRITE, buf, size)) + if (!access_ok(buf, size)) return -EFAULT; mutex_lock(&sbp.lock); diff --git a/arch/nds32/include/asm/futex.h b/arch/nds32/include/asm/futex.h index cb6cb91cfdf8..baf178bf1d0b 100644 --- a/arch/nds32/include/asm/futex.h +++ b/arch/nds32/include/asm/futex.h @@ -40,7 +40,7 @@ futex_atomic_cmpxchg_inatomic(u32 * uval, u32 __user * uaddr, int ret = 0; u32 val, tmp, flags; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; smp_mb(); diff --git a/arch/nds32/include/asm/uaccess.h b/arch/nds32/include/asm/uaccess.h index 362a32d9bd16..53dcb49b0b12 100644 --- a/arch/nds32/include/asm/uaccess.h +++ b/arch/nds32/include/asm/uaccess.h @@ -13,9 +13,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define __asmeq(x, y) ".ifnc " x "," y " ; .err ; .endif\n\t" /* @@ -53,7 +50,7 @@ static inline void set_fs(mm_segment_t fs) #define __range_ok(addr, size) (size <= get_fs() && addr <= (get_fs() -size)) -#define access_ok(type, addr, size) \ +#define access_ok(addr, size) \ __range_ok((unsigned long)addr, (unsigned long)size) /* * Single-value transfer routines. They automatically use the right @@ -94,7 +91,7 @@ static inline void set_fs(mm_segment_t fs) ({ \ const __typeof__(*(ptr)) __user *__p = (ptr); \ might_fault(); \ - if (access_ok(VERIFY_READ, __p, sizeof(*__p))) { \ + if (access_ok(__p, sizeof(*__p))) { \ __get_user_err((x), __p, (err)); \ } else { \ (x) = 0; (err) = -EFAULT; \ @@ -189,7 +186,7 @@ do { \ ({ \ __typeof__(*(ptr)) __user *__p = (ptr); \ might_fault(); \ - if (access_ok(VERIFY_WRITE, __p, sizeof(*__p))) { \ + if (access_ok(__p, sizeof(*__p))) { \ __put_user_err((x), __p, (err)); \ } else { \ (err) = -EFAULT; \ @@ -279,7 +276,7 @@ extern unsigned long __arch_copy_to_user(void __user * to, const void *from, #define INLINE_COPY_TO_USER static inline unsigned long clear_user(void __user * to, unsigned long n) { - if (access_ok(VERIFY_WRITE, to, n)) + if (access_ok(to, n)) n = __arch_clear_user(to, n); return n; } diff --git a/arch/nds32/kernel/perf_event_cpu.c b/arch/nds32/kernel/perf_event_cpu.c index 5e00ce54d0ff..334c2a6cec23 100644 --- a/arch/nds32/kernel/perf_event_cpu.c +++ b/arch/nds32/kernel/perf_event_cpu.c @@ -1306,7 +1306,7 @@ user_backtrace(struct perf_callchain_entry_ctx *entry, unsigned long fp) (unsigned long *)(fp - (unsigned long)sizeof(buftail)); /* Check accessibility of one struct frame_tail beyond */ - if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(buftail))) + if (!access_ok(user_frame_tail, sizeof(buftail))) return 0; if (__copy_from_user_inatomic (&buftail, user_frame_tail, sizeof(buftail))) @@ -1332,7 +1332,7 @@ user_backtrace_opt_size(struct perf_callchain_entry_ctx *entry, (unsigned long *)(fp - (unsigned long)sizeof(buftail)); /* Check accessibility of one struct frame_tail beyond */ - if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(buftail))) + if (!access_ok(user_frame_tail, sizeof(buftail))) return 0; if (__copy_from_user_inatomic (&buftail, user_frame_tail, sizeof(buftail))) @@ -1386,7 +1386,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, user_frame_tail = (unsigned long *)(fp - (unsigned long)sizeof(fp)); - if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(fp))) + if (!access_ok(user_frame_tail, sizeof(fp))) return; if (__copy_from_user_inatomic @@ -1406,8 +1406,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, (unsigned long *)(fp - (unsigned long)sizeof(buftail)); - if (!access_ok - (VERIFY_READ, user_frame_tail, sizeof(buftail))) + if (!access_ok(user_frame_tail, sizeof(buftail))) return; if (__copy_from_user_inatomic @@ -1424,7 +1423,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, (unsigned long *)(fp - (unsigned long) sizeof(buftail_opt_size)); - if (!access_ok(VERIFY_READ, user_frame_tail, + if (!access_ok(user_frame_tail, sizeof(buftail_opt_size))) return; diff --git a/arch/nds32/kernel/signal.c b/arch/nds32/kernel/signal.c index 5b5be082cfa4..5f7660aa2d68 100644 --- a/arch/nds32/kernel/signal.c +++ b/arch/nds32/kernel/signal.c @@ -151,7 +151,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) frame = (struct rt_sigframe __user *)regs->sp; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (restore_sigframe(regs, frame)) @@ -275,7 +275,7 @@ setup_rt_frame(struct ksignal *ksig, sigset_t * set, struct pt_regs *regs) get_sigframe(ksig, regs, sizeof(*frame)); int err = 0; - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; __put_user_error(0, &frame->uc.uc_flags, err); diff --git a/arch/nds32/mm/alignment.c b/arch/nds32/mm/alignment.c index e1aed9dc692d..c8b9061a2ee3 100644 --- a/arch/nds32/mm/alignment.c +++ b/arch/nds32/mm/alignment.c @@ -289,13 +289,13 @@ static inline int do_16(unsigned long inst, struct pt_regs *regs) unaligned_addr += shift; if (load) { - if (!access_ok(VERIFY_READ, (void *)unaligned_addr, len)) + if (!access_ok((void *)unaligned_addr, len)) return -EACCES; get_data(unaligned_addr, &target_val, len); *idx_to_addr(regs, target_idx) = target_val; } else { - if (!access_ok(VERIFY_WRITE, (void *)unaligned_addr, len)) + if (!access_ok((void *)unaligned_addr, len)) return -EACCES; target_val = *idx_to_addr(regs, target_idx); set_data((void *)unaligned_addr, target_val, len); @@ -479,7 +479,7 @@ static inline int do_32(unsigned long inst, struct pt_regs *regs) if (load) { - if (!access_ok(VERIFY_READ, (void *)unaligned_addr, len)) + if (!access_ok((void *)unaligned_addr, len)) return -EACCES; get_data(unaligned_addr, &target_val, len); @@ -491,7 +491,7 @@ static inline int do_32(unsigned long inst, struct pt_regs *regs) *idx_to_addr(regs, RT(inst)) = target_val; } else { - if (!access_ok(VERIFY_WRITE, (void *)unaligned_addr, len)) + if (!access_ok((void *)unaligned_addr, len)) return -EACCES; target_val = *idx_to_addr(regs, RT(inst)); diff --git a/arch/nios2/include/asm/uaccess.h b/arch/nios2/include/asm/uaccess.h index dfa3c7cb30b4..e0ea10806491 100644 --- a/arch/nios2/include/asm/uaccess.h +++ b/arch/nios2/include/asm/uaccess.h @@ -37,7 +37,7 @@ (((signed long)(((long)get_fs().seg) & \ ((long)(addr) | (((long)(addr)) + (len)) | (len)))) == 0) -#define access_ok(type, addr, len) \ +#define access_ok(addr, len) \ likely(__access_ok((unsigned long)(addr), (unsigned long)(len))) # define __EX_TABLE_SECTION ".section __ex_table,\"a\"\n" @@ -70,7 +70,7 @@ static inline unsigned long __must_check __clear_user(void __user *to, static inline unsigned long __must_check clear_user(void __user *to, unsigned long n) { - if (!access_ok(VERIFY_WRITE, to, n)) + if (!access_ok(to, n)) return n; return __clear_user(to, n); } @@ -142,7 +142,7 @@ do { \ long __gu_err = -EFAULT; \ const __typeof__(*(ptr)) __user *__gu_ptr = (ptr); \ unsigned long __gu_val = 0; \ - if (access_ok(VERIFY_READ, __gu_ptr, sizeof(*__gu_ptr))) \ + if (access_ok( __gu_ptr, sizeof(*__gu_ptr))) \ __get_user_common(__gu_val, sizeof(*__gu_ptr), \ __gu_ptr, __gu_err); \ (x) = (__force __typeof__(x))__gu_val; \ @@ -168,7 +168,7 @@ do { \ long __pu_err = -EFAULT; \ __typeof__(*(ptr)) __user *__pu_ptr = (ptr); \ __typeof__(*(ptr)) __pu_val = (__typeof(*ptr))(x); \ - if (access_ok(VERIFY_WRITE, __pu_ptr, sizeof(*__pu_ptr))) { \ + if (access_ok(__pu_ptr, sizeof(*__pu_ptr))) { \ switch (sizeof(*__pu_ptr)) { \ case 1: \ __put_user_asm(__pu_val, "stb", __pu_ptr, __pu_err); \ diff --git a/arch/nios2/kernel/signal.c b/arch/nios2/kernel/signal.c index 20662b0f6c9e..4a81876b6086 100644 --- a/arch/nios2/kernel/signal.c +++ b/arch/nios2/kernel/signal.c @@ -106,7 +106,7 @@ asmlinkage int do_rt_sigreturn(struct switch_stack *sw) sigset_t set; int rval; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) diff --git a/arch/openrisc/include/asm/futex.h b/arch/openrisc/include/asm/futex.h index 618da4a1bffb..fe894e6331ae 100644 --- a/arch/openrisc/include/asm/futex.h +++ b/arch/openrisc/include/asm/futex.h @@ -72,7 +72,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, int ret = 0; u32 prev; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; __asm__ __volatile__ ( \ diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h index bbf5c79cce7a..bc8191a34db7 100644 --- a/arch/openrisc/include/asm/uaccess.h +++ b/arch/openrisc/include/asm/uaccess.h @@ -58,7 +58,7 @@ /* Ensure that addr is below task's addr_limit */ #define __addr_ok(addr) ((unsigned long) addr < get_fs()) -#define access_ok(type, addr, size) \ +#define access_ok(addr, size) \ __range_ok((unsigned long)addr, (unsigned long)size) /* @@ -102,7 +102,7 @@ extern long __put_user_bad(void); ({ \ long __pu_err = -EFAULT; \ __typeof__(*(ptr)) *__pu_addr = (ptr); \ - if (access_ok(VERIFY_WRITE, __pu_addr, size)) \ + if (access_ok(__pu_addr, size)) \ __put_user_size((x), __pu_addr, (size), __pu_err); \ __pu_err; \ }) @@ -175,7 +175,7 @@ struct __large_struct { ({ \ long __gu_err = -EFAULT, __gu_val = 0; \ const __typeof__(*(ptr)) * __gu_addr = (ptr); \ - if (access_ok(VERIFY_READ, __gu_addr, size)) \ + if (access_ok(__gu_addr, size)) \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ __gu_err; \ @@ -254,7 +254,7 @@ extern unsigned long __clear_user(void *addr, unsigned long size); static inline __must_check unsigned long clear_user(void *addr, unsigned long size) { - if (likely(access_ok(VERIFY_WRITE, addr, size))) + if (likely(access_ok(addr, size))) size = __clear_user(addr, size); return size; } diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index 265f10fb3930..5ac9d3b1d615 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -50,7 +50,7 @@ static int restore_sigcontext(struct pt_regs *regs, /* * Restore the regs from &sc->regs. - * (sc is already checked for VERIFY_READ since the sigframe was + * (sc is already checked since the sigframe was * checked in sys_sigreturn previously) */ err |= __copy_from_user(regs, sc->regs.gpr, 32 * sizeof(unsigned long)); @@ -83,7 +83,7 @@ asmlinkage long _sys_rt_sigreturn(struct pt_regs *regs) if (((long)frame) & 3) goto badframe; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; @@ -161,7 +161,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, frame = get_sigframe(ksig, regs, sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; /* Create siginfo. */ diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h index cf7ba058f619..d2c3e4106851 100644 --- a/arch/parisc/include/asm/futex.h +++ b/arch/parisc/include/asm/futex.h @@ -95,7 +95,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (uaccess_kernel() && !uaddr) return -EFAULT; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; /* HPPA has no cmpxchg in hardware and therefore the diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index ea70e36ce6af..30ac2865ea73 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -27,7 +27,7 @@ * that put_user is the same as __put_user, etc. */ -#define access_ok(type, uaddr, size) \ +#define access_ok(uaddr, size) \ ( (uaddr) == (uaddr) ) #define put_user __put_user diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index 94542776a62d..88b38b37c21b 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -72,7 +72,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, int ret = 0; u32 prev; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; __asm__ __volatile__ ( diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index ebc0b916dcf9..b31bf45eebd4 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -62,7 +62,7 @@ static inline int __access_ok(unsigned long addr, unsigned long size, #endif -#define access_ok(type, addr, size) \ +#define access_ok(addr, size) \ (__chk_user_ptr(addr), (void)(type), \ __access_ok((__force unsigned long)(addr), (size), get_fs())) @@ -166,7 +166,7 @@ do { \ long __pu_err = -EFAULT; \ __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ might_fault(); \ - if (access_ok(VERIFY_WRITE, __pu_addr, size)) \ + if (access_ok(__pu_addr, size)) \ __put_user_size((x), __pu_addr, (size), __pu_err); \ __pu_err; \ }) @@ -276,7 +276,7 @@ do { \ __long_type(*(ptr)) __gu_val = 0; \ __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ might_fault(); \ - if (access_ok(VERIFY_READ, __gu_addr, (size))) { \ + if (access_ok(__gu_addr, (size))) { \ barrier_nospec(); \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ } \ @@ -374,7 +374,7 @@ extern unsigned long __clear_user(void __user *addr, unsigned long size); static inline unsigned long clear_user(void __user *addr, unsigned long size) { might_fault(); - if (likely(access_ok(VERIFY_WRITE, addr, size))) + if (likely(access_ok(addr, size))) return __clear_user(addr, size); return size; } diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index 11550a3d1ac2..0d1b6370bae0 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -131,8 +131,7 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, /* Verify the address of the operand */ if (unlikely(user_mode(regs) && - !access_ok((flags & ST ? VERIFY_WRITE : VERIFY_READ), - addr, nb))) + !access_ok(addr, nb))) return -EFAULT; /* userland only */ diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index 10fabae2574d..8246f437bbc6 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -523,7 +523,7 @@ static ssize_t validate_flash_write(struct file *file, const char __user *buf, args_buf->status = VALIDATE_INCOMPLETE; } - if (!access_ok(VERIFY_READ, buf, count)) { + if (!access_ok(buf, count)) { rc = -EFAULT; goto done; } diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 38cadae4ca4f..8a1746d755c9 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -335,7 +335,7 @@ static ssize_t rtas_log_read(struct file * file, char __user * buf, count = rtas_error_log_buffer_max; - if (!access_ok(VERIFY_WRITE, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; tmp = kmalloc(count, GFP_KERNEL); diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index b3e8db376ecd..e6c30cee6abf 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -44,7 +44,7 @@ void __user *get_sigframe(struct ksignal *ksig, unsigned long sp, newsp = (oldsp - frame_size) & ~0xFUL; /* Check access */ - if (!access_ok(VERIFY_WRITE, (void __user *)newsp, oldsp - newsp)) + if (!access_ok((void __user *)newsp, oldsp - newsp)) return NULL; return (void __user *)newsp; diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 2d47cc79e5b3..ede4f04281ae 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -1017,7 +1017,7 @@ static int do_setcontext(struct ucontext __user *ucp, struct pt_regs *regs, int #else if (__get_user(mcp, &ucp->uc_regs)) return -EFAULT; - if (!access_ok(VERIFY_READ, mcp, sizeof(*mcp))) + if (!access_ok(mcp, sizeof(*mcp))) return -EFAULT; #endif set_current_blocked(&set); @@ -1120,7 +1120,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, */ mctx = (struct mcontext __user *) ((unsigned long) &old_ctx->uc_mcontext & ~0xfUL); - if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size) + if (!access_ok(old_ctx, ctx_size) || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region) || put_sigset_t(&old_ctx->uc_sigmask, ¤t->blocked) || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs)) @@ -1128,7 +1128,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, } if (new_ctx == NULL) return 0; - if (!access_ok(VERIFY_READ, new_ctx, ctx_size) || + if (!access_ok(new_ctx, ctx_size) || fault_in_pages_readable((u8 __user *)new_ctx, ctx_size)) return -EFAULT; @@ -1169,7 +1169,7 @@ SYSCALL_DEFINE0(rt_sigreturn) rt_sf = (struct rt_sigframe __user *) (regs->gpr[1] + __SIGNAL_FRAMESIZE + 16); - if (!access_ok(VERIFY_READ, rt_sf, sizeof(*rt_sf))) + if (!access_ok(rt_sf, sizeof(*rt_sf))) goto bad; #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -1315,7 +1315,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx, current->thread.debug.dbcr0 = new_dbcr0; #endif - if (!access_ok(VERIFY_READ, ctx, sizeof(*ctx)) || + if (!access_ok(ctx, sizeof(*ctx)) || fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx))) return -EFAULT; @@ -1500,7 +1500,7 @@ SYSCALL_DEFINE0(sigreturn) { sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); addr = sr; - if (!access_ok(VERIFY_READ, sr, sizeof(*sr)) + if (!access_ok(sr, sizeof(*sr)) || restore_user_regs(regs, sr, 1)) goto badframe; } diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 0935fe6c282a..bd5e6834ca69 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -383,7 +383,7 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig, err |= __get_user(v_regs, &sc->v_regs); if (err) return err; - if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128))) + if (v_regs && !access_ok(v_regs, 34 * sizeof(vector128))) return -EFAULT; /* Copy 33 vec registers (vr0..31 and vscr) from the stack */ if (v_regs != NULL && (msr & MSR_VEC) != 0) { @@ -502,10 +502,9 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, err |= __get_user(tm_v_regs, &tm_sc->v_regs); if (err) return err; - if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128))) + if (v_regs && !access_ok(v_regs, 34 * sizeof(vector128))) return -EFAULT; - if (tm_v_regs && !access_ok(VERIFY_READ, - tm_v_regs, 34 * sizeof(vector128))) + if (tm_v_regs && !access_ok(tm_v_regs, 34 * sizeof(vector128))) return -EFAULT; /* Copy 33 vec registers (vr0..31 and vscr) from the stack */ if (v_regs != NULL && tm_v_regs != NULL && (msr & MSR_VEC) != 0) { @@ -671,7 +670,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, ctx_has_vsx_region = 1; if (old_ctx != NULL) { - if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size) + if (!access_ok(old_ctx, ctx_size) || setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, 0, ctx_has_vsx_region) || __copy_to_user(&old_ctx->uc_sigmask, @@ -680,7 +679,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, } if (new_ctx == NULL) return 0; - if (!access_ok(VERIFY_READ, new_ctx, ctx_size) + if (!access_ok(new_ctx, ctx_size) || __get_user(tmp, (u8 __user *) new_ctx) || __get_user(tmp, (u8 __user *) new_ctx + ctx_size - 1)) return -EFAULT; @@ -725,7 +724,7 @@ SYSCALL_DEFINE0(rt_sigreturn) /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - if (!access_ok(VERIFY_READ, uc, sizeof(*uc))) + if (!access_ok(uc, sizeof(*uc))) goto badframe; if (__copy_from_user(&set, &uc->uc_sigmask, sizeof(set))) diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 466216506eb2..e6982ab21816 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -89,7 +89,7 @@ ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s if ( (unsigned long)n >= 4096 ) { unsigned long __user *buffer = (unsigned long __user *)n; - if (!access_ok(VERIFY_READ, buffer, 5*sizeof(unsigned long)) + if (!access_ok(buffer, 5*sizeof(unsigned long)) || __get_user(n, buffer) || __get_user(inp, ((fd_set __user * __user *)(buffer+1))) || __get_user(outp, ((fd_set __user * __user *)(buffer+2))) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 00af2c4febf4..64936b60d521 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -837,7 +837,7 @@ static void p9_hmi_special_emu(struct pt_regs *regs) addr = (__force const void __user *)ea; /* Check it */ - if (!access_ok(VERIFY_READ, addr, 16)) { + if (!access_ok(addr, 16)) { pr_devel("HMI vec emu: bad access %i:%s[%d] nip=%016lx" " instr=%08x addr=%016lx\n", smp_processor_id(), current->comm, current->pid, diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 6f2d2fb4e098..bd2dcfbf00cd 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1744,7 +1744,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, int first_pass; unsigned long hpte[2]; - if (!access_ok(VERIFY_WRITE, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; if (kvm_is_radix(kvm)) return 0; @@ -1844,7 +1844,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, int mmu_ready; int pshift; - if (!access_ok(VERIFY_READ, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; if (kvm_is_radix(kvm)) return -EINVAL; diff --git a/arch/powerpc/lib/checksum_wrappers.c b/arch/powerpc/lib/checksum_wrappers.c index a0cb63fb76a1..890d4ddd91d6 100644 --- a/arch/powerpc/lib/checksum_wrappers.c +++ b/arch/powerpc/lib/checksum_wrappers.c @@ -37,7 +37,7 @@ __wsum csum_and_copy_from_user(const void __user *src, void *dst, goto out; } - if (unlikely((len < 0) || !access_ok(VERIFY_READ, src, len))) { + if (unlikely((len < 0) || !access_ok(src, len))) { *err_ptr = -EFAULT; csum = (__force unsigned int)sum; goto out; @@ -78,7 +78,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, goto out; } - if (unlikely((len < 0) || !access_ok(VERIFY_WRITE, dst, len))) { + if (unlikely((len < 0) || !access_ok(dst, len))) { *err_ptr = -EFAULT; csum = -1; /* invalid checksum */ goto out; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index a6dcfda3e11e..887f11bcf330 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -274,7 +274,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, return false; if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) && - access_ok(VERIFY_READ, nip, sizeof(*nip))) { + access_ok(nip, sizeof(*nip))) { unsigned int inst; int res; diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 3327551c8b47..5e4178790dee 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -214,7 +214,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, return 0; } - if (!access_ok(VERIFY_READ, map, (len >> PAGE_SHIFT) * sizeof(u32))) + if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32))) return -EFAULT; down_write(&mm->mmap_sem); diff --git a/arch/powerpc/oprofile/backtrace.c b/arch/powerpc/oprofile/backtrace.c index 5df6290d1ccc..260c53700978 100644 --- a/arch/powerpc/oprofile/backtrace.c +++ b/arch/powerpc/oprofile/backtrace.c @@ -31,7 +31,7 @@ static unsigned int user_getsp32(unsigned int sp, int is_first) unsigned int stack_frame[2]; void __user *p = compat_ptr(sp); - if (!access_ok(VERIFY_READ, p, sizeof(stack_frame))) + if (!access_ok(p, sizeof(stack_frame))) return 0; /* @@ -57,7 +57,7 @@ static unsigned long user_getsp64(unsigned long sp, int is_first) { unsigned long stack_frame[3]; - if (!access_ok(VERIFY_READ, (void __user *)sp, sizeof(stack_frame))) + if (!access_ok((void __user *)sp, sizeof(stack_frame))) return 0; if (__copy_from_user_inatomic(stack_frame, (void __user *)sp, diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 43e7b93f27c7..ae8123edddc6 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -609,7 +609,7 @@ static ssize_t spufs_mbox_read(struct file *file, char __user *buf, if (len < 4) return -EINVAL; - if (!access_ok(VERIFY_WRITE, buf, len)) + if (!access_ok(buf, len)) return -EFAULT; udata = (void __user *)buf; @@ -717,7 +717,7 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf, if (len < 4) return -EINVAL; - if (!access_ok(VERIFY_WRITE, buf, len)) + if (!access_ok(buf, len)) return -EFAULT; udata = (void __user *)buf; @@ -856,7 +856,7 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf, return -EINVAL; udata = (void __user *)buf; - if (!access_ok(VERIFY_READ, buf, len)) + if (!access_ok(buf, len)) return -EFAULT; if (__get_user(wbox_data, udata)) @@ -1994,7 +1994,7 @@ static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf, int ret; struct spu_context *ctx = file->private_data; - if (!access_ok(VERIFY_WRITE, buf, len)) + if (!access_ok(buf, len)) return -EFAULT; ret = spu_acquire_saved(ctx); @@ -2034,7 +2034,7 @@ static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf, struct spu_context *ctx = file->private_data; int ret; - if (!access_ok(VERIFY_WRITE, buf, len)) + if (!access_ok(buf, len)) return -EFAULT; ret = spu_acquire_saved(ctx); @@ -2077,7 +2077,7 @@ static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf, struct spu_context *ctx = file->private_data; int ret; - if (!access_ok(VERIFY_WRITE, buf, len)) + if (!access_ok(buf, len)) return -EFAULT; ret = spu_acquire_saved(ctx); @@ -2129,7 +2129,7 @@ static ssize_t spufs_dma_info_read(struct file *file, char __user *buf, struct spu_context *ctx = file->private_data; int ret; - if (!access_ok(VERIFY_WRITE, buf, len)) + if (!access_ok(buf, len)) return -EFAULT; ret = spu_acquire_saved(ctx); @@ -2160,7 +2160,7 @@ static ssize_t __spufs_proxydma_info_read(struct spu_context *ctx, if (len < ret) return -EINVAL; - if (!access_ok(VERIFY_WRITE, buf, len)) + if (!access_ok(buf, len)) return -EFAULT; info.proxydma_info_type = ctx->csa.prob.dma_querytype_RW; diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index 6c7ad1d8b32e..2623996a193a 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -192,7 +192,7 @@ static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf, u32 data, pos, len, todo; int rc; - if (!access_ok(VERIFY_WRITE, ubuf, count)) + if (!access_ok(ubuf, count)) return -EFAULT; todo = count; @@ -283,7 +283,7 @@ static ssize_t lpc_debug_write(struct file *filp, const char __user *ubuf, u32 data, pos, len, todo; int rc; - if (!access_ok(VERIFY_READ, ubuf, count)) + if (!access_ok(ubuf, count)) return -EFAULT; todo = count; diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c index 054ce7a16fc3..24b157e1e890 100644 --- a/arch/powerpc/platforms/pseries/scanlog.c +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -63,7 +63,7 @@ static ssize_t scanlog_read(struct file *file, char __user *buf, return -EINVAL; } - if (!access_ok(VERIFY_WRITE, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; for (;;) { diff --git a/arch/riscv/include/asm/futex.h b/arch/riscv/include/asm/futex.h index 3b19eba1bc8e..66641624d8a5 100644 --- a/arch/riscv/include/asm/futex.h +++ b/arch/riscv/include/asm/futex.h @@ -95,7 +95,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 val; uintptr_t tmp; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; __enable_user_access(); diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 8c3e3e3c8be1..637b896894fc 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -54,14 +54,8 @@ static inline void set_fs(mm_segment_t fs) #define user_addr_max() (get_fs()) -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /** * access_ok: - Checks if a user space pointer is valid - * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that - * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe - * to write to a block, it is always safe to read from it. * @addr: User space pointer to start of block to check * @size: Size of block to check * @@ -76,7 +70,7 @@ static inline void set_fs(mm_segment_t fs) * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. */ -#define access_ok(type, addr, size) ({ \ +#define access_ok(addr, size) ({ \ __chk_user_ptr(addr); \ likely(__access_ok((unsigned long __force)(addr), (size))); \ }) @@ -258,7 +252,7 @@ do { \ ({ \ const __typeof__(*(ptr)) __user *__p = (ptr); \ might_fault(); \ - access_ok(VERIFY_READ, __p, sizeof(*__p)) ? \ + access_ok(__p, sizeof(*__p)) ? \ __get_user((x), __p) : \ ((x) = 0, -EFAULT); \ }) @@ -386,7 +380,7 @@ do { \ ({ \ __typeof__(*(ptr)) __user *__p = (ptr); \ might_fault(); \ - access_ok(VERIFY_WRITE, __p, sizeof(*__p)) ? \ + access_ok(__p, sizeof(*__p)) ? \ __put_user((x), __p) : \ -EFAULT; \ }) @@ -421,7 +415,7 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned long n) { might_fault(); - return access_ok(VERIFY_WRITE, to, n) ? + return access_ok(to, n) ? __clear_user(to, n) : n; } diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index f9b5e7e352ef..837e1646091a 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -115,7 +115,7 @@ SYSCALL_DEFINE0(rt_sigreturn) frame = (struct rt_sigframe __user *)regs->sp; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) @@ -187,7 +187,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, long err = 0; frame = get_sigframe(ksig, regs, sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; err |= copy_siginfo_to_user(&frame->info, &ksig->info); diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index ad6b91013a05..bd2545977ad3 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -48,7 +48,7 @@ static inline int __range_ok(unsigned long addr, unsigned long size) __range_ok((unsigned long)(addr), (size)); \ }) -#define access_ok(type, addr, size) __access_ok(addr, size) +#define access_ok(addr, size) __access_ok(addr, size) unsigned long __must_check raw_copy_from_user(void *to, const void __user *from, unsigned long n); diff --git a/arch/sh/include/asm/checksum_32.h b/arch/sh/include/asm/checksum_32.h index b58f3d95dc19..36b84cfd3f67 100644 --- a/arch/sh/include/asm/checksum_32.h +++ b/arch/sh/include/asm/checksum_32.h @@ -197,7 +197,7 @@ static inline __wsum csum_and_copy_to_user(const void *src, int len, __wsum sum, int *err_ptr) { - if (access_ok(VERIFY_WRITE, dst, len)) + if (access_ok(dst, len)) return csum_partial_copy_generic((__force const void *)src, dst, len, sum, NULL, err_ptr); diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h index 6d192f4908a7..3190ec89df81 100644 --- a/arch/sh/include/asm/futex.h +++ b/arch/sh/include/asm/futex.h @@ -22,7 +22,7 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval); diff --git a/arch/sh/include/asm/uaccess.h b/arch/sh/include/asm/uaccess.h index 32eb56e00c11..deebbfab5342 100644 --- a/arch/sh/include/asm/uaccess.h +++ b/arch/sh/include/asm/uaccess.h @@ -18,7 +18,7 @@ */ #define __access_ok(addr, size) \ (__addr_ok((addr) + (size))) -#define access_ok(type, addr, size) \ +#define access_ok(addr, size) \ (__chk_user_ptr(addr), \ __access_ok((unsigned long __force)(addr), (size))) @@ -66,7 +66,7 @@ struct __large_struct { unsigned long buf[100]; }; long __gu_err = -EFAULT; \ unsigned long __gu_val = 0; \ const __typeof__(*(ptr)) *__gu_addr = (ptr); \ - if (likely(access_ok(VERIFY_READ, __gu_addr, (size)))) \ + if (likely(access_ok(__gu_addr, (size)))) \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ __gu_err; \ @@ -87,7 +87,7 @@ struct __large_struct { unsigned long buf[100]; }; long __pu_err = -EFAULT; \ __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ __typeof__(*(ptr)) __pu_val = x; \ - if (likely(access_ok(VERIFY_WRITE, __pu_addr, size))) \ + if (likely(access_ok(__pu_addr, size))) \ __put_user_size(__pu_val, __pu_addr, (size), \ __pu_err); \ __pu_err; \ @@ -132,8 +132,7 @@ __kernel_size_t __clear_user(void *addr, __kernel_size_t size); void __user * __cl_addr = (addr); \ unsigned long __cl_size = (n); \ \ - if (__cl_size && access_ok(VERIFY_WRITE, \ - ((unsigned long)(__cl_addr)), __cl_size)) \ + if (__cl_size && access_ok(__cl_addr, __cl_size)) \ __cl_size = __clear_user(__cl_addr, __cl_size); \ \ __cl_size; \ diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index c46c0020ff55..2a2121ba8ebe 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -160,7 +160,7 @@ asmlinkage int sys_sigreturn(void) /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) @@ -190,7 +190,7 @@ asmlinkage int sys_rt_sigreturn(void) /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) @@ -272,7 +272,7 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, frame = get_sigframe(&ksig->ka, regs->regs[15], sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; err |= setup_sigcontext(&frame->sc, regs, set->sig[0]); @@ -338,7 +338,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, frame = get_sigframe(&ksig->ka, regs->regs[15], sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; err |= copy_siginfo_to_user(&frame->info, &ksig->info); diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index 76661dee3c65..f1f1598879c2 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -259,7 +259,7 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3, /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) @@ -293,7 +293,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3, /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) @@ -379,7 +379,7 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs frame = get_sigframe(&ksig->ka, regs->regs[REG_SP], sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; err |= setup_sigcontext(&frame->sc, regs, set->sig[0]); @@ -465,7 +465,7 @@ static int setup_rt_frame(struct ksignal *kig, sigset_t *set, frame = get_sigframe(&ksig->ka, regs->regs[REG_SP], sizeof(*frame)); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; err |= __put_user(&frame->info, &frame->pinfo); diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c index c52bda4d2574..8ce90a7da67d 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -40,7 +40,7 @@ static int read_opcode(reg_size_t pc, insn_size_t *result_opcode, int from_user_ /* SHmedia */ aligned_pc = pc & ~3; if (from_user_mode) { - if (!access_ok(VERIFY_READ, aligned_pc, sizeof(insn_size_t))) { + if (!access_ok(aligned_pc, sizeof(insn_size_t))) { get_user_error = -EFAULT; } else { get_user_error = __get_user(opcode, (insn_size_t *)aligned_pc); @@ -180,7 +180,7 @@ static int misaligned_load(struct pt_regs *regs, if (user_mode(regs)) { __u64 buffer; - if (!access_ok(VERIFY_READ, (unsigned long) address, 1UL<thread.float_regs[0], &fpu->si_float_regs[0], diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c index 64ac8c0c1429..83db94c0b431 100644 --- a/arch/sparc/kernel/unaligned_32.c +++ b/arch/sparc/kernel/unaligned_32.c @@ -278,7 +278,6 @@ static inline int ok_for_user(struct pt_regs *regs, unsigned int insn, enum direction dir) { unsigned int reg; - int check = (dir == load) ? VERIFY_READ : VERIFY_WRITE; int size = ((insn >> 19) & 3) == 3 ? 8 : 4; if ((regs->pc | regs->npc) & 3) @@ -290,18 +289,18 @@ static inline int ok_for_user(struct pt_regs *regs, unsigned int insn, reg = (insn >> 25) & 0x1f; if (reg >= 16) { - if (!access_ok(check, WINREG_ADDR(reg - 16), size)) + if (!access_ok(WINREG_ADDR(reg - 16), size)) return -EFAULT; } reg = (insn >> 14) & 0x1f; if (reg >= 16) { - if (!access_ok(check, WINREG_ADDR(reg - 16), size)) + if (!access_ok(WINREG_ADDR(reg - 16), size)) return -EFAULT; } if (!(insn & 0x2000)) { reg = (insn & 0x1f); if (reg >= 16) { - if (!access_ok(check, WINREG_ADDR(reg - 16), size)) + if (!access_ok(WINREG_ADDR(reg - 16), size)) return -EFAULT; } } diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 1a1d88a4d940..5f47422401e1 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -66,7 +66,7 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef PTRACE_GETREGS case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - if (!access_ok(VERIFY_WRITE, p, MAX_REG_OFFSET)) { + if (!access_ok(p, MAX_REG_OFFSET)) { ret = -EIO; break; } @@ -81,7 +81,7 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef PTRACE_SETREGS case PTRACE_SETREGS: { /* Set all gp regs in the child. */ unsigned long tmp = 0; - if (!access_ok(VERIFY_READ, p, MAX_REG_OFFSET)) { + if (!access_ok(p, MAX_REG_OFFSET)) { ret = -EIO; break; } diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index 4ae51cf15ade..63be04809d40 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -117,7 +117,7 @@ asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs) frame = (struct rt_sigframe __user *)regs->UCreg_sp; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (restore_sigframe(regs, &frame->sig)) @@ -205,7 +205,7 @@ static inline void __user *get_sigframe(struct k_sigaction *ka, /* * Check that we can actually write to the signal frame. */ - if (!access_ok(VERIFY_WRITE, frame, framesize)) + if (!access_ok(frame, framesize)) frame = NULL; return frame; diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index d78bcc03e60e..d9d81ad7a400 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -99,7 +99,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) * sig_on_uaccess_err, this could go away. */ - if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { + if (!access_ok((void __user *)ptr, size)) { struct thread_struct *thread = ¤t->thread; thread->error_code = X86_PF_USER | X86_PF_WRITE; diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 8e02b30cf08e..f65b78d32f5e 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -176,10 +176,10 @@ static int aout_core_dump(struct coredump_params *cprm) /* make sure we actually have a data and stack area to dump */ set_fs(USER_DS); - if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), + if (!access_ok((void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) dump.u_dsize = 0; - if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), + if (!access_ok((void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) dump.u_ssize = 0; diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 86b1341cba9a..321fe5f5d0e9 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -119,7 +119,7 @@ asmlinkage long sys32_sigreturn(void) struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) || (_COMPAT_NSIG_WORDS > 1 @@ -147,7 +147,7 @@ asmlinkage long sys32_rt_sigreturn(void) frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; @@ -269,7 +269,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; if (__put_user(sig, &frame->sig)) @@ -349,7 +349,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; put_user_try { diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 11ef7b7c9cc8..a43212036257 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -75,7 +75,7 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) typeof(ubuf->st_gid) gid = 0; SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid)); - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) || + if (!access_ok(ubuf, sizeof(struct stat64)) || __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || __put_user(stat->ino, &ubuf->__st_ino) || __put_user(stat->ino, &ubuf->st_ino) || diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 7a659c74cd03..f57b94e02c57 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -182,7 +182,7 @@ static inline __wsum csum_and_copy_to_user(const void *src, __wsum ret; might_sleep(); - if (access_ok(VERIFY_WRITE, dst, len)) { + if (access_ok(dst, len)) { stac(); ret = csum_partial_copy_generic(src, (__force void *)dst, len, sum, NULL, err_ptr); diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index b3ec519e3982..4fe9e7fc74d3 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -37,7 +37,7 @@ void sync_initial_page_table(void); /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are - * done without a 'access_ok(VERIFY_WRITE,..)' + * done without a 'access_ok( ..)' */ #undef TEST_ACCESS_OK diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index b5e58cc0c5e7..3920f456db79 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -77,9 +77,6 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un /** * access_ok: - Checks if a user space pointer is valid - * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that - * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe - * to write to a block, it is always safe to read from it. * @addr: User space pointer to start of block to check * @size: Size of block to check * @@ -95,7 +92,7 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. */ -#define access_ok(type, addr, size) \ +#define access_ok(addr, size) \ ({ \ WARN_ON_IN_IRQ(); \ likely(!__range_not_ok(addr, size, user_addr_max())); \ @@ -670,7 +667,7 @@ extern void __cmpxchg_wrong_size(void) #define user_atomic_cmpxchg_inatomic(uval, ptr, old, new) \ ({ \ - access_ok(VERIFY_WRITE, (ptr), sizeof(*(ptr))) ? \ + access_ok((ptr), sizeof(*(ptr))) ? \ __user_atomic_cmpxchg_inatomic((uval), (ptr), \ (old), (new), sizeof(*(ptr))) : \ -EFAULT; \ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index d99a8ee9e185..f6a1d299627c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -164,7 +164,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); - if (!access_ok(VERIFY_WRITE, buf, size)) + if (!access_ok(buf, size)) return -EACCES; if (!static_cpu_has(X86_FEATURE_FPU)) @@ -281,7 +281,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) return 0; } - if (!access_ok(VERIFY_READ, buf, size)) + if (!access_ok(buf, size)) return -EACCES; fpu__initialize(fpu); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 92a3b312a53c..08dfd4c1a4f9 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -322,7 +322,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; if (__put_user(sig, &frame->sig)) @@ -385,7 +385,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; put_user_try { @@ -465,7 +465,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; if (ksig->ka.sa.sa_flags & SA_SIGINFO) { @@ -547,7 +547,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return -EFAULT; if (ksig->ka.sa.sa_flags & SA_SIGINFO) { @@ -610,7 +610,7 @@ SYSCALL_DEFINE0(sigreturn) frame = (struct sigframe __user *)(regs->sp - 8); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 && __copy_from_user(&set.sig[1], &frame->extramask, @@ -642,7 +642,7 @@ SYSCALL_DEFINE0(rt_sigreturn) unsigned long uc_flags; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; @@ -871,7 +871,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void) frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 7627455047c2..5c2d71a1dc06 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -177,7 +177,7 @@ copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; - if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + if (!access_ok(fp, sizeof(*frame))) return 0; ret = 1; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index c2fd39752da8..a092b6b40c6b 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -114,7 +114,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); user = vm86->user_vm86; - if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ? + if (!access_ok(user, vm86->vm86plus.is_vm86pus ? sizeof(struct vm86plus_struct) : sizeof(struct vm86_struct))) { pr_alert("could not access userspace vm86 info\n"); @@ -278,7 +278,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) if (vm86->saved_sp0) return -EPERM; - if (!access_ok(VERIFY_READ, user_vm86, plus ? + if (!access_ok(user_vm86, plus ? sizeof(struct vm86_struct) : sizeof(struct vm86plus_struct))) return -EFAULT; diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 8bd53589ecfb..a6a2b7dccbff 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -27,7 +27,7 @@ csum_partial_copy_from_user(const void __user *src, void *dst, might_sleep(); *errp = 0; - if (!likely(access_ok(VERIFY_READ, src, len))) + if (!likely(access_ok(src, len))) goto out_err; /* @@ -89,7 +89,7 @@ csum_partial_copy_to_user(const void *src, void __user *dst, might_sleep(); - if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { + if (unlikely(!access_ok(dst, len))) { *errp = -EFAULT; return 0; } diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 71fb58d44d58..bfd94e7812fc 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -67,7 +67,7 @@ unsigned long clear_user(void __user *to, unsigned long n) { might_fault(); - if (access_ok(VERIFY_WRITE, to, n)) + if (access_ok(to, n)) __do_clear_user(to, n); return n; } diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 1bd837cdc4b1..ee42bb0cbeb3 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -48,7 +48,7 @@ EXPORT_SYMBOL(__clear_user); unsigned long clear_user(void __user *to, unsigned long n) { - if (access_ok(VERIFY_WRITE, to, n)) + if (access_ok(to, n)) return __clear_user(to, n); return n; } diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index c8b1b31ed7c4..f98a0c956764 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -104,7 +104,7 @@ static inline bool seg_writable(struct desc_struct *d) #define instruction_address (*(struct address *)&I387->soft.fip) #define operand_address (*(struct address *)&I387->soft.foo) -#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \ +#define FPU_access_ok(y,z) if ( !access_ok(y,z) ) \ math_abort(FPU_info,SIGSEGV) #define FPU_abort math_abort(FPU_info, SIGSEGV) @@ -119,7 +119,7 @@ static inline bool seg_writable(struct desc_struct *d) /* A simpler test than access_ok() can probably be done for FPU_code_access_ok() because the only possible error is to step past the upper boundary of a legal code area. */ -#define FPU_code_access_ok(z) FPU_access_ok(VERIFY_READ,(void __user *)FPU_EIP,z) +#define FPU_code_access_ok(z) FPU_access_ok((void __user *)FPU_EIP,z) #endif #define FPU_get_user(x,y) get_user((x),(y)) diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index f821a9cd7753..f15263e158e8 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -251,7 +251,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, break; case 024: /* fldcw */ RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, data_address, 2); + FPU_access_ok(data_address, 2); FPU_get_user(control_word, (unsigned short __user *)data_address); RE_ENTRANT_CHECK_ON; @@ -291,7 +291,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, break; case 034: /* fstcw m16int */ RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, data_address, 2); + FPU_access_ok(data_address, 2); FPU_put_user(control_word, (unsigned short __user *)data_address); RE_ENTRANT_CHECK_ON; @@ -305,7 +305,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, break; case 036: /* fstsw m2byte */ RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, data_address, 2); + FPU_access_ok(data_address, 2); FPU_put_user(status_word(), (unsigned short __user *)data_address); RE_ENTRANT_CHECK_ON; diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index d40ff45497b9..f3779743d15e 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -84,7 +84,7 @@ int FPU_load_extended(long double __user *s, int stnr) FPU_REG *sti_ptr = &st(stnr); RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 10); + FPU_access_ok(s, 10); __copy_from_user(sti_ptr, s, 10); RE_ENTRANT_CHECK_ON; @@ -98,7 +98,7 @@ int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data) unsigned m64, l64; RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, dfloat, 8); + FPU_access_ok(dfloat, 8); FPU_get_user(m64, 1 + (unsigned long __user *)dfloat); FPU_get_user(l64, (unsigned long __user *)dfloat); RE_ENTRANT_CHECK_ON; @@ -159,7 +159,7 @@ int FPU_load_single(float __user *single, FPU_REG *loaded_data) int exp, tag, negative; RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, single, 4); + FPU_access_ok(single, 4); FPU_get_user(m32, (unsigned long __user *)single); RE_ENTRANT_CHECK_ON; @@ -214,7 +214,7 @@ int FPU_load_int64(long long __user *_s) FPU_REG *st0_ptr = &st(0); RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, _s, 8); + FPU_access_ok(_s, 8); if (copy_from_user(&s, _s, 8)) FPU_abort; RE_ENTRANT_CHECK_ON; @@ -243,7 +243,7 @@ int FPU_load_int32(long __user *_s, FPU_REG *loaded_data) int negative; RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, _s, 4); + FPU_access_ok(_s, 4); FPU_get_user(s, _s); RE_ENTRANT_CHECK_ON; @@ -271,7 +271,7 @@ int FPU_load_int16(short __user *_s, FPU_REG *loaded_data) int s, negative; RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, _s, 2); + FPU_access_ok(_s, 2); /* Cast as short to get the sign extended. */ FPU_get_user(s, _s); RE_ENTRANT_CHECK_ON; @@ -304,7 +304,7 @@ int FPU_load_bcd(u_char __user *s) int sign; RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 10); + FPU_access_ok(s, 10); RE_ENTRANT_CHECK_ON; for (pos = 8; pos >= 0; pos--) { l *= 10; @@ -345,7 +345,7 @@ int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, if (st0_tag != TAG_Empty) { RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 10); + FPU_access_ok(d, 10); FPU_put_user(st0_ptr->sigl, (unsigned long __user *)d); FPU_put_user(st0_ptr->sigh, @@ -364,7 +364,7 @@ int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, /* The masked response */ /* Put out the QNaN indefinite */ RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 10); + FPU_access_ok(d, 10); FPU_put_user(0, (unsigned long __user *)d); FPU_put_user(0xc0000000, 1 + (unsigned long __user *)d); FPU_put_user(0xffff, 4 + (short __user *)d); @@ -539,7 +539,7 @@ denormal_arg: /* The masked response */ /* Put out the QNaN indefinite */ RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, dfloat, 8); + FPU_access_ok(dfloat, 8); FPU_put_user(0, (unsigned long __user *)dfloat); FPU_put_user(0xfff80000, 1 + (unsigned long __user *)dfloat); @@ -552,7 +552,7 @@ denormal_arg: l[1] |= 0x80000000; RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, dfloat, 8); + FPU_access_ok(dfloat, 8); FPU_put_user(l[0], (unsigned long __user *)dfloat); FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat); RE_ENTRANT_CHECK_ON; @@ -724,7 +724,7 @@ int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single) /* The masked response */ /* Put out the QNaN indefinite */ RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, single, 4); + FPU_access_ok(single, 4); FPU_put_user(0xffc00000, (unsigned long __user *)single); RE_ENTRANT_CHECK_ON; @@ -742,7 +742,7 @@ int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single) templ |= 0x80000000; RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, single, 4); + FPU_access_ok(single, 4); FPU_put_user(templ, (unsigned long __user *)single); RE_ENTRANT_CHECK_ON; @@ -791,7 +791,7 @@ int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d) } RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 8); + FPU_access_ok(d, 8); if (copy_to_user(d, &tll, 8)) FPU_abort; RE_ENTRANT_CHECK_ON; @@ -838,7 +838,7 @@ int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d) } RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 4); + FPU_access_ok(d, 4); FPU_put_user(t.sigl, (unsigned long __user *)d); RE_ENTRANT_CHECK_ON; @@ -884,7 +884,7 @@ int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d) } RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 2); + FPU_access_ok(d, 2); FPU_put_user((short)t.sigl, d); RE_ENTRANT_CHECK_ON; @@ -925,7 +925,7 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d) if (control_word & CW_Invalid) { /* Produce the QNaN "indefinite" */ RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 10); + FPU_access_ok(d, 10); for (i = 0; i < 7; i++) FPU_put_user(0, d + i); /* These bytes "undefined" */ FPU_put_user(0xc0, d + 7); /* This byte "undefined" */ @@ -941,7 +941,7 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d) } RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 10); + FPU_access_ok(d, 10); RE_ENTRANT_CHECK_ON; for (i = 0; i < 9; i++) { b = FPU_div_small(&ll, 10); @@ -1034,7 +1034,7 @@ u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s) ((addr_modes.default_mode == PM16) ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) { RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 0x0e); + FPU_access_ok(s, 0x0e); FPU_get_user(control_word, (unsigned short __user *)s); FPU_get_user(partial_status, (unsigned short __user *)(s + 2)); FPU_get_user(tag_word, (unsigned short __user *)(s + 4)); @@ -1056,7 +1056,7 @@ u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s) } } else { RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 0x1c); + FPU_access_ok(s, 0x1c); FPU_get_user(control_word, (unsigned short __user *)s); FPU_get_user(partial_status, (unsigned short __user *)(s + 4)); FPU_get_user(tag_word, (unsigned short __user *)(s + 8)); @@ -1125,7 +1125,7 @@ void frstor(fpu_addr_modes addr_modes, u_char __user *data_address) /* Copy all registers in stack order. */ RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 80); + FPU_access_ok(s, 80); __copy_from_user(register_base + offset, s, other); if (offset) __copy_from_user(register_base, s + other, offset); @@ -1146,7 +1146,7 @@ u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d) ((addr_modes.default_mode == PM16) ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) { RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 14); + FPU_access_ok(d, 14); #ifdef PECULIAR_486 FPU_put_user(control_word & ~0xe080, (unsigned long __user *)d); #else @@ -1174,7 +1174,7 @@ u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d) d += 0x0e; } else { RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 7 * 4); + FPU_access_ok(d, 7 * 4); #ifdef PECULIAR_486 control_word &= ~0xe080; /* An 80486 sets nearly all of the reserved bits to 1. */ @@ -1204,7 +1204,7 @@ void fsave(fpu_addr_modes addr_modes, u_char __user *data_address) d = fstenv(addr_modes, data_address); RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 80); + FPU_access_ok(d, 80); /* Copy all registers in stack order. */ if (__copy_to_user(d, register_base + offset, other)) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 2385538e8065..de1851d15699 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -495,7 +495,7 @@ static int get_bt_addr(struct mm_struct *mm, unsigned long bd_entry; unsigned long bt_addr; - if (!access_ok(VERIFY_READ, (bd_entry_ptr), sizeof(*bd_entry_ptr))) + if (!access_ok((bd_entry_ptr), sizeof(*bd_entry_ptr))) return -EFAULT; while (1) { diff --git a/arch/x86/um/asm/checksum_32.h b/arch/x86/um/asm/checksum_32.h index 83a75f8a1233..b9ac7c9eb72c 100644 --- a/arch/x86/um/asm/checksum_32.h +++ b/arch/x86/um/asm/checksum_32.h @@ -43,7 +43,7 @@ static __inline__ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum sum, int *err_ptr) { - if (access_ok(VERIFY_WRITE, dst, len)) { + if (access_ok(dst, len)) { if (copy_to_user(dst, src, len)) { *err_ptr = -EFAULT; return (__force __wsum)-1; diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 727ed442e0a5..8b4a71efe7ee 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -367,7 +367,7 @@ int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig, /* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */ stack_top = ((stack_top + 4) & -16UL) - 4; frame = (struct sigframe __user *) stack_top - 1; - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return 1; restorer = frame->retcode; @@ -412,7 +412,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, stack_top &= -8UL; frame = (struct rt_sigframe __user *) stack_top - 1; - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) return 1; restorer = frame->retcode; @@ -497,7 +497,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, /* Subtract 128 for a red zone and 8 for proper alignment */ frame = (struct rt_sigframe __user *) ((unsigned long) frame - 128 - 8); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto out; if (ksig->ka.sa.sa_flags & SA_SIGINFO) { diff --git a/arch/xtensa/include/asm/checksum.h b/arch/xtensa/include/asm/checksum.h index 3ae74d7e074b..f302ef57973a 100644 --- a/arch/xtensa/include/asm/checksum.h +++ b/arch/xtensa/include/asm/checksum.h @@ -243,7 +243,7 @@ static __inline__ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum sum, int *err_ptr) { - if (access_ok(VERIFY_WRITE, dst, len)) + if (access_ok(dst, len)) return csum_partial_copy_generic(src,dst,len,sum,NULL,err_ptr); if (len) diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h index fd0eef6b8e7c..505d09eff184 100644 --- a/arch/xtensa/include/asm/futex.h +++ b/arch/xtensa/include/asm/futex.h @@ -93,7 +93,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, { int ret = 0; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; #if !XCHAL_HAVE_S32C1I diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h index d11ef2939652..4b2480304bc3 100644 --- a/arch/xtensa/include/asm/uaccess.h +++ b/arch/xtensa/include/asm/uaccess.h @@ -42,7 +42,7 @@ #define __user_ok(addr, size) \ (((size) <= TASK_SIZE)&&((addr) <= TASK_SIZE-(size))) #define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size))) -#define access_ok(type, addr, size) __access_ok((unsigned long)(addr), (size)) +#define access_ok(addr, size) __access_ok((unsigned long)(addr), (size)) #define user_addr_max() (uaccess_kernel() ? ~0UL : TASK_SIZE) @@ -86,7 +86,7 @@ extern long __put_user_bad(void); ({ \ long __pu_err = -EFAULT; \ __typeof__(*(ptr)) *__pu_addr = (ptr); \ - if (access_ok(VERIFY_WRITE, __pu_addr, size)) \ + if (access_ok(__pu_addr, size)) \ __put_user_size((x), __pu_addr, (size), __pu_err); \ __pu_err; \ }) @@ -183,7 +183,7 @@ __asm__ __volatile__( \ ({ \ long __gu_err = -EFAULT, __gu_val = 0; \ const __typeof__(*(ptr)) *__gu_addr = (ptr); \ - if (access_ok(VERIFY_READ, __gu_addr, size)) \ + if (access_ok(__gu_addr, size)) \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ __gu_err; \ @@ -269,7 +269,7 @@ __xtensa_clear_user(void *addr, unsigned long size) static inline unsigned long clear_user(void *addr, unsigned long size) { - if (access_ok(VERIFY_WRITE, addr, size)) + if (access_ok(addr, size)) return __xtensa_clear_user(addr, size); return size ? -EFAULT : 0; } @@ -284,7 +284,7 @@ extern long __strncpy_user(char *, const char *, long); static inline long strncpy_from_user(char *dst, const char *src, long count) { - if (access_ok(VERIFY_READ, src, 1)) + if (access_ok(src, 1)) return __strncpy_user(dst, src, count); return -EFAULT; } diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index 74e1682876ac..dc22a238ed9c 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -251,7 +251,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3, frame = (struct rt_sigframe __user *) regs->areg[1]; - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) @@ -348,7 +348,7 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, if (regs->depc > 64) panic ("Double exception sys_sigreturn\n"); - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) { + if (!access_ok(frame, sizeof(*frame))) { return -EFAULT; } diff --git a/arch/xtensa/kernel/stacktrace.c b/arch/xtensa/kernel/stacktrace.c index 0df4080fa20f..174c11f13bba 100644 --- a/arch/xtensa/kernel/stacktrace.c +++ b/arch/xtensa/kernel/stacktrace.c @@ -91,7 +91,7 @@ void xtensa_backtrace_user(struct pt_regs *regs, unsigned int depth, pc = MAKE_PC_FROM_RA(a0, pc); /* Check if the region is OK to access. */ - if (!access_ok(VERIFY_READ, &SPILL_SLOT(a1, 0), 8)) + if (!access_ok(&SPILL_SLOT(a1, 0), 8)) return; /* Copy a1, a0 from user space stack frame. */ if (__get_user(a0, &SPILL_SLOT(a1, 0)) || diff --git a/drivers/acpi/acpi_dbg.c b/drivers/acpi/acpi_dbg.c index f21c99ec46ee..a2dcd62ea32f 100644 --- a/drivers/acpi/acpi_dbg.c +++ b/drivers/acpi/acpi_dbg.c @@ -614,7 +614,7 @@ static ssize_t acpi_aml_read(struct file *file, char __user *buf, if (!count) return 0; - if (!access_ok(VERIFY_WRITE, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; while (count > 0) { @@ -684,7 +684,7 @@ static ssize_t acpi_aml_write(struct file *file, const char __user *buf, if (!count) return 0; - if (!access_ok(VERIFY_READ, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; while (count > 0) { diff --git a/drivers/char/generic_nvram.c b/drivers/char/generic_nvram.c index 14e728fbb8a0..ff5394f47587 100644 --- a/drivers/char/generic_nvram.c +++ b/drivers/char/generic_nvram.c @@ -44,7 +44,7 @@ static ssize_t read_nvram(struct file *file, char __user *buf, unsigned int i; char __user *p = buf; - if (!access_ok(VERIFY_WRITE, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; if (*ppos >= nvram_len) return 0; @@ -62,7 +62,7 @@ static ssize_t write_nvram(struct file *file, const char __user *buf, const char __user *p = buf; char c; - if (!access_ok(VERIFY_READ, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; if (*ppos >= nvram_len) return 0; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 7b4e4de778e4..b08dc50f9f26 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -609,7 +609,7 @@ static ssize_t read_port(struct file *file, char __user *buf, unsigned long i = *ppos; char __user *tmp = buf; - if (!access_ok(VERIFY_WRITE, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; while (count-- > 0 && i < 65536) { if (__put_user(inb(i), tmp) < 0) @@ -627,7 +627,7 @@ static ssize_t write_port(struct file *file, const char __user *buf, unsigned long i = *ppos; const char __user *tmp = buf; - if (!access_ok(VERIFY_READ, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; while (count-- > 0 && i < 65536) { char c; diff --git a/drivers/char/nwflash.c b/drivers/char/nwflash.c index a284ae25e69a..76fb434068d4 100644 --- a/drivers/char/nwflash.c +++ b/drivers/char/nwflash.c @@ -167,7 +167,7 @@ static ssize_t flash_write(struct file *file, const char __user *buf, if (count > gbFlashSize - p) count = gbFlashSize - p; - if (!access_ok(VERIFY_READ, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; /* diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c index 809507bf8f1c..7a4eb86aedac 100644 --- a/drivers/char/pcmcia/cm4000_cs.c +++ b/drivers/char/pcmcia/cm4000_cs.c @@ -1445,11 +1445,11 @@ static long cmm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) _IOC_DIR(cmd), _IOC_READ, _IOC_WRITE, size, cmd); if (_IOC_DIR(cmd) & _IOC_READ) { - if (!access_ok(VERIFY_WRITE, argp, size)) + if (!access_ok(argp, size)) goto out; } if (_IOC_DIR(cmd) & _IOC_WRITE) { - if (!access_ok(VERIFY_READ, argp, size)) + if (!access_ok(argp, size)) goto out; } rc = 0; diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index d64a78ccc03e..b16be8a11d92 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -364,7 +364,7 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp) goto cmd; /* allocate a physically contiguous buffer to store the CSR blob */ - if (!access_ok(VERIFY_WRITE, input.address, input.length) || + if (!access_ok(input.address, input.length) || input.length > SEV_FW_BLOB_MAX_SIZE) { ret = -EFAULT; goto e_free; @@ -644,14 +644,14 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp) /* Allocate a physically contiguous buffer to store the PDH blob. */ if ((input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE) || - !access_ok(VERIFY_WRITE, input.pdh_cert_address, input.pdh_cert_len)) { + !access_ok(input.pdh_cert_address, input.pdh_cert_len)) { ret = -EFAULT; goto e_free; } /* Allocate a physically contiguous buffer to store the cert chain blob. */ if ((input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE) || - !access_ok(VERIFY_WRITE, input.cert_chain_address, input.cert_chain_len)) { + !access_ok(input.cert_chain_address, input.cert_chain_len)) { ret = -EFAULT; goto e_free; } diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index d8e185582642..16a7045736a9 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1094,7 +1094,7 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg) return -EINVAL; p = (struct fw_cdev_iso_packet __user *)u64_to_uptr(a->packets); - if (!access_ok(VERIFY_READ, p, a->size)) + if (!access_ok(p, a->size)) return -EFAULT; end = (void __user *)p + a->size; diff --git a/drivers/firmware/efi/test/efi_test.c b/drivers/firmware/efi/test/efi_test.c index 769640940c9f..51ecf7d6da48 100644 --- a/drivers/firmware/efi/test/efi_test.c +++ b/drivers/firmware/efi/test/efi_test.c @@ -68,7 +68,7 @@ copy_ucs2_from_user_len(efi_char16_t **dst, efi_char16_t __user *src, return 0; } - if (!access_ok(VERIFY_READ, src, 1)) + if (!access_ok(src, 1)) return -EFAULT; buf = memdup_user(src, len); @@ -89,7 +89,7 @@ copy_ucs2_from_user_len(efi_char16_t **dst, efi_char16_t __user *src, static inline int get_ucs2_strsize_from_user(efi_char16_t __user *src, size_t *len) { - if (!access_ok(VERIFY_READ, src, 1)) + if (!access_ok(src, 1)) return -EFAULT; *len = user_ucs2_strsize(src); @@ -116,7 +116,7 @@ copy_ucs2_from_user(efi_char16_t **dst, efi_char16_t __user *src) { size_t len; - if (!access_ok(VERIFY_READ, src, 1)) + if (!access_ok(src, 1)) return -EFAULT; len = user_ucs2_strsize(src); @@ -140,7 +140,7 @@ copy_ucs2_to_user_len(efi_char16_t __user *dst, efi_char16_t *src, size_t len) if (!src) return 0; - if (!access_ok(VERIFY_WRITE, dst, 1)) + if (!access_ok(dst, 1)) return -EFAULT; return copy_to_user(dst, src, len); diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c index 025aba3ea76c..e18a786fc943 100644 --- a/drivers/fpga/dfl-afu-dma-region.c +++ b/drivers/fpga/dfl-afu-dma-region.c @@ -369,7 +369,7 @@ int afu_dma_map_region(struct dfl_feature_platform_data *pdata, if (user_addr + length < user_addr) return -EINVAL; - if (!access_ok(VERIFY_WRITE, (void __user *)(unsigned long)user_addr, + if (!access_ok((void __user *)(unsigned long)user_addr, length)) return -EINVAL; diff --git a/drivers/fpga/dfl-fme-pr.c b/drivers/fpga/dfl-fme-pr.c index fe5a5578fbf7..d9ca9554844a 100644 --- a/drivers/fpga/dfl-fme-pr.c +++ b/drivers/fpga/dfl-fme-pr.c @@ -99,8 +99,7 @@ static int fme_pr(struct platform_device *pdev, unsigned long arg) return -EINVAL; } - if (!access_ok(VERIFY_READ, - (void __user *)(unsigned long)port_pr.buffer_address, + if (!access_ok((void __user *)(unsigned long)port_pr.buffer_address, port_pr.buffer_size)) return -EFAULT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 3623538baf6f..be68752c3469 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -158,8 +158,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, } if ((args->ring_base_address) && - (!access_ok(VERIFY_WRITE, - (const void __user *) args->ring_base_address, + (!access_ok((const void __user *) args->ring_base_address, sizeof(uint64_t)))) { pr_err("Can't access ring base address\n"); return -EFAULT; @@ -170,31 +169,27 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, return -EINVAL; } - if (!access_ok(VERIFY_WRITE, - (const void __user *) args->read_pointer_address, + if (!access_ok((const void __user *) args->read_pointer_address, sizeof(uint32_t))) { pr_err("Can't access read pointer\n"); return -EFAULT; } - if (!access_ok(VERIFY_WRITE, - (const void __user *) args->write_pointer_address, + if (!access_ok((const void __user *) args->write_pointer_address, sizeof(uint32_t))) { pr_err("Can't access write pointer\n"); return -EFAULT; } if (args->eop_buffer_address && - !access_ok(VERIFY_WRITE, - (const void __user *) args->eop_buffer_address, + !access_ok((const void __user *) args->eop_buffer_address, sizeof(uint32_t))) { pr_debug("Can't access eop buffer"); return -EFAULT; } if (args->ctx_save_restore_address && - !access_ok(VERIFY_WRITE, - (const void __user *) args->ctx_save_restore_address, + !access_ok((const void __user *) args->ctx_save_restore_address, sizeof(uint32_t))) { pr_debug("Can't access ctx save restore buffer"); return -EFAULT; @@ -365,8 +360,7 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, } if ((args->ring_base_address) && - (!access_ok(VERIFY_WRITE, - (const void __user *) args->ring_base_address, + (!access_ok((const void __user *) args->ring_base_address, sizeof(uint64_t)))) { pr_err("Can't access ring base address\n"); return -EFAULT; diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c index 892c1d9304bb..642d0e70d0f8 100644 --- a/drivers/gpu/drm/armada/armada_gem.c +++ b/drivers/gpu/drm/armada/armada_gem.c @@ -334,7 +334,7 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data, ptr = (char __user *)(uintptr_t)args->ptr; - if (!access_ok(VERIFY_READ, ptr, args->size)) + if (!access_ok(ptr, args->size)) return -EFAULT; ret = fault_in_pages_readable(ptr, args->size); diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c index ffa8dc35515f..46f48f245eb5 100644 --- a/drivers/gpu/drm/drm_file.c +++ b/drivers/gpu/drm/drm_file.c @@ -525,7 +525,7 @@ ssize_t drm_read(struct file *filp, char __user *buffer, struct drm_device *dev = file_priv->minor->dev; ssize_t ret; - if (!access_ok(VERIFY_WRITE, buffer, count)) + if (!access_ok(buffer, count)) return -EFAULT; ret = mutex_lock_interruptible(&file_priv->event_read_lock); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_drv.c b/drivers/gpu/drm/etnaviv/etnaviv_drv.c index 96efc84396bf..18c27f795cf6 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_drv.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_drv.c @@ -339,7 +339,6 @@ static int etnaviv_ioctl_gem_userptr(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_etnaviv_gem_userptr *args = data; - int access; if (args->flags & ~(ETNA_USERPTR_READ|ETNA_USERPTR_WRITE) || args->flags == 0) @@ -351,12 +350,7 @@ static int etnaviv_ioctl_gem_userptr(struct drm_device *dev, void *data, args->user_ptr & ~PAGE_MASK) return -EINVAL; - if (args->flags & ETNA_USERPTR_WRITE) - access = VERIFY_WRITE; - else - access = VERIFY_READ; - - if (!access_ok(access, (void __user *)(unsigned long)args->user_ptr, + if (!access_ok((void __user *)(unsigned long)args->user_ptr, args->user_size)) return -EFAULT; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index a9de07bb72c8..216f52b744a6 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1282,8 +1282,7 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data, if (args->size == 0) return 0; - if (!access_ok(VERIFY_WRITE, - u64_to_user_ptr(args->data_ptr), + if (!access_ok(u64_to_user_ptr(args->data_ptr), args->size)) return -EFAULT; @@ -1609,9 +1608,7 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, if (args->size == 0) return 0; - if (!access_ok(VERIFY_READ, - u64_to_user_ptr(args->data_ptr), - args->size)) + if (!access_ok(u64_to_user_ptr(args->data_ptr), args->size)) return -EFAULT; obj = i915_gem_object_lookup(file, args->handle); diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 8ff6b581cf1c..fee66ccebed6 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1447,7 +1447,7 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma) * to read. However, if the array is not writable the user loses * the updated relocation values. */ - if (unlikely(!access_ok(VERIFY_READ, urelocs, remain*sizeof(*urelocs)))) + if (unlikely(!access_ok(urelocs, remain*sizeof(*urelocs)))) return -EFAULT; do { @@ -1554,7 +1554,7 @@ static int check_relocations(const struct drm_i915_gem_exec_object2 *entry) addr = u64_to_user_ptr(entry->relocs_ptr); size *= sizeof(struct drm_i915_gem_relocation_entry); - if (!access_ok(VERIFY_READ, addr, size)) + if (!access_ok(addr, size)) return -EFAULT; end = addr + size; @@ -2090,7 +2090,7 @@ get_fence_array(struct drm_i915_gem_execbuffer2 *args, return ERR_PTR(-EINVAL); user = u64_to_user_ptr(args->cliprects_ptr); - if (!access_ok(VERIFY_READ, user, nfences * sizeof(*user))) + if (!access_ok(user, nfences * sizeof(*user))) return ERR_PTR(-EFAULT); fences = kvmalloc_array(nfences, sizeof(*fences), diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 3df77020aada..9558582c105e 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -789,8 +789,7 @@ i915_gem_userptr_ioctl(struct drm_device *dev, if (offset_in_page(args->user_ptr | args->user_size)) return -EINVAL; - if (!access_ok(args->flags & I915_USERPTR_READ_ONLY ? VERIFY_READ : VERIFY_WRITE, - (char __user *)(unsigned long)args->user_ptr, args->user_size)) + if (!access_ok((char __user *)(unsigned long)args->user_ptr, args->user_size)) return -EFAULT; if (args->flags & I915_USERPTR_READ_ONLY) { diff --git a/drivers/gpu/drm/i915/i915_ioc32.c b/drivers/gpu/drm/i915/i915_ioc32.c index 0e5c580d117c..e869daf9c8a9 100644 --- a/drivers/gpu/drm/i915/i915_ioc32.c +++ b/drivers/gpu/drm/i915/i915_ioc32.c @@ -52,7 +52,7 @@ static int compat_i915_getparam(struct file *file, unsigned int cmd, return -EFAULT; request = compat_alloc_user_space(sizeof(*request)); - if (!access_ok(VERIFY_WRITE, request, sizeof(*request)) || + if (!access_ok(request, sizeof(*request)) || __put_user(req32.param, &request->param) || __put_user((void __user *)(unsigned long)req32.value, &request->value)) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 4529edfdcfc8..2b2eb57ca71f 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -3052,7 +3052,7 @@ static struct i915_oa_reg *alloc_oa_regs(struct drm_i915_private *dev_priv, if (!n_regs) return NULL; - if (!access_ok(VERIFY_READ, regs, n_regs * sizeof(u32) * 2)) + if (!access_ok(regs, n_regs * sizeof(u32) * 2)) return ERR_PTR(-EFAULT); /* No is_valid function means we're not allowing any register to be programmed. */ diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c index 6fc4b8eeab42..fe56465cdfd6 100644 --- a/drivers/gpu/drm/i915/i915_query.c +++ b/drivers/gpu/drm/i915/i915_query.c @@ -46,7 +46,7 @@ static int query_topology_info(struct drm_i915_private *dev_priv, if (topo.flags != 0) return -EINVAL; - if (!access_ok(VERIFY_WRITE, u64_to_user_ptr(query_item->data_ptr), + if (!access_ok(u64_to_user_ptr(query_item->data_ptr), total_length)) return -EFAULT; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index a28465d90529..12b983fc0b56 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -77,7 +77,7 @@ void msm_gem_submit_free(struct msm_gem_submit *submit) static inline unsigned long __must_check copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - if (access_ok(VERIFY_READ, from, n)) + if (access_ok(from, n)) return __copy_from_user_inatomic(to, from, n); return -EFAULT; } diff --git a/drivers/gpu/drm/qxl/qxl_ioctl.c b/drivers/gpu/drm/qxl/qxl_ioctl.c index 6e828158bcb0..d410e2925162 100644 --- a/drivers/gpu/drm/qxl/qxl_ioctl.c +++ b/drivers/gpu/drm/qxl/qxl_ioctl.c @@ -163,8 +163,7 @@ static int qxl_process_single_command(struct qxl_device *qdev, if (cmd->command_size > PAGE_SIZE - sizeof(union qxl_release_info)) return -EINVAL; - if (!access_ok(VERIFY_READ, - u64_to_user_ptr(cmd->command), + if (!access_ok(u64_to_user_ptr(cmd->command), cmd->command_size)) return -EFAULT; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 9f9172eb1512..fb0007aa0c27 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -611,8 +611,7 @@ static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr, if (hdr->out_words * 8 < method_elm->resp_size) return -ENOSPC; - if (!access_ok(VERIFY_WRITE, - u64_to_user_ptr(ex_hdr->response), + if (!access_ok(u64_to_user_ptr(ex_hdr->response), (hdr->out_words + ex_hdr->provider_out_words) * 8)) return -EFAULT; } else { diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index dbe7d14a5c76..0cd71ce7cc71 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -232,7 +232,7 @@ static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) } /* Verify that access is OK for the user buffer */ - if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, + if (!access_ok((void __user *)vaddr, npages * PAGE_SIZE)) { dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n", (void *)vaddr, npages); diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 98e1ce14fa2a..78fa634de98a 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -343,7 +343,7 @@ static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp, /* virtual address of first page in transfer */ vaddr = ti->tidvaddr; - if (!access_ok(VERIFY_WRITE, (void __user *) vaddr, + if (!access_ok((void __user *) vaddr, cnt * PAGE_SIZE)) { ret = -EFAULT; goto done; diff --git a/drivers/macintosh/ans-lcd.c b/drivers/macintosh/ans-lcd.c index ef0c2366cf59..400960cf04d5 100644 --- a/drivers/macintosh/ans-lcd.c +++ b/drivers/macintosh/ans-lcd.c @@ -64,7 +64,7 @@ anslcd_write( struct file * file, const char __user * buf, printk(KERN_DEBUG "LCD: write\n"); #endif - if (!access_ok(VERIFY_READ, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; mutex_lock(&anslcd_mutex); diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index ac0cf37d6239..21d532a78fa4 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -2188,7 +2188,7 @@ pmu_read(struct file *file, char __user *buf, if (count < 1 || !pp) return -EINVAL; - if (!access_ok(VERIFY_WRITE, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; spin_lock_irqsave(&pp->lock, flags); diff --git a/drivers/media/pci/ivtv/ivtvfb.c b/drivers/media/pci/ivtv/ivtvfb.c index 3e02de02ffdd..8ec2525d8ef5 100644 --- a/drivers/media/pci/ivtv/ivtvfb.c +++ b/drivers/media/pci/ivtv/ivtvfb.c @@ -356,7 +356,7 @@ static int ivtvfb_prep_frame(struct ivtv *itv, int cmd, void __user *source, IVTVFB_WARN("ivtvfb_prep_frame: Count not a multiple of 4 (%d)\n", count); /* Check Source */ - if (!access_ok(VERIFY_READ, source + dest_offset, count)) { + if (!access_ok(source + dest_offset, count)) { IVTVFB_WARN("Invalid userspace pointer %p\n", source); IVTVFB_DEBUG_WARN("access_ok() failed for offset 0x%08lx source %p count %d\n", diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index fe4577a46869..73dac1d8d4f6 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -158,7 +158,7 @@ static int get_v4l2_window32(struct v4l2_window __user *p64, compat_caddr_t p; u32 clipcount; - if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || copy_in_user(&p64->w, &p32->w, sizeof(p32->w)) || assign_in_user(&p64->field, &p32->field) || assign_in_user(&p64->chromakey, &p32->chromakey) || @@ -283,7 +283,7 @@ static int __bufsize_v4l2_format(struct v4l2_format32 __user *p32, u32 *size) static int bufsize_v4l2_format(struct v4l2_format32 __user *p32, u32 *size) { - if (!access_ok(VERIFY_READ, p32, sizeof(*p32))) + if (!access_ok(p32, sizeof(*p32))) return -EFAULT; return __bufsize_v4l2_format(p32, size); } @@ -335,7 +335,7 @@ static int get_v4l2_format32(struct v4l2_format __user *p64, struct v4l2_format32 __user *p32, void __user *aux_buf, u32 aux_space) { - if (!access_ok(VERIFY_READ, p32, sizeof(*p32))) + if (!access_ok(p32, sizeof(*p32))) return -EFAULT; return __get_v4l2_format32(p64, p32, aux_buf, aux_space); } @@ -343,7 +343,7 @@ static int get_v4l2_format32(struct v4l2_format __user *p64, static int bufsize_v4l2_create(struct v4l2_create_buffers32 __user *p32, u32 *size) { - if (!access_ok(VERIFY_READ, p32, sizeof(*p32))) + if (!access_ok(p32, sizeof(*p32))) return -EFAULT; return __bufsize_v4l2_format(&p32->format, size); } @@ -352,7 +352,7 @@ static int get_v4l2_create32(struct v4l2_create_buffers __user *p64, struct v4l2_create_buffers32 __user *p32, void __user *aux_buf, u32 aux_space) { - if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || copy_in_user(p64, p32, offsetof(struct v4l2_create_buffers32, format))) return -EFAULT; @@ -404,7 +404,7 @@ static int __put_v4l2_format32(struct v4l2_format __user *p64, static int put_v4l2_format32(struct v4l2_format __user *p64, struct v4l2_format32 __user *p32) { - if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32))) + if (!access_ok(p32, sizeof(*p32))) return -EFAULT; return __put_v4l2_format32(p64, p32); } @@ -412,7 +412,7 @@ static int put_v4l2_format32(struct v4l2_format __user *p64, static int put_v4l2_create32(struct v4l2_create_buffers __user *p64, struct v4l2_create_buffers32 __user *p32) { - if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || copy_in_user(p32, p64, offsetof(struct v4l2_create_buffers32, format)) || assign_in_user(&p32->capabilities, &p64->capabilities) || @@ -434,7 +434,7 @@ static int get_v4l2_standard32(struct v4l2_standard __user *p64, struct v4l2_standard32 __user *p32) { /* other fields are not set by the user, nor used by the driver */ - if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || assign_in_user(&p64->index, &p32->index)) return -EFAULT; return 0; @@ -443,7 +443,7 @@ static int get_v4l2_standard32(struct v4l2_standard __user *p64, static int put_v4l2_standard32(struct v4l2_standard __user *p64, struct v4l2_standard32 __user *p32) { - if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || assign_in_user(&p32->index, &p64->index) || assign_in_user(&p32->id, &p64->id) || copy_in_user(p32->name, p64->name, sizeof(p32->name)) || @@ -560,7 +560,7 @@ static int bufsize_v4l2_buffer(struct v4l2_buffer32 __user *p32, u32 *size) u32 type; u32 length; - if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || get_user(type, &p32->type) || get_user(length, &p32->length)) return -EFAULT; @@ -593,7 +593,7 @@ static int get_v4l2_buffer32(struct v4l2_buffer __user *p64, compat_caddr_t p; int ret; - if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || assign_in_user(&p64->index, &p32->index) || get_user(type, &p32->type) || put_user(type, &p64->type) || @@ -632,7 +632,7 @@ static int get_v4l2_buffer32(struct v4l2_buffer __user *p64, return -EFAULT; uplane32 = compat_ptr(p); - if (!access_ok(VERIFY_READ, uplane32, + if (!access_ok(uplane32, num_planes * sizeof(*uplane32))) return -EFAULT; @@ -691,7 +691,7 @@ static int put_v4l2_buffer32(struct v4l2_buffer __user *p64, compat_caddr_t p; int ret; - if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || assign_in_user(&p32->index, &p64->index) || get_user(type, &p64->type) || put_user(type, &p32->type) || @@ -781,7 +781,7 @@ static int get_v4l2_framebuffer32(struct v4l2_framebuffer __user *p64, { compat_caddr_t tmp; - if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || get_user(tmp, &p32->base) || put_user_force(compat_ptr(tmp), &p64->base) || assign_in_user(&p64->capability, &p32->capability) || @@ -796,7 +796,7 @@ static int put_v4l2_framebuffer32(struct v4l2_framebuffer __user *p64, { void *base; - if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || get_user(base, &p64->base) || put_user(ptr_to_compat((void __user *)base), &p32->base) || assign_in_user(&p32->capability, &p64->capability) || @@ -893,7 +893,7 @@ static int bufsize_v4l2_ext_controls(struct v4l2_ext_controls32 __user *p32, { u32 count; - if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || get_user(count, &p32->count)) return -EFAULT; if (count > V4L2_CID_MAX_CTRLS) @@ -913,7 +913,7 @@ static int get_v4l2_ext_controls32(struct file *file, u32 n; compat_caddr_t p; - if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || assign_in_user(&p64->which, &p32->which) || get_user(count, &p32->count) || put_user(count, &p64->count) || @@ -929,7 +929,7 @@ static int get_v4l2_ext_controls32(struct file *file, if (get_user(p, &p32->controls)) return -EFAULT; ucontrols = compat_ptr(p); - if (!access_ok(VERIFY_READ, ucontrols, count * sizeof(*ucontrols))) + if (!access_ok(ucontrols, count * sizeof(*ucontrols))) return -EFAULT; if (aux_space < count * sizeof(*kcontrols)) return -EFAULT; @@ -979,7 +979,7 @@ static int put_v4l2_ext_controls32(struct file *file, * with __user causes smatch warnings, so instead declare it * without __user and cast it as a userspace pointer where needed. */ - if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || assign_in_user(&p32->which, &p64->which) || get_user(count, &p64->count) || put_user(count, &p32->count) || @@ -994,7 +994,7 @@ static int put_v4l2_ext_controls32(struct file *file, if (get_user(p, &p32->controls)) return -EFAULT; ucontrols = compat_ptr(p); - if (!access_ok(VERIFY_WRITE, ucontrols, count * sizeof(*ucontrols))) + if (!access_ok(ucontrols, count * sizeof(*ucontrols))) return -EFAULT; for (n = 0; n < count; n++) { @@ -1043,7 +1043,7 @@ struct v4l2_event32 { static int put_v4l2_event32(struct v4l2_event __user *p64, struct v4l2_event32 __user *p32) { - if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || assign_in_user(&p32->type, &p64->type) || copy_in_user(&p32->u, &p64->u, sizeof(p64->u)) || assign_in_user(&p32->pending, &p64->pending) || @@ -1069,7 +1069,7 @@ static int get_v4l2_edid32(struct v4l2_edid __user *p64, { compat_uptr_t tmp; - if (!access_ok(VERIFY_READ, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || assign_in_user(&p64->pad, &p32->pad) || assign_in_user(&p64->start_block, &p32->start_block) || assign_in_user_cast(&p64->blocks, &p32->blocks) || @@ -1085,7 +1085,7 @@ static int put_v4l2_edid32(struct v4l2_edid __user *p64, { void *edid; - if (!access_ok(VERIFY_WRITE, p32, sizeof(*p32)) || + if (!access_ok(p32, sizeof(*p32)) || assign_in_user(&p32->pad, &p64->pad) || assign_in_user(&p32->start_block, &p64->start_block) || assign_in_user(&p32->blocks, &p64->blocks) || diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c index 5da1f3e3f997..997f92543dd4 100644 --- a/drivers/misc/vmw_vmci/vmci_host.c +++ b/drivers/misc/vmw_vmci/vmci_host.c @@ -236,7 +236,7 @@ static int vmci_host_setup_notify(struct vmci_ctx *context, * about the size. */ BUILD_BUG_ON(sizeof(bool) != sizeof(u8)); - if (!access_ok(VERIFY_WRITE, (void __user *)uva, sizeof(u8))) + if (!access_ok((void __user *)uva, sizeof(u8))) return VMCI_ERROR_GENERIC; /* diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index 7ac035af39f0..6fa1627ce08d 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -52,7 +52,7 @@ static ssize_t proc_bus_pci_read(struct file *file, char __user *buf, nbytes = size - pos; cnt = nbytes; - if (!access_ok(VERIFY_WRITE, buf, cnt)) + if (!access_ok(buf, cnt)) return -EINVAL; pci_config_pm_runtime_get(dev); @@ -125,7 +125,7 @@ static ssize_t proc_bus_pci_write(struct file *file, const char __user *buf, nbytes = size - pos; cnt = nbytes; - if (!access_ok(VERIFY_READ, buf, cnt)) + if (!access_ok(buf, cnt)) return -EINVAL; pci_config_pm_runtime_get(dev); diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 7c639006252e..321bc673c417 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -416,8 +416,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, if (unlikely(bufflen == 0)) return 0; /* Check the buffer range for access */ - if (unlikely(!access_ok(is_write ? VERIFY_WRITE : VERIFY_READ, - buffer, bufflen))) + if (unlikely(!access_ok(buffer, bufflen))) return -EFAULT; address = (unsigned long)buffer; diff --git a/drivers/pnp/isapnp/proc.c b/drivers/pnp/isapnp/proc.c index 262285e48a09..051613140812 100644 --- a/drivers/pnp/isapnp/proc.c +++ b/drivers/pnp/isapnp/proc.c @@ -47,7 +47,7 @@ static ssize_t isapnp_proc_bus_read(struct file *file, char __user * buf, nbytes = size - pos; cnt = nbytes; - if (!access_ok(VERIFY_WRITE, buf, cnt)) + if (!access_ok(buf, cnt)) return -EINVAL; isapnp_cfg_begin(dev->card->number, dev->number); diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index 7c4673308f5b..e338d7a4f571 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -3600,7 +3600,7 @@ static long pmcraid_ioctl_passthrough( u32 ioasc; int request_size; int buffer_size; - u8 access, direction; + u8 direction; int rc = 0; /* If IOA reset is in progress, wait 10 secs for reset to complete */ @@ -3649,10 +3649,8 @@ static long pmcraid_ioctl_passthrough( request_size = le32_to_cpu(buffer->ioarcb.data_transfer_length); if (buffer->ioarcb.request_flags0 & TRANSFER_DIR_WRITE) { - access = VERIFY_READ; direction = DMA_TO_DEVICE; } else { - access = VERIFY_WRITE; direction = DMA_FROM_DEVICE; } diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index cc30fccc1a2e..840d96fe81bc 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -221,7 +221,7 @@ int scsi_ioctl(struct scsi_device *sdev, int cmd, void __user *arg) switch (cmd) { case SCSI_IOCTL_GET_IDLUN: - if (!access_ok(VERIFY_WRITE, arg, sizeof(struct scsi_idlun))) + if (!access_ok(arg, sizeof(struct scsi_idlun))) return -EFAULT; __put_user((sdev->id & 0xff) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 4e27460ec926..d3f15319b9b3 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -434,7 +434,7 @@ sg_read(struct file *filp, char __user *buf, size_t count, loff_t * ppos) SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_read: count=%d\n", (int) count)); - if (!access_ok(VERIFY_WRITE, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; if (sfp->force_packid && (count >= SZ_SG_HEADER)) { old_hdr = kmalloc(SZ_SG_HEADER, GFP_KERNEL); @@ -632,7 +632,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) scsi_block_when_processing_errors(sdp->device))) return -ENXIO; - if (!access_ok(VERIFY_READ, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; /* protects following copy_from_user()s + get_user()s */ if (count < SZ_SG_HEADER) return -EIO; @@ -729,7 +729,7 @@ sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf, if (count < SZ_SG_IO_HDR) return -EINVAL; - if (!access_ok(VERIFY_READ, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; /* protects following copy_from_user()s + get_user()s */ sfp->cmd_q = 1; /* when sg_io_hdr seen, set command queuing on */ @@ -768,7 +768,7 @@ sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf, sg_remove_request(sfp, srp); return -EMSGSIZE; } - if (!access_ok(VERIFY_READ, hp->cmdp, hp->cmd_len)) { + if (!access_ok(hp->cmdp, hp->cmd_len)) { sg_remove_request(sfp, srp); return -EFAULT; /* protects following copy_from_user()s + get_user()s */ } @@ -922,7 +922,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) return -ENODEV; if (!scsi_block_when_processing_errors(sdp->device)) return -ENXIO; - if (!access_ok(VERIFY_WRITE, p, SZ_SG_IO_HDR)) + if (!access_ok(p, SZ_SG_IO_HDR)) return -EFAULT; result = sg_new_write(sfp, filp, p, SZ_SG_IO_HDR, 1, read_only, 1, &srp); @@ -968,7 +968,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) case SG_GET_LOW_DMA: return put_user((int) sdp->device->host->unchecked_isa_dma, ip); case SG_GET_SCSI_ID: - if (!access_ok(VERIFY_WRITE, p, sizeof (sg_scsi_id_t))) + if (!access_ok(p, sizeof (sg_scsi_id_t))) return -EFAULT; else { sg_scsi_id_t __user *sg_idp = p; @@ -997,7 +997,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) sfp->force_packid = val ? 1 : 0; return 0; case SG_GET_PACK_ID: - if (!access_ok(VERIFY_WRITE, ip, sizeof (int))) + if (!access_ok(ip, sizeof (int))) return -EFAULT; read_lock_irqsave(&sfp->rq_list_lock, iflags); list_for_each_entry(srp, &sfp->rq_list, entry) { @@ -1078,7 +1078,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) val = (sdp->device ? 1 : 0); return put_user(val, ip); case SG_GET_REQUEST_TABLE: - if (!access_ok(VERIFY_WRITE, p, SZ_SG_REQ_INFO * SG_MAX_QUEUE)) + if (!access_ok(p, SZ_SG_REQ_INFO * SG_MAX_QUEUE)) return -EFAULT; else { sg_req_info_t *rinfo; diff --git a/drivers/staging/comedi/comedi_compat32.c b/drivers/staging/comedi/comedi_compat32.c index fa9d239474ee..36a3564ba1fb 100644 --- a/drivers/staging/comedi/comedi_compat32.c +++ b/drivers/staging/comedi/comedi_compat32.c @@ -102,8 +102,8 @@ static int compat_chaninfo(struct file *file, unsigned long arg) chaninfo = compat_alloc_user_space(sizeof(*chaninfo)); /* Copy chaninfo structure. Ignore unused members. */ - if (!access_ok(VERIFY_READ, chaninfo32, sizeof(*chaninfo32)) || - !access_ok(VERIFY_WRITE, chaninfo, sizeof(*chaninfo))) + if (!access_ok(chaninfo32, sizeof(*chaninfo32)) || + !access_ok(chaninfo, sizeof(*chaninfo))) return -EFAULT; err = 0; @@ -136,8 +136,8 @@ static int compat_rangeinfo(struct file *file, unsigned long arg) rangeinfo = compat_alloc_user_space(sizeof(*rangeinfo)); /* Copy rangeinfo structure. */ - if (!access_ok(VERIFY_READ, rangeinfo32, sizeof(*rangeinfo32)) || - !access_ok(VERIFY_WRITE, rangeinfo, sizeof(*rangeinfo))) + if (!access_ok(rangeinfo32, sizeof(*rangeinfo32)) || + !access_ok(rangeinfo, sizeof(*rangeinfo))) return -EFAULT; err = 0; @@ -163,8 +163,8 @@ static int get_compat_cmd(struct comedi_cmd __user *cmd, } temp; /* Copy cmd structure. */ - if (!access_ok(VERIFY_READ, cmd32, sizeof(*cmd32)) || - !access_ok(VERIFY_WRITE, cmd, sizeof(*cmd))) + if (!access_ok(cmd32, sizeof(*cmd32)) || + !access_ok(cmd, sizeof(*cmd))) return -EFAULT; err = 0; @@ -217,8 +217,8 @@ static int put_compat_cmd(struct comedi32_cmd_struct __user *cmd32, * Assume the pointer values are already valid. * (Could use ptr_to_compat() to set them.) */ - if (!access_ok(VERIFY_READ, cmd, sizeof(*cmd)) || - !access_ok(VERIFY_WRITE, cmd32, sizeof(*cmd32))) + if (!access_ok(cmd, sizeof(*cmd)) || + !access_ok(cmd32, sizeof(*cmd32))) return -EFAULT; err = 0; @@ -317,8 +317,8 @@ static int get_compat_insn(struct comedi_insn __user *insn, /* Copy insn structure. Ignore the unused members. */ err = 0; - if (!access_ok(VERIFY_READ, insn32, sizeof(*insn32)) || - !access_ok(VERIFY_WRITE, insn, sizeof(*insn))) + if (!access_ok(insn32, sizeof(*insn32)) || + !access_ok(insn, sizeof(*insn))) return -EFAULT; err |= __get_user(temp.uint, &insn32->insn); @@ -350,7 +350,7 @@ static int compat_insnlist(struct file *file, unsigned long arg) insnlist32 = compat_ptr(arg); /* Get 32-bit insnlist structure. */ - if (!access_ok(VERIFY_READ, insnlist32, sizeof(*insnlist32))) + if (!access_ok(insnlist32, sizeof(*insnlist32))) return -EFAULT; err = 0; @@ -365,7 +365,7 @@ static int compat_insnlist(struct file *file, unsigned long arg) insn[n_insns])); /* Set native insnlist structure. */ - if (!access_ok(VERIFY_WRITE, &s->insnlist, sizeof(s->insnlist))) + if (!access_ok(&s->insnlist, sizeof(s->insnlist))) return -EFAULT; err |= __put_user(n_insns, &s->insnlist.n_insns); diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c index 99460af61b77..4164414d4c64 100644 --- a/drivers/tty/n_hdlc.c +++ b/drivers/tty/n_hdlc.c @@ -573,7 +573,7 @@ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file, return -EIO; /* verify user access to buffer */ - if (!access_ok(VERIFY_WRITE, buf, nr)) { + if (!access_ok(buf, nr)) { printk(KERN_WARNING "%s(%d) n_hdlc_tty_read() can't verify user " "buffer\n", __FILE__, __LINE__); return -EFAULT; diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c index 3de3c750b5f6..44f28a114c2b 100644 --- a/drivers/usb/core/devices.c +++ b/drivers/usb/core/devices.c @@ -598,7 +598,7 @@ static ssize_t usb_device_read(struct file *file, char __user *buf, return -EINVAL; if (nbytes <= 0) return 0; - if (!access_ok(VERIFY_WRITE, buf, nbytes)) + if (!access_ok(buf, nbytes)) return -EFAULT; mutex_lock(&usb_bus_idr_lock); diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index a75bc0b8a50f..d65566341dd1 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1094,7 +1094,7 @@ static int proc_control(struct usb_dev_state *ps, void __user *arg) ctrl.bRequestType, ctrl.bRequest, ctrl.wValue, ctrl.wIndex, ctrl.wLength); if (ctrl.bRequestType & 0x80) { - if (ctrl.wLength && !access_ok(VERIFY_WRITE, ctrl.data, + if (ctrl.wLength && !access_ok(ctrl.data, ctrl.wLength)) { ret = -EINVAL; goto done; @@ -1183,7 +1183,7 @@ static int proc_bulk(struct usb_dev_state *ps, void __user *arg) } tmo = bulk.timeout; if (bulk.ep & 0x80) { - if (len1 && !access_ok(VERIFY_WRITE, bulk.data, len1)) { + if (len1 && !access_ok(bulk.data, len1)) { ret = -EINVAL; goto done; } @@ -1584,8 +1584,7 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb } if (uurb->buffer_length > 0 && - !access_ok(is_in ? VERIFY_WRITE : VERIFY_READ, - uurb->buffer, uurb->buffer_length)) { + !access_ok(uurb->buffer, uurb->buffer_length)) { ret = -EFAULT; goto error; } diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c index 54e859dcb25c..75b113a5b25c 100644 --- a/drivers/usb/gadget/function/f_hid.c +++ b/drivers/usb/gadget/function/f_hid.c @@ -252,7 +252,7 @@ static ssize_t f_hidg_read(struct file *file, char __user *buffer, if (!count) return 0; - if (!access_ok(VERIFY_WRITE, buffer, count)) + if (!access_ok(buffer, count)) return -EFAULT; spin_lock_irqsave(&hidg->read_spinlock, flags); @@ -339,7 +339,7 @@ static ssize_t f_hidg_write(struct file *file, const char __user *buffer, unsigned long flags; ssize_t status = -ENOMEM; - if (!access_ok(VERIFY_READ, buffer, count)) + if (!access_ok(buffer, count)) return -EFAULT; spin_lock_irqsave(&hidg->write_spinlock, flags); diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c index 11247322d587..660712e0bf98 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c @@ -88,7 +88,7 @@ static ssize_t queue_dbg_read(struct file *file, char __user *buf, size_t len, remaining, actual = 0; char tmpbuf[38]; - if (!access_ok(VERIFY_WRITE, buf, nbytes)) + if (!access_ok(buf, nbytes)) return -EFAULT; inode_lock(file_inode(file)); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 55e5aa662ad5..9f7942cbcbb2 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -655,7 +655,7 @@ static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz) a + (unsigned long)log_base > ULONG_MAX) return false; - return access_ok(VERIFY_WRITE, log_base + a, + return access_ok(log_base + a, (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); } @@ -681,7 +681,7 @@ static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem, return false; - if (!access_ok(VERIFY_WRITE, (void __user *)a, + if (!access_ok((void __user *)a, node->size)) return false; else if (log_all && !log_access_ok(log_base, @@ -973,10 +973,10 @@ static bool umem_access_ok(u64 uaddr, u64 size, int access) return false; if ((access & VHOST_ACCESS_RO) && - !access_ok(VERIFY_READ, (void __user *)a, size)) + !access_ok((void __user *)a, size)) return false; if ((access & VHOST_ACCESS_WO) && - !access_ok(VERIFY_WRITE, (void __user *)a, size)) + !access_ok((void __user *)a, size)) return false; return true; } @@ -1185,10 +1185,10 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, { size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; - return access_ok(VERIFY_READ, desc, num * sizeof *desc) && - access_ok(VERIFY_READ, avail, + return access_ok(desc, num * sizeof *desc) && + access_ok(avail, sizeof *avail + num * sizeof *avail->ring + s) && - access_ok(VERIFY_WRITE, used, + access_ok(used, sizeof *used + num * sizeof *used->ring + s); } @@ -1814,7 +1814,7 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq) goto err; vq->signalled_used_valid = false; if (!vq->iotlb && - !access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) { + !access_ok(&vq->used->idx, sizeof vq->used->idx)) { r = -EFAULT; goto err; } diff --git a/drivers/video/fbdev/amifb.c b/drivers/video/fbdev/amifb.c index 0777aff211e5..758457026694 100644 --- a/drivers/video/fbdev/amifb.c +++ b/drivers/video/fbdev/amifb.c @@ -1855,7 +1855,7 @@ static int ami_get_var_cursorinfo(struct fb_var_cursorinfo *var, var->yspot = par->crsr.spot_y; if (size > var->height * var->width) return -ENAMETOOLONG; - if (!access_ok(VERIFY_WRITE, data, size)) + if (!access_ok(data, size)) return -EFAULT; delta = 1 << par->crsr.fmode; lspr = lofsprite + (delta << 1); @@ -1935,7 +1935,7 @@ static int ami_set_var_cursorinfo(struct fb_var_cursorinfo *var, return -EINVAL; if (!var->height) return -EINVAL; - if (!access_ok(VERIFY_READ, data, var->width * var->height)) + if (!access_ok(data, var->width * var->height)) return -EFAULT; delta = 1 << fmode; lofsprite = shfsprite = (u_short *)spritememory; diff --git a/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c b/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c index a3edb20ea4c3..53f93616c671 100644 --- a/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c +++ b/drivers/video/fbdev/omap2/omapfb/omapfb-ioctl.c @@ -493,7 +493,7 @@ static int omapfb_memory_read(struct fb_info *fbi, if (!display || !display->driver->memory_read) return -ENOENT; - if (!access_ok(VERIFY_WRITE, mr->buffer, mr->buffer_size)) + if (!access_ok(mr->buffer, mr->buffer_size)) return -EFAULT; if (mr->w > 4096 || mr->h > 4096) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 7e6e682104dc..b24ddac1604b 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -459,14 +459,14 @@ static long privcmd_ioctl_mmap_batch( return -EFAULT; /* Returns per-frame error in m.arr. */ m.err = NULL; - if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) + if (!access_ok(m.arr, m.num * sizeof(*m.arr))) return -EFAULT; break; case 2: if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) return -EFAULT; /* Returns per-frame error code in m.err. */ - if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) + if (!access_ok(m.err, m.num * (sizeof(*m.err)))) return -EFAULT; break; default: @@ -661,7 +661,7 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) goto out; } - if (!access_ok(VERIFY_WRITE, kbufs[i].uptr, + if (!access_ok(kbufs[i].uptr, kbufs[i].size)) { rc = -EFAULT; goto out; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index c3deb2e35f20..ca9725f18e00 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -78,9 +78,9 @@ static int aout_core_dump(struct coredump_params *cprm) /* make sure we actually have a data and stack area to dump */ set_fs(USER_DS); - if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) + if (!access_ok(START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) dump.u_dsize = 0; - if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) + if (!access_ok(START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) dump.u_ssize = 0; set_fs(KERNEL_DS); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1b15b43905f8..7ea2d6b1f170 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6646,7 +6646,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) goto out; } - if (!access_ok(VERIFY_READ, arg->clone_sources, + if (!access_ok(arg->clone_sources, sizeof(*arg->clone_sources) * arg->clone_sources_count)) { ret = -EFAULT; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8a5a1010886b..7ebae39fbcb3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2172,7 +2172,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, return -EINVAL; /* Verify that the area passed by the user is writeable */ - if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) + if (!access_ok(events, maxevents * sizeof(struct epoll_event))) return -EFAULT; /* Get the "struct file *" for the eventpoll file */ diff --git a/fs/fat/dir.c b/fs/fat/dir.c index c8366cb8eccd..0295a095b920 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -805,7 +805,7 @@ static long fat_dir_ioctl(struct file *filp, unsigned int cmd, return fat_generic_ioctl(filp, cmd, arg); } - if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) + if (!access_ok(d1, sizeof(struct __fat_dirent[2]))) return -EFAULT; /* * Yes, we don't need this put_user() absolutely. However old @@ -845,7 +845,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, return fat_generic_ioctl(filp, cmd, (unsigned long)arg); } - if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) + if (!access_ok(d1, sizeof(struct compat_dirent[2]))) return -EFAULT; /* * Yes, we don't need this put_user() absolutely. However old diff --git a/fs/ioctl.c b/fs/ioctl.c index d64f622cac8b..fef3a6bf7c78 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -203,7 +203,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) fieinfo.fi_extents_start = ufiemap->fm_extents; if (fiemap.fm_extent_count != 0 && - !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, + !access_ok(fieinfo.fi_extents_start, fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) return -EFAULT; diff --git a/fs/namespace.c b/fs/namespace.c index a7f91265ea67..97b7c7098c3d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2651,7 +2651,7 @@ static long exact_copy_from_user(void *to, const void __user * from, const char __user *f = from; char c; - if (!access_ok(VERIFY_READ, from, n)) + if (!access_ok(from, n)) return n; current->kernel_uaccess_faults_ok++; diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b8fa1487cd85..8decbe95dcec 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -254,7 +254,7 @@ static ssize_t dlmfs_file_read(struct file *filp, if (!count) return 0; - if (!access_ok(VERIFY_WRITE, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; /* don't read past the lvb */ @@ -302,7 +302,7 @@ static ssize_t dlmfs_file_write(struct file *filp, if (!count) return 0; - if (!access_ok(VERIFY_READ, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; /* don't write past the lvb */ diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 24db02de1787..97fcef74e5af 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -33,7 +33,7 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf, record.size = count; /* check outside lock, page in any data. write_user also checks */ - if (!access_ok(VERIFY_READ, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; mutex_lock(&pmsg_lock); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index c11711c2cc83..f375c0735351 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -357,7 +357,7 @@ int notrace persistent_ram_write_user(struct persistent_ram_zone *prz, int rem, ret = 0, c = count; size_t start; - if (unlikely(!access_ok(VERIFY_READ, s, count))) + if (unlikely(!access_ok(s, count))) return -EFAULT; if (unlikely(c > prz->buffer_size)) { s += c - prz->buffer_size; diff --git a/fs/read_write.c b/fs/read_write.c index 58f30537c47a..ff3c5e6f87cf 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -442,7 +442,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; - if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) + if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); @@ -538,7 +538,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; - if (unlikely(!access_ok(VERIFY_READ, buf, count))) + if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); @@ -718,9 +718,6 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, return ret; } -/* A write operation does a read from user space and vice versa */ -#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) - /** * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace * into the kernel and check that it is valid. @@ -810,7 +807,7 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, goto out; } if (type >= 0 - && unlikely(!access_ok(vrfy_dir(type), buf, len))) { + && unlikely(!access_ok(buf, len))) { ret = -EFAULT; goto out; } @@ -856,7 +853,7 @@ ssize_t compat_rw_copy_check_uvector(int type, *ret_pointer = iov; ret = -EFAULT; - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + if (!access_ok(uvector, nr_segs*sizeof(*uvector))) goto out; /* @@ -881,7 +878,7 @@ ssize_t compat_rw_copy_check_uvector(int type, if (len < 0) /* size_t not fitting in compat_ssize_t .. */ goto out; if (type >= 0 && - !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + !access_ok(compat_ptr(buf), len)) { ret = -EFAULT; goto out; } diff --git a/fs/readdir.c b/fs/readdir.c index d97f548e6323..2f6a4534e0df 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -105,7 +105,7 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen, } buf->result++; dirent = buf->dirent; - if (!access_ok(VERIFY_WRITE, dirent, + if (!access_ok(dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; @@ -221,7 +221,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, }; int error; - if (!access_ok(VERIFY_WRITE, dirent, count)) + if (!access_ok(dirent, count)) return -EFAULT; f = fdget_pos(fd); @@ -304,7 +304,7 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, }; int error; - if (!access_ok(VERIFY_WRITE, dirent, count)) + if (!access_ok(dirent, count)) return -EFAULT; f = fdget_pos(fd); @@ -365,7 +365,7 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name, } buf->result++; dirent = buf->dirent; - if (!access_ok(VERIFY_WRITE, dirent, + if (!access_ok(dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; @@ -475,7 +475,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, }; int error; - if (!access_ok(VERIFY_WRITE, dirent, count)) + if (!access_ok(dirent, count)) return -EFAULT; f = fdget_pos(fd); diff --git a/fs/select.c b/fs/select.c index 4c8652390c94..d0f35dbc0e8f 100644 --- a/fs/select.c +++ b/fs/select.c @@ -381,9 +381,6 @@ typedef struct { #define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) /* - * We do a VERIFY_WRITE here even though we are only reading this time: - * we'll write to it eventually.. - * * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. */ static inline @@ -782,7 +779,7 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, sigset_t __user *up = NULL; if (sig) { - if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t)) + if (!access_ok(sig, sizeof(void *)+sizeof(size_t)) || __get_user(up, (sigset_t __user * __user *)sig) || __get_user(sigsetsize, (size_t __user *)(sig+sizeof(void *)))) @@ -802,7 +799,7 @@ SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, sigset_t __user *up = NULL; if (sig) { - if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t)) + if (!access_ok(sig, sizeof(void *)+sizeof(size_t)) || __get_user(up, (sigset_t __user * __user *)sig) || __get_user(sigsetsize, (size_t __user *)(sig+sizeof(void *)))) @@ -1368,7 +1365,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, compat_uptr_t up = 0; if (sig) { - if (!access_ok(VERIFY_READ, sig, + if (!access_ok(sig, sizeof(compat_uptr_t)+sizeof(compat_size_t)) || __get_user(up, (compat_uptr_t __user *)sig) || __get_user(sigsetsize, @@ -1390,7 +1387,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, compat_uptr_t up = 0; if (sig) { - if (!access_ok(VERIFY_READ, sig, + if (!access_ok(sig, sizeof(compat_uptr_t)+sizeof(compat_size_t)) || __get_user(up, (compat_uptr_t __user *)sig) || __get_user(sigsetsize, diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index 6b2e63df2739..d82c78a79da5 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -35,7 +35,7 @@ static inline void set_fs(mm_segment_t fs) #define segment_eq(a, b) ((a).seg == (b).seg) #endif -#define access_ok(type, addr, size) __access_ok((unsigned long)(addr),(size)) +#define access_ok(addr, size) __access_ok((unsigned long)(addr),(size)) /* * The architecture should really override this if possible, at least @@ -78,7 +78,7 @@ static inline int __access_ok(unsigned long addr, unsigned long size) ({ \ void __user *__p = (ptr); \ might_fault(); \ - access_ok(VERIFY_WRITE, __p, sizeof(*ptr)) ? \ + access_ok(__p, sizeof(*ptr)) ? \ __put_user((x), ((__typeof__(*(ptr)) __user *)__p)) : \ -EFAULT; \ }) @@ -140,7 +140,7 @@ extern int __put_user_bad(void) __attribute__((noreturn)); ({ \ const void __user *__p = (ptr); \ might_fault(); \ - access_ok(VERIFY_READ, __p, sizeof(*ptr)) ? \ + access_ok(__p, sizeof(*ptr)) ? \ __get_user((x), (__typeof__(*(ptr)) __user *)__p) :\ ((x) = (__typeof__(*(ptr)))0,-EFAULT); \ }) @@ -175,7 +175,7 @@ __strncpy_from_user(char *dst, const char __user *src, long count) static inline long strncpy_from_user(char *dst, const char __user *src, long count) { - if (!access_ok(VERIFY_READ, src, 1)) + if (!access_ok(src, 1)) return -EFAULT; return __strncpy_from_user(dst, src, count); } @@ -196,7 +196,7 @@ strncpy_from_user(char *dst, const char __user *src, long count) */ static inline long strnlen_user(const char __user *src, long n) { - if (!access_ok(VERIFY_READ, src, 1)) + if (!access_ok(src, 1)) return 0; return __strnlen_user(src, n); } @@ -217,7 +217,7 @@ static inline __must_check unsigned long clear_user(void __user *to, unsigned long n) { might_fault(); - if (!access_ok(VERIFY_WRITE, to, n)) + if (!access_ok(to, n)) return n; return __clear_user(to, n); diff --git a/include/linux/regset.h b/include/linux/regset.h index 494cedaafdf2..a85c1707285c 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -376,7 +376,7 @@ static inline int copy_regset_to_user(struct task_struct *target, if (!regset->get) return -EOPNOTSUPP; - if (!access_ok(VERIFY_WRITE, data, size)) + if (!access_ok(data, size)) return -EFAULT; return regset->get(target, regset, offset, size, NULL, data); @@ -402,7 +402,7 @@ static inline int copy_regset_from_user(struct task_struct *target, if (!regset->set) return -EOPNOTSUPP; - if (!access_ok(VERIFY_READ, data, size)) + if (!access_ok(data, size)) return -EFAULT; return regset->set(target, regset, offset, size, NULL, data); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index efe79c1cdd47..bf2523867a02 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -6,9 +6,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define uaccess_kernel() segment_eq(get_fs(), KERNEL_DS) #include @@ -111,7 +108,7 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); - if (likely(access_ok(VERIFY_READ, from, n))) { + if (likely(access_ok(from, n))) { kasan_check_write(to, n); res = raw_copy_from_user(to, from, n); } @@ -129,7 +126,7 @@ static inline unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); - if (access_ok(VERIFY_WRITE, to, n)) { + if (access_ok(to, n)) { kasan_check_read(from, n); n = raw_copy_to_user(to, from, n); } @@ -160,7 +157,7 @@ static __always_inline unsigned long __must_check copy_in_user(void __user *to, const void __user *from, unsigned long n) { might_fault(); - if (access_ok(VERIFY_WRITE, to, n) && access_ok(VERIFY_READ, from, n)) + if (access_ok(to, n) && access_ok(from, n)) n = raw_copy_in_user(to, from, n); return n; } diff --git a/include/net/checksum.h b/include/net/checksum.h index aef2b2bb6603..0f319e13be2c 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -30,7 +30,7 @@ static inline __wsum csum_and_copy_from_user (const void __user *src, void *dst, int len, __wsum sum, int *err_ptr) { - if (access_ok(VERIFY_READ, src, len)) + if (access_ok(src, len)) return csum_partial_copy_from_user(src, dst, len, sum, err_ptr); if (len) @@ -46,7 +46,7 @@ static __inline__ __wsum csum_and_copy_to_user { sum = csum_partial(src, len, sum); - if (access_ok(VERIFY_WRITE, dst, len)) { + if (access_ok(dst, len)) { if (copy_to_user(dst, src, len) == 0) return sum; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0607db304def..b155cd17c1bd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -79,7 +79,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ return -E2BIG; - if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size))) + if (unlikely(!access_ok(uaddr, actual_size))) return -EFAULT; if (actual_size <= expected_size) diff --git a/kernel/compat.c b/kernel/compat.c index 089d00d0da9c..705d4ae6c018 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -95,28 +95,28 @@ int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) { - return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || + return (!access_ok(ctv, sizeof(*ctv)) || __get_user(tv->tv_sec, &ctv->tv_sec) || __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) { - return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || + return (!access_ok(ctv, sizeof(*ctv)) || __put_user(tv->tv_sec, &ctv->tv_sec) || __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) { - return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || + return (!access_ok(cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) { - return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || + return (!access_ok(cts, sizeof(*cts)) || __put_user(ts->tv_sec, &cts->tv_sec) || __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } @@ -335,7 +335,7 @@ int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event) { memset(event, 0, sizeof(*event)); - return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) || + return (!access_ok(u_event, sizeof(*u_event)) || __get_user(event->sigev_value.sival_int, &u_event->sigev_value.sival_int) || __get_user(event->sigev_signo, &u_event->sigev_signo) || @@ -354,7 +354,7 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) + if (!access_ok(umask, bitmap_size / 8)) return -EFAULT; user_access_begin(); @@ -384,7 +384,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) + if (!access_ok(umask, bitmap_size / 8)) return -EFAULT; user_access_begin(); @@ -438,7 +438,7 @@ void __user *compat_alloc_user_space(unsigned long len) ptr = arch_compat_alloc_user_space(len); - if (unlikely(!access_ok(VERIFY_WRITE, ptr, len))) + if (unlikely(!access_ok(ptr, len))) return NULL; return ptr; diff --git a/kernel/events/core.c b/kernel/events/core.c index 67ecac337374..3cd13a30f732 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10135,7 +10135,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, u32 size; int ret; - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + if (!access_ok(uattr, PERF_ATTR_SIZE_VER0)) return -EFAULT; /* diff --git a/kernel/exit.c b/kernel/exit.c index 0e21e6d21f35..8a01b671dc1f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1604,7 +1604,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) + if (!access_ok(infop, sizeof(*infop))) return -EFAULT; user_access_begin(); @@ -1732,7 +1732,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; - if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) + if (!access_ok(infop, sizeof(*infop))) return -EFAULT; user_access_begin(); diff --git a/kernel/futex.c b/kernel/futex.c index 054105854e0e..be3bff2315ff 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -481,13 +481,18 @@ static void drop_futex_key_refs(union futex_key *key) } } +enum futex_access { + FUTEX_READ, + FUTEX_WRITE +}; + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. - * @rw: mapping needs to be read/write (values: VERIFY_READ, - * VERIFY_WRITE) + * @rw: mapping needs to be read/write (values: FUTEX_READ, + * FUTEX_WRITE) * * Return: a negative error code or 0 * @@ -500,7 +505,7 @@ static void drop_futex_key_refs(union futex_key *key) * lock_page() might sleep, the caller should not hold a spinlock. */ static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -516,7 +521,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) return -EINVAL; address -= key->both.offset; - if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) + if (unlikely(!access_ok(uaddr, sizeof(u32)))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) @@ -546,7 +551,7 @@ again: * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. */ - if (err == -EFAULT && rw == VERIFY_READ) { + if (err == -EFAULT && rw == FUTEX_READ) { err = get_user_pages_fast(address, 1, 0, &page); ro = 1; } @@ -1583,7 +1588,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); if (unlikely(ret != 0)) goto out; @@ -1642,7 +1647,7 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) oparg = 1 << oparg; } - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); @@ -1682,10 +1687,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, DEFINE_WAKE_Q(wake_q); retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) goto out_put_key1; @@ -1961,11 +1966,11 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, } retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) goto out; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, - requeue_pi ? VERIFY_WRITE : VERIFY_READ); + requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) goto out_put_key1; @@ -2634,7 +2639,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * while the syscall executes. */ retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); if (unlikely(ret != 0)) return ret; @@ -2793,7 +2798,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, } retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; @@ -2972,7 +2977,7 @@ retry: if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); if (ret) return ret; @@ -3199,7 +3204,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ rt_mutex_init_waiter(&rt_waiter); - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1306fe0c1dc6..d3d170374ceb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1466,7 +1466,7 @@ int do_syslog(int type, char __user *buf, int len, int source) return -EINVAL; if (!len) return 0; - if (!access_ok(VERIFY_WRITE, buf, len)) + if (!access_ok(buf, len)) return -EFAULT; error = wait_event_interruptible(log_wait, syslog_seq != log_next_seq); @@ -1484,7 +1484,7 @@ int do_syslog(int type, char __user *buf, int len, int source) return -EINVAL; if (!len) return 0; - if (!access_ok(VERIFY_WRITE, buf, len)) + if (!access_ok(buf, len)) return -EFAULT; error = syslog_print_all(buf, len, clear); break; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c2cee9db5204..771e93f9c43f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1073,7 +1073,7 @@ int ptrace_request(struct task_struct *child, long request, struct iovec kiov; struct iovec __user *uiov = datavp; - if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) + if (!access_ok(uiov, sizeof(*uiov))) return -EFAULT; if (__get_user(kiov.iov_base, &uiov->iov_base) || @@ -1229,7 +1229,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_uptr_t ptr; compat_size_t len; - if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) + if (!access_ok(uiov, sizeof(*uiov))) return -EFAULT; if (__get_user(ptr, &uiov->iov_base) || diff --git a/kernel/rseq.c b/kernel/rseq.c index c6242d8594dc..25e9a7b60eba 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -267,7 +267,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(t->flags & PF_EXITING)) return; - if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq)))) + if (unlikely(!access_ok(t->rseq, sizeof(*t->rseq)))) goto error; ret = rseq_ip_fixup(regs); if (unlikely(ret < 0)) @@ -295,7 +295,7 @@ void rseq_syscall(struct pt_regs *regs) if (!t->rseq) return; - if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) || + if (!access_ok(t->rseq, sizeof(*t->rseq)) || rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) force_sig(SIGSEGV, t); } @@ -351,7 +351,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || rseq_len != sizeof(*rseq)) return -EINVAL; - if (!access_ok(VERIFY_WRITE, rseq, rseq_len)) + if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; current->rseq_len = rseq_len; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f66920173370..1f3e19fd6dc6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4450,7 +4450,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a u32 size; int ret; - if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) + if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0)) return -EFAULT; /* Zero the full structure, so that a short copy will be nice: */ @@ -4650,7 +4650,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, { int ret; - if (!access_ok(VERIFY_WRITE, uattr, usize)) + if (!access_ok(uattr, usize)) return -EFAULT; /* diff --git a/kernel/signal.c b/kernel/signal.c index 53e07d97ffe0..e1d7ad8e6ab1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3997,7 +3997,7 @@ SYSCALL_DEFINE3(sigaction, int, sig, if (act) { old_sigset_t mask; - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + if (!access_ok(act, sizeof(*act)) || __get_user(new_ka.sa.sa_handler, &act->sa_handler) || __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || @@ -4012,7 +4012,7 @@ SYSCALL_DEFINE3(sigaction, int, sig, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + if (!access_ok(oact, sizeof(*oact)) || __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || @@ -4034,7 +4034,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, compat_uptr_t handler, restorer; if (act) { - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + if (!access_ok(act, sizeof(*act)) || __get_user(handler, &act->sa_handler) || __get_user(restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || @@ -4052,7 +4052,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + if (!access_ok(oact, sizeof(*oact)) || __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || __put_user(ptr_to_compat(old_ka.sa.sa_restorer), diff --git a/kernel/sys.c b/kernel/sys.c index 64b5a230f38d..a48cbf1414b8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2627,7 +2627,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) s.freehigh >>= bitcount; } - if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || + if (!access_ok(info, sizeof(struct compat_sysinfo)) || __put_user(s.uptime, &info->uptime) || __put_user(s.loads[0], &info->loads[0]) || __put_user(s.loads[1], &info->loads[1]) || diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9ddb6fddb4e0..8b068adb9da1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -170,7 +170,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, return -EPERM; if (unlikely(uaccess_kernel())) return -EPERM; - if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) + if (!access_ok(unsafe_ptr, size)) return -EPERM; return probe_kernel_write(unsafe_ptr, src, size); diff --git a/lib/bitmap.c b/lib/bitmap.c index eead55aa7170..98872e9025da 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -443,7 +443,7 @@ int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { - if (!access_ok(VERIFY_READ, ubuf, ulen)) + if (!access_ok(ubuf, ulen)) return -EFAULT; return __bitmap_parse((const char __force *)ubuf, ulen, 1, maskp, nmaskbits); @@ -641,7 +641,7 @@ int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { - if (!access_ok(VERIFY_READ, ubuf, ulen)) + if (!access_ok(ubuf, ulen)) return -EFAULT; return __bitmap_parselist((const char __force *)ubuf, ulen, 1, maskp, nmaskbits); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 1928009f506e..c93870987b58 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -136,7 +136,7 @@ static int copyout(void __user *to, const void *from, size_t n) { - if (access_ok(VERIFY_WRITE, to, n)) { + if (access_ok(to, n)) { kasan_check_read(from, n); n = raw_copy_to_user(to, from, n); } @@ -145,7 +145,7 @@ static int copyout(void __user *to, const void *from, size_t n) static int copyin(void *to, const void __user *from, size_t n) { - if (access_ok(VERIFY_READ, from, n)) { + if (access_ok(from, n)) { kasan_check_write(to, n); n = raw_copy_from_user(to, from, n); } @@ -614,7 +614,7 @@ EXPORT_SYMBOL(_copy_to_iter); #ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE static int copyout_mcsafe(void __user *to, const void *from, size_t n) { - if (access_ok(VERIFY_WRITE, to, n)) { + if (access_ok(to, n)) { kasan_check_read(from, n); n = copy_to_user_mcsafe((__force void *) to, from, n); } @@ -1663,7 +1663,7 @@ int import_single_range(int rw, void __user *buf, size_t len, { if (len > MAX_RW_COUNT) len = MAX_RW_COUNT; - if (unlikely(!access_ok(!rw, buf, len))) + if (unlikely(!access_ok(buf, len))) return -EFAULT; iov->iov_base = buf; diff --git a/lib/usercopy.c b/lib/usercopy.c index 3744b2a8e591..c2bfbcaeb3dc 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -8,7 +8,7 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n { unsigned long res = n; might_fault(); - if (likely(access_ok(VERIFY_READ, from, n))) { + if (likely(access_ok(from, n))) { kasan_check_write(to, n); res = raw_copy_from_user(to, from, n); } @@ -23,7 +23,7 @@ EXPORT_SYMBOL(_copy_from_user); unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); - if (likely(access_ok(VERIFY_WRITE, to, n))) { + if (likely(access_ok(to, n))) { kasan_check_read(from, n); n = raw_copy_to_user(to, from, n); } diff --git a/mm/gup.c b/mm/gup.c index 8cb68a50dbdf..6f591ccb8eca 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1813,8 +1813,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) + if (unlikely(!access_ok((void __user *)start, len))) return 0; /* @@ -1868,8 +1867,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, if (nr_pages <= 0) return 0; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) + if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; if (gup_fast_permitted(start, nr_pages, write)) { diff --git a/mm/mincore.c b/mm/mincore.c index 4985965aa20a..218099b5ed31 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -233,14 +233,14 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, return -EINVAL; /* ..and we need to be passed a valid user-space range */ - if (!access_ok(VERIFY_READ, (void __user *) start, len)) + if (!access_ok((void __user *) start, len)) return -ENOMEM; /* This also avoids any overflows on PAGE_ALIGN */ pages = len >> PAGE_SHIFT; pages += (offset_in_page(len)) != 0; - if (!access_ok(VERIFY_WRITE, vec, pages)) + if (!access_ok(vec, pages)) return -EFAULT; tmp = (void *) __get_free_page(GFP_USER); diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index d70f363c52ae..6d5859714f52 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -147,7 +147,7 @@ static ssize_t batadv_socket_read(struct file *file, char __user *buf, if (!buf || count < sizeof(struct batadv_icmp_packet)) return -EINVAL; - if (!access_ok(VERIFY_WRITE, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; error = wait_event_interruptible(socket_client->queue_wait, diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 02e55b78132f..75f602e1ce94 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -136,7 +136,7 @@ static ssize_t batadv_log_read(struct file *file, char __user *buf, if (count == 0) return 0; - if (!access_ok(VERIFY_WRITE, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; error = wait_event_interruptible(debug_log->queue_wait, diff --git a/net/compat.c b/net/compat.c index c3a2f868e8af..959d1c51826d 100644 --- a/net/compat.c +++ b/net/compat.c @@ -358,7 +358,7 @@ static int do_set_sock_timeout(struct socket *sock, int level, if (optlen < sizeof(*up)) return -EINVAL; - if (!access_ok(VERIFY_READ, up, sizeof(*up)) || + if (!access_ok(up, sizeof(*up)) || __get_user(ktime.tv_sec, &up->tv_sec) || __get_user(ktime.tv_usec, &up->tv_usec)) return -EFAULT; @@ -438,7 +438,7 @@ static int do_get_sock_timeout(struct socket *sock, int level, int optname, if (!err) { if (put_user(sizeof(*up), optlen) || - !access_ok(VERIFY_WRITE, up, sizeof(*up)) || + !access_ok(up, sizeof(*up)) || __put_user(ktime.tv_sec, &up->tv_sec) || __put_user(ktime.tv_usec, &up->tv_usec)) err = -EFAULT; @@ -590,8 +590,8 @@ int compat_mc_setsockopt(struct sock *sock, int level, int optname, compat_alloc_user_space(sizeof(struct group_req)); u32 interface; - if (!access_ok(VERIFY_READ, gr32, sizeof(*gr32)) || - !access_ok(VERIFY_WRITE, kgr, sizeof(struct group_req)) || + if (!access_ok(gr32, sizeof(*gr32)) || + !access_ok(kgr, sizeof(struct group_req)) || __get_user(interface, &gr32->gr_interface) || __put_user(interface, &kgr->gr_interface) || copy_in_user(&kgr->gr_group, &gr32->gr_group, @@ -611,8 +611,8 @@ int compat_mc_setsockopt(struct sock *sock, int level, int optname, sizeof(struct group_source_req)); u32 interface; - if (!access_ok(VERIFY_READ, gsr32, sizeof(*gsr32)) || - !access_ok(VERIFY_WRITE, kgsr, + if (!access_ok(gsr32, sizeof(*gsr32)) || + !access_ok(kgsr, sizeof(struct group_source_req)) || __get_user(interface, &gsr32->gsr_interface) || __put_user(interface, &kgsr->gsr_interface) || @@ -631,7 +631,7 @@ int compat_mc_setsockopt(struct sock *sock, int level, int optname, struct group_filter __user *kgf; u32 interface, fmode, numsrc; - if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) || + if (!access_ok(gf32, __COMPAT_GF0_SIZE) || __get_user(interface, &gf32->gf_interface) || __get_user(fmode, &gf32->gf_fmode) || __get_user(numsrc, &gf32->gf_numsrc)) @@ -641,7 +641,7 @@ int compat_mc_setsockopt(struct sock *sock, int level, int optname, if (koptlen < GROUP_FILTER_SIZE(numsrc)) return -EINVAL; kgf = compat_alloc_user_space(koptlen); - if (!access_ok(VERIFY_WRITE, kgf, koptlen) || + if (!access_ok(kgf, koptlen) || __put_user(interface, &kgf->gf_interface) || __put_user(fmode, &kgf->gf_fmode) || __put_user(numsrc, &kgf->gf_numsrc) || @@ -675,7 +675,7 @@ int compat_mc_getsockopt(struct sock *sock, int level, int optname, return getsockopt(sock, level, optname, optval, optlen); koptlen = compat_alloc_user_space(sizeof(*koptlen)); - if (!access_ok(VERIFY_READ, optlen, sizeof(*optlen)) || + if (!access_ok(optlen, sizeof(*optlen)) || __get_user(ulen, optlen)) return -EFAULT; @@ -685,14 +685,14 @@ int compat_mc_getsockopt(struct sock *sock, int level, int optname, if (klen < GROUP_FILTER_SIZE(0)) return -EINVAL; - if (!access_ok(VERIFY_WRITE, koptlen, sizeof(*koptlen)) || + if (!access_ok(koptlen, sizeof(*koptlen)) || __put_user(klen, koptlen)) return -EFAULT; /* have to allow space for previous compat_alloc_user_space, too */ kgf = compat_alloc_user_space(klen+sizeof(*optlen)); - if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) || + if (!access_ok(gf32, __COMPAT_GF0_SIZE) || __get_user(interface, &gf32->gf_interface) || __get_user(fmode, &gf32->gf_fmode) || __get_user(numsrc, &gf32->gf_numsrc) || @@ -706,18 +706,18 @@ int compat_mc_getsockopt(struct sock *sock, int level, int optname, if (err) return err; - if (!access_ok(VERIFY_READ, koptlen, sizeof(*koptlen)) || + if (!access_ok(koptlen, sizeof(*koptlen)) || __get_user(klen, koptlen)) return -EFAULT; ulen = klen - (sizeof(*kgf)-sizeof(*gf32)); - if (!access_ok(VERIFY_WRITE, optlen, sizeof(*optlen)) || + if (!access_ok(optlen, sizeof(*optlen)) || __put_user(ulen, optlen)) return -EFAULT; - if (!access_ok(VERIFY_READ, kgf, klen) || - !access_ok(VERIFY_WRITE, gf32, ulen) || + if (!access_ok(kgf, klen) || + !access_ok(gf32, ulen) || __get_user(interface, &kgf->gf_interface) || __get_user(fmode, &kgf->gf_fmode) || __get_user(numsrc, &kgf->gf_numsrc) || diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 8c3936403fea..0bea8ff8b0d3 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -89,7 +89,7 @@ proc_dodebug(struct ctl_table *table, int write, left = *lenp; if (write) { - if (!access_ok(VERIFY_READ, buffer, left)) + if (!access_ok(buffer, left)) return -EFAULT; p = buffer; while (left && __get_user(c, p) >= 0 && isspace(c)) diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index 9b38f94b5dd0..c598aa00d5e3 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -2591,7 +2591,7 @@ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head, int idx; if (!head->write) return -ENOSYS; - if (!access_ok(VERIFY_READ, buffer, buffer_len)) + if (!access_ok(buffer, buffer_len)) return -EFAULT; if (mutex_lock_interruptible(&head->io_sem)) return -EINTR; diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 92e6524a3a9d..7d4640d1fe9f 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -393,7 +393,7 @@ static ssize_t snd_seq_read(struct file *file, char __user *buf, size_t count, if (!(snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_INPUT)) return -ENXIO; - if (!access_ok(VERIFY_WRITE, buf, count)) + if (!access_ok(buf, count)) return -EFAULT; /* check client structures are in place */ diff --git a/sound/isa/sb/emu8000_patch.c b/sound/isa/sb/emu8000_patch.c index d45a6b9d6437..3d44c358c4b3 100644 --- a/sound/isa/sb/emu8000_patch.c +++ b/sound/isa/sb/emu8000_patch.c @@ -183,10 +183,10 @@ snd_emu8000_sample_new(struct snd_emux *rec, struct snd_sf_sample *sp, } if (sp->v.mode_flags & SNDRV_SFNT_SAMPLE_8BITS) { - if (!access_ok(VERIFY_READ, data, sp->v.size)) + if (!access_ok(data, sp->v.size)) return -EFAULT; } else { - if (!access_ok(VERIFY_READ, data, sp->v.size * 2)) + if (!access_ok(data, sp->v.size * 2)) return -EFAULT; } diff --git a/tools/perf/util/include/asm/uaccess.h b/tools/perf/util/include/asm/uaccess.h index 6a6f4b990547..548100315710 100644 --- a/tools/perf/util/include/asm/uaccess.h +++ b/tools/perf/util/include/asm/uaccess.h @@ -10,6 +10,6 @@ #define get_user __get_user -#define access_ok(type, addr, size) 1 +#define access_ok(addr, size) 1 #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 666d0155662d..1f888a103f78 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -939,8 +939,7 @@ int __kvm_set_memory_region(struct kvm *kvm, /* We can read the guest memory with __xxx_user() later on. */ if ((id < KVM_USER_MEM_SLOTS) && ((mem->userspace_addr & (PAGE_SIZE - 1)) || - !access_ok(VERIFY_WRITE, - (void __user *)(unsigned long)mem->userspace_addr, + !access_ok((void __user *)(unsigned long)mem->userspace_addr, mem->memory_size))) goto out; if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) -- cgit From 2e05ea5cdc1ac55d9ef678ed5ea6c38acf7fd2a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Dec 2018 08:50:35 +0100 Subject: dma-mapping: implement dma_map_single_attrs using dma_map_page_attrs And also switch the way we implement the unmap side around to stay consistent. This ensures dma-debug works again because it records which function we used for mapping to ensure it is also used for unmapping, and also reduces further code duplication. Last but not least this also officially allows calling dma_sync_single_* for mappings created using dma_map_page, which is perfectly fine given that the sync calls only take a dma_addr_t, but not a virtual address or struct page. Fixes: 7f0fee242e ("dma-mapping: merge dma_unmap_page_attrs and dma_unmap_single_attrs") Signed-off-by: Christoph Hellwig Tested-by: LABBE Corentin --- include/linux/dma-debug.h | 11 +++----- include/linux/dma-mapping.h | 66 +++++++++++++++++---------------------------- kernel/dma/debug.c | 17 +++--------- 3 files changed, 32 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 2ad5c363d7d5..cb422cbe587d 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -35,13 +35,12 @@ extern void debug_dma_map_single(struct device *dev, const void *addr, extern void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, - int direction, dma_addr_t dma_addr, - bool map_single); + int direction, dma_addr_t dma_addr); extern void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr); extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, int direction, bool map_single); + size_t size, int direction); extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int mapped_ents, int direction); @@ -95,8 +94,7 @@ static inline void debug_dma_map_single(struct device *dev, const void *addr, static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, - int direction, dma_addr_t dma_addr, - bool map_single) + int direction, dma_addr_t dma_addr) { } @@ -106,8 +104,7 @@ static inline void debug_dma_mapping_error(struct device *dev, } static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, int direction, - bool map_single) + size_t size, int direction) { } diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index ba521d5506c9..0452a8be2789 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -284,32 +284,25 @@ static inline void dma_direct_sync_sg_for_cpu(struct device *dev, } #endif -static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, - size_t size, - enum dma_data_direction dir, - unsigned long attrs) +static inline dma_addr_t dma_map_page_attrs(struct device *dev, + struct page *page, size_t offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); - debug_dma_map_single(dev, ptr, size); if (dma_is_direct(ops)) - addr = dma_direct_map_page(dev, virt_to_page(ptr), - offset_in_page(ptr), size, dir, attrs); + addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else - addr = ops->map_page(dev, virt_to_page(ptr), - offset_in_page(ptr), size, dir, attrs); - debug_dma_map_page(dev, virt_to_page(ptr), - offset_in_page(ptr), size, - dir, addr, true); + addr = ops->map_page(dev, page, offset, size, dir, attrs); + debug_dma_map_page(dev, page, offset, size, dir, addr); + return addr; } -static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, - size_t size, - enum dma_data_direction dir, - unsigned long attrs) +static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -318,13 +311,7 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, dma_direct_unmap_page(dev, addr, size, dir, attrs); else if (ops->unmap_page) ops->unmap_page(dev, addr, size, dir, attrs); - debug_dma_unmap_page(dev, addr, size, dir, true); -} - -static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - return dma_unmap_single_attrs(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir); } /* @@ -363,25 +350,6 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg ops->unmap_sg(dev, sg, nents, dir, attrs); } -static inline dma_addr_t dma_map_page_attrs(struct device *dev, - struct page *page, - size_t offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr; - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); - else - addr = ops->map_page(dev, page, offset, size, dir, attrs); - debug_dma_map_page(dev, page, offset, size, dir, addr, false); - - return addr; -} - static inline dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, @@ -488,6 +456,20 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, } +static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + debug_dma_map_single(dev, ptr, size); + return dma_map_page_attrs(dev, virt_to_page(ptr), offset_in_page(ptr), + size, dir, attrs); +} + +static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + return dma_unmap_page_attrs(dev, addr, size, dir, attrs); +} + #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 164706da2a73..1e0157113d15 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -49,7 +49,6 @@ enum { dma_debug_single, - dma_debug_page, dma_debug_sg, dma_debug_coherent, dma_debug_resource, @@ -1300,8 +1299,7 @@ void debug_dma_map_single(struct device *dev, const void *addr, EXPORT_SYMBOL(debug_dma_map_single); void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, - size_t size, int direction, dma_addr_t dma_addr, - bool map_single) + size_t size, int direction, dma_addr_t dma_addr) { struct dma_debug_entry *entry; @@ -1316,7 +1314,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, return; entry->dev = dev; - entry->type = dma_debug_page; + entry->type = dma_debug_single; entry->pfn = page_to_pfn(page); entry->offset = offset, entry->dev_addr = dma_addr; @@ -1324,9 +1322,6 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, entry->direction = direction; entry->map_err_type = MAP_ERR_NOT_CHECKED; - if (map_single) - entry->type = dma_debug_single; - check_for_stack(dev, page, offset); if (!PageHighMem(page)) { @@ -1378,10 +1373,10 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) EXPORT_SYMBOL(debug_dma_mapping_error); void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, int direction, bool map_single) + size_t size, int direction) { struct dma_debug_entry ref = { - .type = dma_debug_page, + .type = dma_debug_single, .dev = dev, .dev_addr = addr, .size = size, @@ -1390,10 +1385,6 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, if (unlikely(dma_debug_disabled())) return; - - if (map_single) - ref.type = dma_debug_single; - check_unmap(&ref); } EXPORT_SYMBOL(debug_dma_unmap_page); -- cgit From d7076f07840851bbe57cb21ba052d6a4a9b1efa9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Dec 2018 17:44:19 +0100 Subject: dma-mapping: implement dmam_alloc_coherent using dmam_alloc_attrs dmam_alloc_coherent is just the default no-flags case of dmam_alloc_attrs, so take advantage of this similar to the non-managed version. Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 20 +++++++++++++------- kernel/dma/mapping.c | 39 --------------------------------------- 2 files changed, 13 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 0452a8be2789..fa2ebe8ad4d0 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -677,21 +677,20 @@ dma_mark_declared_memory_occupied(struct device *dev, * Managed DMA API */ #ifdef CONFIG_HAS_DMA -extern void *dmam_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp); +extern void *dmam_alloc_attrs(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + unsigned long attrs); extern void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); #else /* !CONFIG_HAS_DMA */ -static inline void *dmam_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) +static inline void *dmam_alloc_attrs(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + unsigned long attrs) { return NULL; } static inline void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) { } #endif /* !CONFIG_HAS_DMA */ -extern void *dmam_alloc_attrs(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, - unsigned long attrs); #ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT extern int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, @@ -711,6 +710,13 @@ static inline void dmam_release_declared_memory(struct device *dev) } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ +static inline void *dmam_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + return dmam_alloc_attrs(dev, size, dma_handle, gfp, + (gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0); +} + static inline void *dma_alloc_wc(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) { diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index d7c34d2d1ba5..f00544cda4e9 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -45,45 +45,6 @@ static int dmam_match(struct device *dev, void *res, void *match_data) return 0; } -/** - * dmam_alloc_coherent - Managed dma_alloc_coherent() - * @dev: Device to allocate coherent memory for - * @size: Size of allocation - * @dma_handle: Out argument for allocated DMA handle - * @gfp: Allocation flags - * - * Managed dma_alloc_coherent(). Memory allocated using this function - * will be automatically released on driver detach. - * - * RETURNS: - * Pointer to allocated memory on success, NULL on failure. - */ -void *dmam_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) -{ - struct dma_devres *dr; - void *vaddr; - - dr = devres_alloc(dmam_release, sizeof(*dr), gfp); - if (!dr) - return NULL; - - vaddr = dma_alloc_coherent(dev, size, dma_handle, gfp); - if (!vaddr) { - devres_free(dr); - return NULL; - } - - dr->vaddr = vaddr; - dr->dma_handle = *dma_handle; - dr->size = size; - - devres_add(dev, dr); - - return vaddr; -} -EXPORT_SYMBOL(dmam_alloc_coherent); - /** * dmam_free_coherent - Managed dma_free_coherent() * @dev: Device to free coherent memory for -- cgit From 4788ba5792cc1368ba4867e1488dc168b4fe97b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Dec 2018 07:51:44 +0100 Subject: dma-mapping: remove dmam_{declare,release}_coherent_memory These functions have never been used. Signed-off-by: Christoph Hellwig --- Documentation/driver-model/devres.txt | 1 - include/linux/dma-mapping.h | 19 ------------ kernel/dma/mapping.c | 55 ----------------------------------- 3 files changed, 75 deletions(-) (limited to 'kernel') diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index 841c99529d27..b277cafce71e 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -250,7 +250,6 @@ DMA dmaenginem_async_device_register() dmam_alloc_coherent() dmam_alloc_attrs() - dmam_declare_coherent_memory() dmam_free_coherent() dmam_pool_create() dmam_pool_destroy() diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index fa2ebe8ad4d0..937c2a949fca 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -691,25 +691,6 @@ static inline void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) { } #endif /* !CONFIG_HAS_DMA */ -#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT -extern int dmam_declare_coherent_memory(struct device *dev, - phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, - int flags); -extern void dmam_release_declared_memory(struct device *dev); -#else /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ -static inline int dmam_declare_coherent_memory(struct device *dev, - phys_addr_t phys_addr, dma_addr_t device_addr, - size_t size, gfp_t gfp) -{ - return 0; -} - -static inline void dmam_release_declared_memory(struct device *dev) -{ -} -#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ - static inline void *dmam_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index f00544cda4e9..a11006b6d8e8 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -105,61 +105,6 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dmam_alloc_attrs); -#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT - -static void dmam_coherent_decl_release(struct device *dev, void *res) -{ - dma_release_declared_memory(dev); -} - -/** - * dmam_declare_coherent_memory - Managed dma_declare_coherent_memory() - * @dev: Device to declare coherent memory for - * @phys_addr: Physical address of coherent memory to be declared - * @device_addr: Device address of coherent memory to be declared - * @size: Size of coherent memory to be declared - * @flags: Flags - * - * Managed dma_declare_coherent_memory(). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void *res; - int rc; - - res = devres_alloc(dmam_coherent_decl_release, 0, GFP_KERNEL); - if (!res) - return -ENOMEM; - - rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size, - flags); - if (!rc) - devres_add(dev, res); - else - devres_free(res); - - return rc; -} -EXPORT_SYMBOL(dmam_declare_coherent_memory); - -/** - * dmam_release_declared_memory - Managed dma_release_declared_memory(). - * @dev: Device to release declared coherent memory for - * - * Managed dmam_release_declared_memory(). - */ -void dmam_release_declared_memory(struct device *dev) -{ - WARN_ON(devres_destroy(dev, dmam_coherent_decl_release, NULL, NULL)); -} -EXPORT_SYMBOL(dmam_release_declared_memory); - -#endif - /* * Create scatter-list for the already allocated DMA buffer. */ -- cgit From 48e638fb68be8fecdca0611beff53a9c947704e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Jan 2019 17:14:39 +0100 Subject: dma-mapping: remove a few unused exports Now that the slow path DMA API calls are implemented out of line a few helpers only used by them don't need to be exported anymore. Signed-off-by: Christoph Hellwig --- kernel/dma/coherent.c | 2 -- kernel/dma/debug.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 597d40893862..66f0fb7e9a3a 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -223,7 +223,6 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, */ return mem->flags & DMA_MEMORY_EXCLUSIVE; } -EXPORT_SYMBOL(dma_alloc_from_dev_coherent); void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) { @@ -268,7 +267,6 @@ int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr) return __dma_release_from_coherent(mem, order, vaddr); } -EXPORT_SYMBOL(dma_release_from_dev_coherent); int dma_release_from_global_coherent(int order, void *vaddr) { diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 1e0157113d15..23cf5361bcf1 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1512,7 +1512,6 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, add_dma_entry(entry); } -EXPORT_SYMBOL(debug_dma_alloc_coherent); void debug_dma_free_coherent(struct device *dev, size_t size, void *virt, dma_addr_t addr) @@ -1540,7 +1539,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size, check_unmap(&ref); } -EXPORT_SYMBOL(debug_dma_free_coherent); void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, int direction, dma_addr_t dma_addr) -- cgit From 594cc251fdd0d231d342d88b2fdff4bc42fb0690 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 4 Jan 2019 12:56:09 -0800 Subject: make 'user_access_begin()' do 'access_ok()' Originally, the rule used to be that you'd have to do access_ok() separately, and then user_access_begin() before actually doing the direct (optimized) user access. But experience has shown that people then decide not to do access_ok() at all, and instead rely on it being implied by other operations or similar. Which makes it very hard to verify that the access has actually been range-checked. If you use the unsafe direct user accesses, hardware features (either SMAP - Supervisor Mode Access Protection - on x86, or PAN - Privileged Access Never - on ARM) do force you to use user_access_begin(). But nothing really forces the range check. By putting the range check into user_access_begin(), we actually force people to do the right thing (tm), and the range check vill be visible near the actual accesses. We have way too long a history of people trying to avoid them. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 9 ++++++++- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 15 +++++++++++++-- include/linux/uaccess.h | 2 +- kernel/compat.c | 6 ++---- kernel/exit.c | 6 ++---- lib/strncpy_from_user.c | 9 +++++---- lib/strnlen_user.c | 9 +++++---- 7 files changed, 36 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 3920f456db79..a87ab5290ab4 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -705,7 +705,14 @@ extern struct movsl_mask { * checking before using them, but you have to surround them with the * user_access_begin/end() pair. */ -#define user_access_begin() __uaccess_begin() +static __must_check inline bool user_access_begin(const void __user *ptr, size_t len) +{ + if (unlikely(!access_ok(ptr,len))) + return 0; + __uaccess_begin(); + return 1; +} +#define user_access_begin(a,b) user_access_begin(a,b) #define user_access_end() __uaccess_end() #define unsafe_put_user(x, ptr, err_label) \ diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 55d8f9b8777f..485b259127c3 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1624,7 +1624,9 @@ end_user: * happened we would make the mistake of assuming that the * relocations were valid. */ - user_access_begin(); + if (!user_access_begin(urelocs, size)) + goto end_user; + for (copied = 0; copied < nreloc; copied++) unsafe_put_user(-1, &urelocs[copied].presumed_offset, @@ -2606,7 +2608,16 @@ i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data, unsigned int i; /* Copy the new buffer offsets back to the user's exec list. */ - user_access_begin(); + /* + * Note: count * sizeof(*user_exec_list) does not overflow, + * because we checked 'count' in check_buffer_count(). + * + * And this range already got effectively checked earlier + * when we did the "copy_from_user()" above. + */ + if (!user_access_begin(user_exec_list, count * sizeof(*user_exec_list))) + goto end_user; + for (i = 0; i < args->buffer_count; i++) { if (!(exec2_list[i].offset & UPDATE)) continue; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index bf2523867a02..37b226e8df13 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -264,7 +264,7 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); probe_kernel_read(&retval, addr, sizeof(retval)) #ifndef user_access_begin -#define user_access_begin() do { } while (0) +#define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) #define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) diff --git a/kernel/compat.c b/kernel/compat.c index 705d4ae6c018..f01affa17e22 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -354,10 +354,9 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!access_ok(umask, bitmap_size / 8)) + if (!user_access_begin(umask, bitmap_size / 8)) return -EFAULT; - user_access_begin(); while (nr_compat_longs > 1) { compat_ulong_t l1, l2; unsafe_get_user(l1, umask++, Efault); @@ -384,10 +383,9 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!access_ok(umask, bitmap_size / 8)) + if (!user_access_begin(umask, bitmap_size / 8)) return -EFAULT; - user_access_begin(); while (nr_compat_longs > 1) { unsigned long m = *mask++; unsafe_put_user((compat_ulong_t)m, umask++, Efault); diff --git a/kernel/exit.c b/kernel/exit.c index 8a01b671dc1f..2d14979577ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1604,10 +1604,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (!access_ok(infop, sizeof(*infop))) + if (!user_access_begin(infop, sizeof(*infop))) return -EFAULT; - user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); @@ -1732,10 +1731,9 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; - if (!access_ok(infop, sizeof(*infop))) + if (!user_access_begin(infop, sizeof(*infop))) return -EFAULT; - user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index b53e1b5d80f4..58eacd41526c 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -114,10 +114,11 @@ long strncpy_from_user(char *dst, const char __user *src, long count) kasan_check_write(dst, count); check_object_size(dst, count, false); - user_access_begin(); - retval = do_strncpy_from_user(dst, src, count, max); - user_access_end(); - return retval; + if (user_access_begin(src, max)) { + retval = do_strncpy_from_user(dst, src, count, max); + user_access_end(); + return retval; + } } return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 60d0bbda8f5e..1c1a1b0e38a5 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -114,10 +114,11 @@ long strnlen_user(const char __user *str, long count) unsigned long max = max_addr - src_addr; long retval; - user_access_begin(); - retval = do_strnlen_user(str, count, max); - user_access_end(); - return retval; + if (user_access_begin(str, max)) { + retval = do_strnlen_user(str, count, max); + user_access_end(); + return retval; + } } return 0; } -- cgit From 09be178400829dddc1189b50a7888495dd26aa84 Mon Sep 17 00:00:00 2001 From: Cheng Lin Date: Thu, 3 Jan 2019 15:26:13 -0800 Subject: proc/sysctl: fix return error for proc_doulongvec_minmax() If the number of input parameters is less than the total parameters, an EINVAL error will be returned. For example, we use proc_doulongvec_minmax to pass up to two parameters with kern_table: { .procname = "monitor_signals", .data = &monitor_sigs, .maxlen = 2*sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, Reproduce: When passing two parameters, it's work normal. But passing only one parameter, an error "Invalid argument"(EINVAL) is returned. [root@cl150 ~]# echo 1 2 > /proc/sys/kernel/monitor_signals [root@cl150 ~]# cat /proc/sys/kernel/monitor_signals 1 2 [root@cl150 ~]# echo 3 > /proc/sys/kernel/monitor_signals -bash: echo: write error: Invalid argument [root@cl150 ~]# echo $? 1 [root@cl150 ~]# cat /proc/sys/kernel/monitor_signals 3 2 [root@cl150 ~]# The following is the result after apply this patch. No error is returned when the number of input parameters is less than the total parameters. [root@cl150 ~]# echo 1 2 > /proc/sys/kernel/monitor_signals [root@cl150 ~]# cat /proc/sys/kernel/monitor_signals 1 2 [root@cl150 ~]# echo 3 > /proc/sys/kernel/monitor_signals [root@cl150 ~]# echo $? 0 [root@cl150 ~]# cat /proc/sys/kernel/monitor_signals 3 2 [root@cl150 ~]# There are three processing functions dealing with digital parameters, __do_proc_dointvec/__do_proc_douintvec/__do_proc_doulongvec_minmax. This patch deals with __do_proc_doulongvec_minmax, just as __do_proc_dointvec does, adding a check for parameters 'left'. In __do_proc_douintvec, its code implementation explicitly does not support multiple inputs. static int __do_proc_douintvec(...){ ... /* * Arrays are not supported, keep this simple. *Do not* add * support for them. */ if (vleft != 1) { *lenp = 0; return -EINVAL; } ... } So, just __do_proc_doulongvec_minmax has the problem. And most use of proc_doulongvec_minmax/proc_doulongvec_ms_jiffies_minmax just have one parameter. Link: http://lkml.kernel.org/r/1544081775-15720-1-git-send-email-cheng.lin130@zte.com.cn Signed-off-by: Cheng Lin Acked-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1825f712e73b..7f6c1a3b3485 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2787,6 +2787,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int bool neg; left -= proc_skip_spaces(&p); + if (!left) + break; err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, -- cgit From 168e06f7937d96c7222037d8a05565e8a6eb00fe Mon Sep 17 00:00:00 2001 From: "Liu, Chuansheng" Date: Thu, 3 Jan 2019 15:26:27 -0800 Subject: kernel/hung_task.c: force console verbose before panic Based on commit 401c636a0eeb ("kernel/hung_task.c: show all hung tasks before panic"), we could get the call stack of hung task. However, if the console loglevel is not high, we still can not see the useful panic information in practice, and in most cases users don't set console loglevel to high level. This patch is to force console verbose before system panic, so that the real useful information can be seen in the console, instead of being like the following, which doesn't have hung task information. INFO: task init:1 blocked for more than 120 seconds. Tainted: G U W 4.19.0-quilt-2e5dc0ac-g51b6c21d76cc #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. Kernel panic - not syncing: hung_task: blocked tasks CPU: 2 PID: 479 Comm: khungtaskd Tainted: G U W 4.19.0-quilt-2e5dc0ac-g51b6c21d76cc #1 Call Trace: dump_stack+0x4f/0x65 panic+0xde/0x231 watchdog+0x290/0x410 kthread+0x12c/0x150 ret_from_fork+0x35/0x40 reboot: panic mode set: p,w Kernel Offset: 0x34000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) Link: http://lkml.kernel.org/r/27240C0AC20F114CBF8149A2696CBE4A6015B675@SHSMSX101.ccr.corp.intel.com Signed-off-by: Chuansheng Liu Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index cb8e3e8ac7b9..85af0cde7f46 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -112,8 +112,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) trace_sched_process_hang(t); - if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic) - return; + if (sysctl_hung_task_panic) { + console_verbose(); + hung_task_show_lock = true; + hung_task_call_panic = true; + } /* * Ok, the task did not get scheduled for more than 2 minutes, @@ -135,11 +138,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) } touch_nmi_watchdog(); - - if (sysctl_hung_task_panic) { - hung_task_show_lock = true; - hung_task_call_panic = true; - } } /* -- cgit From 304ae42739b108305f8d7b3eb3c1aec7c2b643a9 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 3 Jan 2019 15:26:31 -0800 Subject: kernel/hung_task.c: break RCU locks based on jiffies check_hung_uninterruptible_tasks() is currently calling rcu_lock_break() for every 1024 threads. But check_hung_task() is very slow if printk() was called, and is very fast otherwise. If many threads within some 1024 threads called printk(), the RCU grace period might be extended enough to trigger RCU stall warnings. Therefore, calling rcu_lock_break() for every some fixed jiffies will be safer. Link: http://lkml.kernel.org/r/1544800658-11423-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Paul E. McKenney Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Dmitry Vyukov Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 85af0cde7f46..4a9191617076 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -34,7 +34,7 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * is disabled during the critical section. It also controls the size of * the RCU grace period. So it needs to be upper-bound. */ -#define HUNG_TASK_BATCHING 1024 +#define HUNG_TASK_LOCK_BREAK (HZ / 10) /* * Zero means infinite timeout - no checking done: @@ -171,7 +171,7 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; - int batch_count = HUNG_TASK_BATCHING; + unsigned long last_break = jiffies; struct task_struct *g, *t; /* @@ -186,10 +186,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) for_each_process_thread(g, t) { if (!max_count--) goto unlock; - if (!--batch_count) { - batch_count = HUNG_TASK_BATCHING; + if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { if (!rcu_lock_break(g, t)) goto unlock; + last_break = jiffies; } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) -- cgit From fb5bf31722d0805a3f394f7d59f2e8cd07acccb7 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 3 Jan 2019 15:28:03 -0800 Subject: fork: fix some -Wmissing-prototypes warnings We get a warning when building kernel with W=1: kernel/fork.c:167:13: warning: no previous prototype for `arch_release_thread_stack' [-Wmissing-prototypes] kernel/fork.c:779:13: warning: no previous prototype for `fork_init' [-Wmissing-prototypes] Add the missing declaration in head file to fix this. Also, remove arch_release_thread_stack() completely because no arch seems to implement it since bb9d81264 (arch: remove tile port). Link: http://lkml.kernel.org/r/1542170087-23645-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Yi Wang Acked-by: Michal Hocko Acked-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/task.h | 2 ++ init/main.c | 1 - kernel/fork.c | 5 ----- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 108ede99e533..44c6f15800ff 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -39,6 +39,8 @@ void __noreturn do_task_dead(void); extern void proc_caches_init(void); +extern void fork_init(void); + extern void release_task(struct task_struct * p); #ifdef CONFIG_HAVE_COPY_THREAD_TLS diff --git a/init/main.c b/init/main.c index 6a74ba0892d2..e2e80ca3165a 100644 --- a/init/main.c +++ b/init/main.c @@ -105,7 +105,6 @@ static int kernel_init(void *); extern void init_IRQ(void); -extern void fork_init(void); extern void radix_tree_init(void); /* diff --git a/kernel/fork.c b/kernel/fork.c index d439c48ecf18..a60459947f18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -164,10 +164,6 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_stack(unsigned long *stack) -{ -} - #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* @@ -422,7 +418,6 @@ static void release_task_stack(struct task_struct *tsk) return; /* Better to leak the stack than to free prematurely */ account_kernel_stack(tsk, -1); - arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); tsk->stack = NULL; #ifdef CONFIG_VMAP_STACK -- cgit From d999bd9392dea7c1a9ac43b8680b22c4425ae4c7 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 3 Jan 2019 15:28:17 -0800 Subject: panic: add options to print system info when panic happens Kernel panic issues are always painful to debug, partially because it's not easy to get enough information of the context when panic happens. And we have ramoops and kdump for that, while this commit tries to provide a easier way to show the system info by adding a cmdline parameter, referring some idea from sysrq handler. Link: http://lkml.kernel.org/r/1543398842-19295-2-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Reviewed-by: Kees Cook Acked-by: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 8 +++++++ kernel/panic.c | 28 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 408781ee142c..e7b5c49702bb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3092,6 +3092,14 @@ timeout < 0: reboot immediately Format: + panic_print= Bitmask for printing system info when panic happens. + User can chose combination of the following bits: + bit 0: print all tasks info + bit 1: print system memory info + bit 2: print timer info + bit 3: print locks info if CONFIG_LOCKDEP is on + bit 4: print ftrace buffer + panic_on_warn panic() instead of WARN(). Useful to cause kdump on a WARN(). diff --git a/kernel/panic.c b/kernel/panic.c index d10c340c43b0..855f66738bc7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -46,6 +46,13 @@ int panic_on_warn __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); +#define PANIC_PRINT_TASK_INFO 0x00000001 +#define PANIC_PRINT_MEM_INFO 0x00000002 +#define PANIC_PRINT_TIMER_INFO 0x00000004 +#define PANIC_PRINT_LOCK_INFO 0x00000008 +#define PANIC_PRINT_FTRACE_INFO 0x00000010 +static unsigned long panic_print; + ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); @@ -125,6 +132,24 @@ void nmi_panic(struct pt_regs *regs, const char *msg) } EXPORT_SYMBOL(nmi_panic); +static void panic_print_sys_info(void) +{ + if (panic_print & PANIC_PRINT_TASK_INFO) + show_state(); + + if (panic_print & PANIC_PRINT_MEM_INFO) + show_mem(0, NULL); + + if (panic_print & PANIC_PRINT_TIMER_INFO) + sysrq_timer_list_show(); + + if (panic_print & PANIC_PRINT_LOCK_INFO) + debug_show_all_locks(); + + if (panic_print & PANIC_PRINT_FTRACE_INFO) + ftrace_dump(DUMP_ALL); +} + /** * panic - halt the system * @fmt: The text string to print @@ -254,6 +279,8 @@ void panic(const char *fmt, ...) debug_locks_off(); console_flush_on_panic(); + panic_print_sys_info(); + if (!panic_blink) panic_blink = no_blink; @@ -658,6 +685,7 @@ void refcount_error_report(struct pt_regs *regs, const char *err) #endif core_param(panic, panic_timeout, int, 0644); +core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); -- cgit From 81c9d43f94870be66146739c6e61df40dc17bb64 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 3 Jan 2019 15:28:20 -0800 Subject: kernel/sysctl: add panic_print into sysctl So that we can also runtime chose to print out the needed system info for panic, other than setting the kernel cmdline. Link: http://lkml.kernel.org/r/1543398842-19295-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Suggested-by: Steven Rostedt Acked-by: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 17 +++++++++++++++++ include/linux/kernel.h | 1 + include/uapi/linux/sysctl.h | 1 + kernel/panic.c | 2 +- kernel/sysctl.c | 7 +++++++ kernel/sysctl_binary.c | 1 + 6 files changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 1b8775298cf7..c0527d8a468a 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -60,6 +60,7 @@ show up in /proc/sys/kernel: - panic_on_stackoverflow - panic_on_unrecovered_nmi - panic_on_warn +- panic_print - panic_on_rcu_stall - perf_cpu_time_max_percent - perf_event_paranoid @@ -654,6 +655,22 @@ a kernel rebuild when attempting to kdump at the location of a WARN(). ============================================================== +panic_print: + +Bitmask for printing system info when panic happens. User can chose +combination of the following bits: + +bit 0: print all tasks info +bit 1: print system memory info +bit 2: print timer info +bit 3: print locks info if CONFIG_LOCKDEP is on +bit 4: print ftrace buffer + +So for example to print tasks and memory info on panic, user can: + echo 3 > /proc/sys/kernel/panic_print + +============================================================== + panic_on_rcu_stall: When set to 1, calls panic() after RCU stall detection messages. This diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d6aac75b51ba..8f0e68e250a7 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -527,6 +527,7 @@ static inline u32 int_sqrt64(u64 x) extern void bust_spinlocks(int yes); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; +extern unsigned long panic_print; extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index d71013fffaf6..87aa2a6d9125 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -153,6 +153,7 @@ enum KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ KERN_PANIC_ON_WARN=77, /* int: call panic() in WARN() functions */ + KERN_PANIC_PRINT=78, /* ulong: bitmask to print system info on panic */ }; diff --git a/kernel/panic.c b/kernel/panic.c index 855f66738bc7..f121e6ba7e11 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_TIMER_INFO 0x00000004 #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 -static unsigned long panic_print; +unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7f6c1a3b3485..ba4d9e85feb8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -807,6 +807,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "panic_print", + .data = &panic_print, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, #if defined CONFIG_PRINTK { .procname = "printk", diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 07148b497451..73c132095a7b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -140,6 +140,7 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, + { CTL_ULONG, KERN_PANIC_PRINT, "panic_print" }, {} }; -- cgit From 634724431607f6f46c495dfef801a1c8b44a96d9 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Thu, 3 Jan 2019 15:28:24 -0800 Subject: kernel/kcov.c: mark write_comp_data() as notrace Since __sanitizer_cov_trace_const_cmp4 is marked as notrace, the function called from __sanitizer_cov_trace_const_cmp4 shouldn't be traceable either. ftrace_graph_caller() gets called every time func write_comp_data() gets called if it isn't marked 'notrace'. This is the backtrace from gdb: #0 ftrace_graph_caller () at ../arch/arm64/kernel/entry-ftrace.S:179 #1 0xffffff8010201920 in ftrace_caller () at ../arch/arm64/kernel/entry-ftrace.S:151 #2 0xffffff8010439714 in write_comp_data (type=5, arg1=0, arg2=0, ip=18446743524224276596) at ../kernel/kcov.c:116 #3 0xffffff8010439894 in __sanitizer_cov_trace_const_cmp4 (arg1=, arg2=) at ../kernel/kcov.c:188 #4 0xffffff8010201874 in prepare_ftrace_return (self_addr=18446743524226602768, parent=0xffffff801014b918, frame_pointer=18446743524223531344) at ./include/generated/atomic-instrumented.h:27 #5 0xffffff801020194c in ftrace_graph_caller () at ../arch/arm64/kernel/entry-ftrace.S:182 Rework so that write_comp_data() that are called from __sanitizer_cov_trace_*_cmp*() are marked as 'notrace'. Commit 903e8ff86753 ("kernel/kcov.c: mark funcs in __sanitizer_cov_trace_pc() as notrace") missed to mark write_comp_data() as 'notrace'. When that patch was created gcc-7 was used. In lib/Kconfig.debug config KCOV_ENABLE_COMPARISONS depends on $(cc-option,-fsanitize-coverage=trace-cmp) That code path isn't hit with gcc-7. However, it were that with gcc-8. Link: http://lkml.kernel.org/r/20181206143011.23719-1-anders.roxell@linaro.org Signed-off-by: Anders Roxell Signed-off-by: Arnd Bergmann Co-developed-by: Arnd Bergmann Acked-by: Steven Rostedt (VMware) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 97959d7b77e2..c2277dbdbfb1 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -112,7 +112,7 @@ void notrace __sanitizer_cov_trace_pc(void) EXPORT_SYMBOL(__sanitizer_cov_trace_pc); #ifdef CONFIG_KCOV_ENABLE_COMPARISONS -static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) +static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) { struct task_struct *t; u64 *area; -- cgit From 3bb5f4ac55dd91d516e7e36b45c94bd57efbb068 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:28:44 -0800 Subject: kernel/locking/mutex.c: remove caller signal_pending branch predictions This is already done for us internally by the signal machinery. Link: http://lkml.kernel.org/r/20181116002713.8474-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/locking/mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 3f8a35104285..db578783dd36 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -987,7 +987,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * wait_lock. This ensures the lock cancellation is ordered * against mutex_unlock() and wake-ups do not go missing. */ - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { ret = -EINTR; goto err; } -- cgit From 34ec35ad8f5f4624e8391dbb83afb4c791f027e3 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:28:48 -0800 Subject: kernel/sched/: remove caller signal_pending branch predictions This is already done for us internally by the signal machinery. Link: http://lkml.kernel.org/r/20181116002713.8474-3-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 2 +- kernel/sched/swait.c | 2 +- kernel/sched/wait.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f66920173370..17a954c9e153 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3416,7 +3416,7 @@ static void __sched notrace __schedule(bool preempt) switch_count = &prev->nivcsw; if (!preempt && prev->state) { - if (unlikely(signal_pending_state(prev->state, prev))) { + if (signal_pending_state(prev->state, prev)) { prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 66b59ac77c22..e83a3f8449f6 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -93,7 +93,7 @@ long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait long ret = 0; raw_spin_lock_irqsave(&q->lock, flags); - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { /* * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one() * must not see us. diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 5dd47f1103d1..6eb1f8efd221 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -264,7 +264,7 @@ long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_en long ret = 0; spin_lock_irqsave(&wq_head->lock, flags); - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { /* * Exclusive waiter must not fail if it was selected by wakeup, * it should "consume" the condition we were waiting for. -- cgit From 8270f3a11ceef64bdb0ab141180e8d2f17c619ec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Jan 2019 18:31:48 +0100 Subject: dma-direct: fix DMA_ATTR_NO_KERNEL_MAPPING for remapped allocations We need to return a dma_addr_t even if we don't have a kernel mapping. Do so by consolidating the phys_to_dma call in a single place and jump to it from all the branches that return successfully. Fixes: bfd56cd60521 ("dma-mapping: support highmem in the generic remap allocator") Reported-by: Liviu Dudau Tested-by: Liviu Dudau --- kernel/dma/remap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 18cc09fc27b9..7a723194ecbe 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -204,8 +204,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, ret = dma_alloc_from_pool(size, &page, flags); if (!ret) return NULL; - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - return ret; + goto done; } page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); @@ -215,8 +214,10 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) - return page; /* opaque cookie */ + if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + ret = page; /* opaque cookie */ + goto done; + } /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, VM_USERMAP, @@ -227,9 +228,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, return ret; } - *dma_handle = phys_to_dma(dev, page_to_phys(page)); memset(ret, 0, size); - +done: + *dma_handle = phys_to_dma(dev, page_to_phys(page)); return ret; } -- cgit From e9666d10a5677a494260d60d1fa0b73cc7646eb3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 31 Dec 2018 00:14:15 +0900 Subject: jump_label: move 'asm goto' support test to Kconfig Currently, CONFIG_JUMP_LABEL just means "I _want_ to use jump label". The jump label is controlled by HAVE_JUMP_LABEL, which is defined like this: #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) # define HAVE_JUMP_LABEL #endif We can improve this by testing 'asm goto' support in Kconfig, then make JUMP_LABEL depend on CC_HAS_ASM_GOTO. Ugly #ifdef HAVE_JUMP_LABEL will go away, and CONFIG_JUMP_LABEL will match to the real kernel capability. Signed-off-by: Masahiro Yamada Acked-by: Michael Ellerman (powerpc) Tested-by: Sedat Dilek --- Makefile | 7 ------- arch/Kconfig | 1 + arch/arm/kernel/jump_label.c | 4 ---- arch/arm64/kernel/jump_label.c | 4 ---- arch/mips/kernel/jump_label.c | 4 ---- arch/powerpc/include/asm/asm-prototypes.h | 2 +- arch/powerpc/kernel/jump_label.c | 2 -- arch/powerpc/platforms/powernv/opal-tracepoints.c | 2 +- arch/powerpc/platforms/powernv/opal-wrappers.S | 2 +- arch/powerpc/platforms/pseries/hvCall.S | 4 ++-- arch/powerpc/platforms/pseries/lpar.c | 2 +- arch/s390/kernel/Makefile | 3 ++- arch/s390/kernel/jump_label.c | 4 ---- arch/sparc/kernel/Makefile | 2 +- arch/sparc/kernel/jump_label.c | 4 ---- arch/x86/Makefile | 2 +- arch/x86/entry/calling.h | 2 +- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/jump_label.h | 13 ------------- arch/x86/include/asm/rmwcc.h | 6 +++--- arch/x86/kernel/Makefile | 3 ++- arch/x86/kernel/jump_label.c | 4 ---- arch/x86/kvm/emulate.c | 2 +- arch/xtensa/kernel/jump_label.c | 4 ---- include/linux/dynamic_debug.h | 6 +++--- include/linux/jump_label.h | 22 +++++++++------------- include/linux/jump_label_ratelimit.h | 8 +++----- include/linux/module.h | 2 +- include/linux/netfilter.h | 4 ++-- include/linux/netfilter_ingress.h | 2 +- init/Kconfig | 3 +++ kernel/jump_label.c | 10 +++------- kernel/module.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/debug.c | 4 ++-- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 6 +++--- lib/dynamic_debug.c | 2 +- net/core/dev.c | 6 +++--- net/netfilter/core.c | 6 +++--- scripts/gcc-goto.sh | 2 +- tools/arch/x86/include/asm/rmwcc.h | 6 +++--- 42 files changed, 65 insertions(+), 119 deletions(-) (limited to 'kernel') diff --git a/Makefile b/Makefile index 60a473247657..04a857817f77 100644 --- a/Makefile +++ b/Makefile @@ -514,13 +514,6 @@ RETPOLINE_VDSO_CFLAGS := $(call cc-option,$(RETPOLINE_VDSO_CFLAGS_GCC),$(call cc export RETPOLINE_CFLAGS export RETPOLINE_VDSO_CFLAGS -# check for 'asm goto' -ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $(KBUILD_CFLAGS)), y) - CC_HAVE_ASM_GOTO := 1 - KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO - KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO -endif - # The expansion should be delayed until arch/$(SRCARCH)/Makefile is included. # Some architectures define CROSS_COMPILE in arch/$(SRCARCH)/Makefile. # CC_VERSION_TEXT is referenced from Kconfig (so it needs export), diff --git a/arch/Kconfig b/arch/Kconfig index b70c952ac838..4cfb6de48f79 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -71,6 +71,7 @@ config KPROBES config JUMP_LABEL bool "Optimize very unlikely/likely branches" depends on HAVE_ARCH_JUMP_LABEL + depends on CC_HAS_ASM_GOTO help This option enables a transparent branch optimization that makes certain almost-always-true or almost-always-false branch diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c index 90bce3d9928e..303b3ab87f7e 100644 --- a/arch/arm/kernel/jump_label.c +++ b/arch/arm/kernel/jump_label.c @@ -4,8 +4,6 @@ #include #include -#ifdef HAVE_JUMP_LABEL - static void __arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type, bool is_static) @@ -35,5 +33,3 @@ void arch_jump_label_transform_static(struct jump_entry *entry, { __arch_jump_label_transform(entry, type, true); } - -#endif diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index 646b9562ee64..1eff270e8861 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -20,8 +20,6 @@ #include #include -#ifdef HAVE_JUMP_LABEL - void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { @@ -49,5 +47,3 @@ void arch_jump_label_transform_static(struct jump_entry *entry, * NOP needs to be replaced by a branch. */ } - -#endif /* HAVE_JUMP_LABEL */ diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c index 32e3168316cd..ab943927f97a 100644 --- a/arch/mips/kernel/jump_label.c +++ b/arch/mips/kernel/jump_label.c @@ -16,8 +16,6 @@ #include #include -#ifdef HAVE_JUMP_LABEL - /* * Define parameters for the standard MIPS and the microMIPS jump * instruction encoding respectively: @@ -70,5 +68,3 @@ void arch_jump_label_transform(struct jump_entry *e, mutex_unlock(&text_mutex); } - -#endif /* HAVE_JUMP_LABEL */ diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 6f201b199c02..1d911f68a23b 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -38,7 +38,7 @@ extern struct static_key hcall_tracepoint_key; void __trace_hcall_entry(unsigned long opcode, unsigned long *args); void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf); /* OPAL tracing */ -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL extern struct static_key opal_tracepoint_key; #endif diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c index 6472472093d0..0080c5fbd225 100644 --- a/arch/powerpc/kernel/jump_label.c +++ b/arch/powerpc/kernel/jump_label.c @@ -11,7 +11,6 @@ #include #include -#ifdef HAVE_JUMP_LABEL void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { @@ -22,4 +21,3 @@ void arch_jump_label_transform(struct jump_entry *entry, else patch_instruction(addr, PPC_INST_NOP); } -#endif diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c index 1ab7d26c0a2c..f16a43540e30 100644 --- a/arch/powerpc/platforms/powernv/opal-tracepoints.c +++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c @@ -4,7 +4,7 @@ #include #include -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL struct static_key opal_tracepoint_key = STATIC_KEY_INIT; int opal_tracepoint_regfunc(void) diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 251528231a9e..f4875fe3f8ff 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -20,7 +20,7 @@ .section ".text" #ifdef CONFIG_TRACEPOINTS -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL #define OPAL_BRANCH(LABEL) \ ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key) #else diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S index d91412c591ef..50dc9426d0be 100644 --- a/arch/powerpc/platforms/pseries/hvCall.S +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -19,7 +19,7 @@ #ifdef CONFIG_TRACEPOINTS -#ifndef HAVE_JUMP_LABEL +#ifndef CONFIG_JUMP_LABEL .section ".toc","aw" .globl hcall_tracepoint_refcount @@ -79,7 +79,7 @@ hcall_tracepoint_refcount: mr r5,BUFREG; \ __HCALL_INST_POSTCALL -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL #define HCALL_BRANCH(LABEL) \ ARCH_STATIC_BRANCH(LABEL, hcall_tracepoint_key) #else diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 32d4452973e7..f2a9f0adc2d3 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1040,7 +1040,7 @@ EXPORT_SYMBOL(arch_free_page); #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_TRACEPOINTS -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL struct static_key hcall_tracepoint_key = STATIC_KEY_INIT; int hcall_tracepoint_regfunc(void) diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 386b1abb217b..e216e116a9a9 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -48,7 +48,7 @@ CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o early_nobss.o -obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o +obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o @@ -72,6 +72,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_UPROBES) += uprobes.o +obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index 50a1798604a8..3f10b56bd5a3 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -10,8 +10,6 @@ #include #include -#ifdef HAVE_JUMP_LABEL - struct insn { u16 opcode; s32 offset; @@ -103,5 +101,3 @@ void arch_jump_label_transform_static(struct jump_entry *entry, { __jump_label_transform(entry, type, 1); } - -#endif diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index cf8640841b7a..97c0e19263d1 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -118,4 +118,4 @@ pc--$(CONFIG_PERF_EVENTS) := perf_event.o obj-$(CONFIG_SPARC64) += $(pc--y) obj-$(CONFIG_UPROBES) += uprobes.o -obj-$(CONFIG_SPARC64) += jump_label.o +obj-$(CONFIG_JUMP_LABEL) += jump_label.o diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c index 7f8eac51df33..a4cfaeecaf5e 100644 --- a/arch/sparc/kernel/jump_label.c +++ b/arch/sparc/kernel/jump_label.c @@ -9,8 +9,6 @@ #include -#ifdef HAVE_JUMP_LABEL - void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { @@ -47,5 +45,3 @@ void arch_jump_label_transform(struct jump_entry *entry, flushi(insn); mutex_unlock(&text_mutex); } - -#endif diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 16c3145c0a5f..9c5a67d1b9c1 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -289,7 +289,7 @@ vdso_install: archprepare: checkbin checkbin: -ifndef CC_HAVE_ASM_GOTO +ifndef CONFIG_CC_HAS_ASM_GOTO @echo Compiler lacks asm-goto support. @exit 1 endif diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 20d0885b00fb..efb0d1b1f15f 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -351,7 +351,7 @@ For 32-bit we have the following conventions - kernel is built with */ .macro CALL_enter_from_user_mode #ifdef CONFIG_CONTEXT_TRACKING -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0 #endif call enter_from_user_mode diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index aced6c9290d6..ce95b8cbd229 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -140,7 +140,7 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) -#if defined(__clang__) && !defined(CC_HAVE_ASM_GOTO) +#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO) /* * Workaround for the sake of BPF compilation which utilizes kernel diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 21efc9d07ed9..65191ce8e1cf 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -2,19 +2,6 @@ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H -#ifndef HAVE_JUMP_LABEL -/* - * For better or for worse, if jump labels (the gcc extension) are missing, - * then the entire static branch patching infrastructure is compiled out. - * If that happens, the code in here will malfunction. Raise a compiler - * error instead. - * - * In theory, jump labels and the static branch patching infrastructure - * could be decoupled to fix this. - */ -#error asm/jump_label.h included on a non-jump-label kernel -#endif - #define JUMP_LABEL_NOP_SIZE 5 #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 46ac84b506f5..8a9eba191516 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -11,7 +11,7 @@ #define __CLOBBERS_MEM(clb...) "memory", ## clb -#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) +#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CONFIG_CC_HAS_ASM_GOTO) /* Use asm goto */ @@ -27,7 +27,7 @@ cc_label: c = true; \ c; \ }) -#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ +#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */ /* Use flags output or a set instruction */ @@ -40,7 +40,7 @@ cc_label: c = true; \ c; \ }) -#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */ #define GEN_UNARY_RMWcc_4(op, var, cc, arg0) \ __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM()) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index eb51b0e1189c..00b7e27bc2b7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -49,7 +49,8 @@ obj-$(CONFIG_COMPAT) += signal_compat.o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o -obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o +obj-y += setup.o x86_init.o i8259.o irqinit.o +obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_64) += sys_x86_64.o diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index aac0c1f7e354..f99bd26bd3f1 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -16,8 +16,6 @@ #include #include -#ifdef HAVE_JUMP_LABEL - union jump_code_union { char code[JUMP_LABEL_NOP_SIZE]; struct { @@ -130,5 +128,3 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, if (jlstate == JL_STATE_UPDATE) __jump_label_transform(entry, type, text_poke_early, 1); } - -#endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 78e430f4e15c..c338984c850d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -456,7 +456,7 @@ FOP_END; /* * XXX: inoutclob user must know where the argument is being expanded. - * Relying on CC_HAVE_ASM_GOTO would allow us to remove _fault. + * Relying on CONFIG_CC_HAS_ASM_GOTO would allow us to remove _fault. */ #define asm_safe(insn, inoutclob...) \ ({ \ diff --git a/arch/xtensa/kernel/jump_label.c b/arch/xtensa/kernel/jump_label.c index d108f721c116..61cf6497a646 100644 --- a/arch/xtensa/kernel/jump_label.c +++ b/arch/xtensa/kernel/jump_label.c @@ -10,8 +10,6 @@ #include -#ifdef HAVE_JUMP_LABEL - #define J_OFFSET_MASK 0x0003ffff #define J_SIGN_MASK (~(J_OFFSET_MASK >> 1)) @@ -95,5 +93,3 @@ void arch_jump_label_transform(struct jump_entry *e, patch_text(jump_entry_code(e), &insn, JUMP_LABEL_NOP_SIZE); } - -#endif /* HAVE_JUMP_LABEL */ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 2fd8006153c3..b3419da1a776 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -2,7 +2,7 @@ #ifndef _DYNAMIC_DEBUG_H #define _DYNAMIC_DEBUG_H -#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) +#if defined(CONFIG_JUMP_LABEL) #include #endif @@ -38,7 +38,7 @@ struct _ddebug { #define _DPRINTK_FLAGS_DEFAULT 0 #endif unsigned int flags:8; -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL union { struct static_key_true dd_key_true; struct static_key_false dd_key_false; @@ -83,7 +83,7 @@ void __dynamic_netdev_dbg(struct _ddebug *descriptor, dd_key_init(key, init) \ } -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL #define dd_key_init(key, init) key = (init) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 5df6a621e464..3e113a1fa0f1 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -71,10 +71,6 @@ * Additional babbling in: Documentation/static-keys.txt */ -#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) -# define HAVE_JUMP_LABEL -#endif - #ifndef __ASSEMBLY__ #include @@ -86,7 +82,7 @@ extern bool static_key_initialized; "%s(): static key '%pS' used before call to jump_label_init()", \ __func__, (key)) -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL struct static_key { atomic_t enabled; @@ -114,10 +110,10 @@ struct static_key { struct static_key { atomic_t enabled; }; -#endif /* HAVE_JUMP_LABEL */ +#endif /* CONFIG_JUMP_LABEL */ #endif /* __ASSEMBLY__ */ -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL #include #ifndef __ASSEMBLY__ @@ -192,7 +188,7 @@ enum jump_label_type { struct module; -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL @@ -245,7 +241,7 @@ extern void static_key_disable_cpuslocked(struct static_key *key); { .enabled = { 0 }, \ { .entries = (void *)JUMP_TYPE_FALSE } } -#else /* !HAVE_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL */ #include #include @@ -330,7 +326,7 @@ static inline void static_key_disable(struct static_key *key) #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } -#endif /* HAVE_JUMP_LABEL */ +#endif /* CONFIG_JUMP_LABEL */ #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled @@ -394,7 +390,7 @@ extern bool ____wrong_branch_error(void); static_key_count((struct static_key *)x) > 0; \ }) -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL /* * Combine the right initial value (type) with the right branch order @@ -476,12 +472,12 @@ extern bool ____wrong_branch_error(void); unlikely(branch); \ }) -#else /* !HAVE_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL */ #define static_branch_likely(x) likely(static_key_enabled(&(x)->key)) #define static_branch_unlikely(x) unlikely(static_key_enabled(&(x)->key)) -#endif /* HAVE_JUMP_LABEL */ +#endif /* CONFIG_JUMP_LABEL */ /* * Advanced usage; refcount, branch is enabled when: count != 0 diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h index baa8eabbaa56..a49f2b45b3f0 100644 --- a/include/linux/jump_label_ratelimit.h +++ b/include/linux/jump_label_ratelimit.h @@ -5,21 +5,19 @@ #include #include -#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) +#if defined(CONFIG_JUMP_LABEL) struct static_key_deferred { struct static_key key; unsigned long timeout; struct delayed_work work; }; -#endif -#ifdef HAVE_JUMP_LABEL extern void static_key_slow_dec_deferred(struct static_key_deferred *key); extern void static_key_deferred_flush(struct static_key_deferred *key); extern void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); -#else /* !HAVE_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL */ struct static_key_deferred { struct static_key key; }; @@ -38,5 +36,5 @@ jump_label_rate_limit(struct static_key_deferred *key, { STATIC_KEY_CHECK_USE(key); } -#endif /* HAVE_JUMP_LABEL */ +#endif /* CONFIG_JUMP_LABEL */ #endif /* _LINUX_JUMP_LABEL_RATELIMIT_H */ diff --git a/include/linux/module.h b/include/linux/module.h index d5453eb5a68b..9a21fe3509af 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -436,7 +436,7 @@ struct module { unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; #endif -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; #endif diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index bbe99d2b28b4..72cb19c3db6a 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -176,7 +176,7 @@ void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, int nf_register_sockopt(struct nf_sockopt_ops *reg); void nf_unregister_sockopt(struct nf_sockopt_ops *reg); -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #endif @@ -198,7 +198,7 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct nf_hook_entries *hook_head = NULL; int ret = 1; -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h index 554c920691dd..a13774be2eb5 100644 --- a/include/linux/netfilter_ingress.h +++ b/include/linux/netfilter_ingress.h @@ -8,7 +8,7 @@ #ifdef CONFIG_NETFILTER_INGRESS static inline bool nf_hook_ingress_active(const struct sk_buff *skb) { -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) return false; #endif diff --git a/init/Kconfig b/init/Kconfig index 3e6be1694766..d47cb77a220e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -23,6 +23,9 @@ config CLANG_VERSION int default $(shell,$(srctree)/scripts/clang-version.sh $(CC)) +config CC_HAS_ASM_GOTO + def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC)) + config CONSTRUCTORS bool depends on !UML diff --git a/kernel/jump_label.c b/kernel/jump_label.c index b28028b08d44..bad96b476eb6 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -18,8 +18,6 @@ #include #include -#ifdef HAVE_JUMP_LABEL - /* mutex to protect coming/going of the the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); @@ -80,13 +78,13 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) static void jump_label_update(struct static_key *key); /* - * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h. + * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h. * The use of 'atomic_read()' requires atomic.h and its problematic for some * kernel headers such as kernel.h and others. Since static_key_count() is not - * used in the branch statements as it is for the !HAVE_JUMP_LABEL case its ok + * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case its ok * to have it be a function here. Similarly, for 'static_key_enable()' and * 'static_key_disable()', which require bug.h. This should allow jump_label.h - * to be included from most/all places for HAVE_JUMP_LABEL. + * to be included from most/all places for CONFIG_JUMP_LABEL. */ int static_key_count(struct static_key *key) { @@ -791,5 +789,3 @@ static __init int jump_label_test(void) } early_initcall(jump_label_test); #endif /* STATIC_KEYS_SELFTEST */ - -#endif /* HAVE_JUMP_LABEL */ diff --git a/kernel/module.c b/kernel/module.c index fcbc0128810b..2ad1b5239910 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3102,7 +3102,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->bpf_raw_events), &mod->num_bpf_raw_events); #endif -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", sizeof(*mod->jump_entries), &mod->num_jump_entries); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 223f78d5c111..a674c7db2f29 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -24,7 +24,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) /* * Debugging: various feature bits * diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 02bd5f969b21..de3de997e245 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -73,7 +73,7 @@ static int sched_feat_show(struct seq_file *m, void *v) return 0; } -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL #define jump_label_key__true STATIC_KEY_INIT_TRUE #define jump_label_key__false STATIC_KEY_INIT_FALSE @@ -99,7 +99,7 @@ static void sched_feat_enable(int i) #else static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; -#endif /* HAVE_JUMP_LABEL */ +#endif /* CONFIG_JUMP_LABEL */ static int sched_feat_set(char *cmp) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6483834f1278..50aa2aba69bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4217,7 +4217,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_CFS_BANDWIDTH -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL static struct static_key __cfs_bandwidth_used; static inline bool cfs_bandwidth_used(void) @@ -4234,7 +4234,7 @@ void cfs_bandwidth_usage_dec(void) { static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } -#else /* HAVE_JUMP_LABEL */ +#else /* CONFIG_JUMP_LABEL */ static bool cfs_bandwidth_used(void) { return true; @@ -4242,7 +4242,7 @@ static bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) {} void cfs_bandwidth_usage_dec(void) {} -#endif /* HAVE_JUMP_LABEL */ +#endif /* CONFIG_JUMP_LABEL */ /* * default period for cfs group bandwidth. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0ba08924e017..d04530bf251f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1488,7 +1488,7 @@ enum { #undef SCHED_FEAT -#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) /* * To support run-time toggling of sched features, all the translation units @@ -1508,7 +1508,7 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ +#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */ /* * Each translation unit has its own copy of sysctl_sched_features to allow @@ -1524,7 +1524,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ +#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index c7c96bc7654a..dbf2b457e47e 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -188,7 +188,7 @@ static int ddebug_change(const struct ddebug_query *query, newflags = (dp->flags & mask) | flags; if (newflags == dp->flags) continue; -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL if (dp->flags & _DPRINTK_FLAGS_PRINT) { if (!(flags & _DPRINTK_FLAGS_PRINT)) static_branch_disable(&dp->key.dd_key_true); diff --git a/net/core/dev.c b/net/core/dev.c index 1b5a4410be0e..82f20022259d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1821,7 +1821,7 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; static void netstamp_clear(struct work_struct *work) @@ -1840,7 +1840,7 @@ static DECLARE_WORK(netstamp_work, netstamp_clear); void net_enable_timestamp(void) { -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL int wanted; while (1) { @@ -1860,7 +1860,7 @@ EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL int wanted; while (1) { diff --git a/net/netfilter/core.c b/net/netfilter/core.c index dc240cb47ddf..93aaec3a54ec 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL_GPL(nf_ipv6_ops); DEFINE_PER_CPU(bool, nf_skb_duplicated); EXPORT_SYMBOL_GPL(nf_skb_duplicated); -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif @@ -347,7 +347,7 @@ static int __nf_register_net_hook(struct net *net, int pf, if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_inc_ingress_queue(); #endif -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]); #endif BUG_ON(p == new_hooks); @@ -405,7 +405,7 @@ static void __nf_unregister_net_hook(struct net *net, int pf, if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_dec_ingress_queue(); #endif -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]); #endif } else { diff --git a/scripts/gcc-goto.sh b/scripts/gcc-goto.sh index 083c526073ef..8b980fb2270a 100755 --- a/scripts/gcc-goto.sh +++ b/scripts/gcc-goto.sh @@ -3,7 +3,7 @@ # Test for gcc 'asm goto' support # Copyright (C) 2010, Jason Baron -cat << "END" | $@ -x c - -c -o /dev/null >/dev/null 2>&1 && echo "y" +cat << "END" | $@ -x c - -fno-PIE -c -o /dev/null int main(void) { #if defined(__arm__) || defined(__aarch64__) diff --git a/tools/arch/x86/include/asm/rmwcc.h b/tools/arch/x86/include/asm/rmwcc.h index dc90c0c2fae3..fee7983a90b4 100644 --- a/tools/arch/x86/include/asm/rmwcc.h +++ b/tools/arch/x86/include/asm/rmwcc.h @@ -2,7 +2,7 @@ #ifndef _TOOLS_LINUX_ASM_X86_RMWcc #define _TOOLS_LINUX_ASM_X86_RMWcc -#ifdef CC_HAVE_ASM_GOTO +#ifdef CONFIG_CC_HAS_ASM_GOTO #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ @@ -20,7 +20,7 @@ cc_label: \ #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val)) -#else /* !CC_HAVE_ASM_GOTO */ +#else /* !CONFIG_CC_HAS_ASM_GOTO */ #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ @@ -37,6 +37,6 @@ do { \ #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val)) -#endif /* CC_HAVE_ASM_GOTO */ +#endif /* CONFIG_CC_HAS_ASM_GOTO */ #endif /* _TOOLS_LINUX_ASM_X86_RMWcc */ -- cgit From ad774086356da92a477a87916613d96f4b36005c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 31 Dec 2018 17:24:09 +0900 Subject: kbuild: change filechk to surround the given command with { } filechk_* rules often consist of multiple 'echo' lines. They must be surrounded with { } or ( ) to work correctly. Otherwise, only the string from the last 'echo' would be written into the target. Let's take care of that in the 'filechk' in scripts/Kbuild.include to clean up filechk_* rules. Signed-off-by: Masahiro Yamada --- Kbuild | 2 +- Makefile | 6 +++--- arch/s390/tools/Makefile | 2 +- firmware/Makefile | 5 ++--- kernel/Makefile | 6 +++++- scripts/Kbuild.include | 2 +- scripts/Makefile.lib | 3 +-- 7 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/Kbuild b/Kbuild index 414ae6da1f50..06b801e12fb4 100644 --- a/Kbuild +++ b/Kbuild @@ -27,7 +27,7 @@ timeconst-file := include/generated/timeconst.h targets += $(timeconst-file) define filechk_gentimeconst - (echo $(CONFIG_HZ) | bc -q $< ) + echo $(CONFIG_HZ) | bc -q $< endef $(timeconst-file): kernel/time/timeconst.bc FORCE diff --git a/Makefile b/Makefile index 04a857817f77..437d6033598c 100644 --- a/Makefile +++ b/Makefile @@ -1127,13 +1127,13 @@ define filechk_utsrelease.h echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2; \ exit 1; \ fi; \ - (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";) + echo \#define UTS_RELEASE \"$(KERNELRELEASE)\" endef define filechk_version.h - (echo \#define LINUX_VERSION_CODE $(shell \ + echo \#define LINUX_VERSION_CODE $(shell \ expr $(VERSION) \* 65536 + 0$(PATCHLEVEL) \* 256 + 0$(SUBLEVEL)); \ - echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))';) + echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))' endef $(version_h): FORCE diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile index 48cdac1143a9..cf4846a7ee8d 100644 --- a/arch/s390/tools/Makefile +++ b/arch/s390/tools/Makefile @@ -25,7 +25,7 @@ define filechk_facility-defs.h endef define filechk_dis-defs.h - ( $(obj)/gen_opcode_table < $(srctree)/arch/$(ARCH)/tools/opcodes.txt ) + $(obj)/gen_opcode_table < $(srctree)/arch/$(ARCH)/tools/opcodes.txt endef $(kapi)/facility-defs.h: $(obj)/gen_facilities FORCE diff --git a/firmware/Makefile b/firmware/Makefile index e2f7dd2f30e0..37e5ae387400 100644 --- a/firmware/Makefile +++ b/firmware/Makefile @@ -13,7 +13,7 @@ ASM_WORD = $(if $(CONFIG_64BIT),.quad,.long) ASM_ALIGN = $(if $(CONFIG_64BIT),3,2) PROGBITS = $(if $(CONFIG_ARM),%,@)progbits -filechk_fwbin = { \ +filechk_fwbin = \ echo "/* Generated by $(src)/Makefile */" ;\ echo " .section .rodata" ;\ echo " .p2align $(ASM_ALIGN)" ;\ @@ -28,8 +28,7 @@ filechk_fwbin = { \ echo " .p2align $(ASM_ALIGN)" ;\ echo " $(ASM_WORD) _fw_$(FWSTR)_name" ;\ echo " $(ASM_WORD) _fw_$(FWSTR)_bin" ;\ - echo " $(ASM_WORD) _fw_end - _fw_$(FWSTR)_bin" ;\ -} + echo " $(ASM_WORD) _fw_end - _fw_$(FWSTR)_bin" $(obj)/%.gen.S: FORCE $(call filechk,fwbin) diff --git a/kernel/Makefile b/kernel/Makefile index cde93d54c571..6aa7543bcdb2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -122,7 +122,11 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") +filechk_ikconfiggz = \ + echo "static const char kernel_config_data[] __used = MAGIC_START"; \ + cat $< | scripts/bin2c; \ + echo "MAGIC_END;" + targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 46bf1a073f5d..74a3fe7ddc01 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -56,7 +56,7 @@ kecho := $($(quiet)kecho) define filechk $(Q)set -e; \ mkdir -p $(dir $@); \ - $(filechk_$(1)) > $@.tmp; \ + { $(filechk_$(1)); } > $@.tmp; \ if [ -r $@ ] && cmp -s $@ $@.tmp; then \ rm -f $@.tmp; \ else \ diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 390957f9306f..12b88d09c3a4 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -417,7 +417,6 @@ endef # Use filechk to avoid rebuilds when a header changes, but the resulting file # does not define filechk_offsets - ( \ echo "#ifndef $2"; \ echo "#define $2"; \ echo "/*"; \ @@ -428,5 +427,5 @@ define filechk_offsets echo ""; \ sed -ne $(sed-offsets) < $<; \ echo ""; \ - echo "#endif" ) + echo "#endif" endef -- cgit