From 571d91dcadfa3cef499010b4eddb9b58b0da4d24 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Wed, 25 Oct 2023 13:16:19 -0700
Subject: perf: Add branch stack counters

Currently, the additional information of a branch entry is stored in a
u64 space. With more and more information added, the space is running
out. For example, the number of occurrences of events will be added for
each branch.

Two places were suggested to append the counters.
https://lore.kernel.org/lkml/20230802215814.GH231007@hirez.programming.kicks-ass.net/
One place is right after the flags of each branch entry. This changes
the existing struct perf_branch_entry, and later arch-specific
implementations would have to be really careful to consistently pick
the right struct. The other place is right after the entire struct
perf_branch_stack. The disadvantage is that the pointer to the extra
space has to be recorded and the common interface
perf_sample_save_brstack() has to be updated. The latter is much more
straightforward and should be easy to understand and maintain; it is
what this patch implements.

Add a new branch sample type, PERF_SAMPLE_BRANCH_COUNTERS, to indicate
the events which are recorded in the branch info. The "u64 counters" may
store the occurrences of several events. The information regarding the
number of events/counters and the width of each counter should be
exposed via sysfs as a reference for the perf tool. Define the
branch_counter_nr and branch_counter_width ABI here. The support will
be implemented later in the Intel-specific patch.

Suggested-by: Peter Zijlstra (Intel)
Signed-off-by: Kan Liang
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20231025201626.3000228-1-kan.liang@linux.intel.com
---
 kernel/events/core.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3eb26c2c6e65..d27ffd80ed67 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7341,6 +7341,14 @@ void perf_output_sample(struct perf_output_handle *handle,
 			if (branch_sample_hw_index(event))
 				perf_output_put(handle, data->br_stack->hw_idx);
 			perf_output_copy(handle, data->br_stack->entries, size);
+			/*
+			 * Add the extension space which is appended
+			 * right after the struct perf_branch_stack.
+			 */
+			if (data->br_stack_cntr) {
+				size = data->br_stack->nr * sizeof(u64);
+				perf_output_copy(handle, data->br_stack_cntr, size);
+			}
 		} else {
 			/*
 			 * we always store at least the value of nr

-- cgit

From 96a2b48e5e1df6698f504969f0f51dc34e52ff3d Mon Sep 17 00:00:00 2001
From: Yafang Shao
Date: Sun, 29 Oct 2023 06:14:28 +0000
Subject: cgroup: Remove unnecessary list_empty()

The root hasn't been removed from the root_list, so the list can't be
empty. And if the root had somehow been removed already, destroying it
a second time would be a bug rather than something to quietly tolerate.
Let's replace the check with WARN_ON_ONCE() for clarity.
Signed-off-by: Yafang Shao Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1d5b9de3b1b9..3a436e4f0da1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1347,10 +1347,9 @@ static void cgroup_destroy_root(struct cgroup_root *root) spin_unlock_irq(&css_set_lock); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - cgroup_root_count--; - } + WARN_ON_ONCE(list_empty(&root->root_list)); + list_del(&root->root_list); + cgroup_root_count--; if (!have_favordynmods) cgroup_favor_dynmods(root, false); -- cgit From d23b5c577715892c87533b13923306acc6243f93 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 29 Oct 2023 06:14:29 +0000 Subject: cgroup: Make operations on the cgroup root_list RCU safe At present, when we perform operations on the cgroup root_list, we must hold the cgroup_mutex, which is a relatively heavyweight lock. In reality, we can make operations on this list RCU-safe, eliminating the need to hold the cgroup_mutex during traversal. Modifications to the list only occur in the cgroup root setup and destroy paths, which should be infrequent in a production environment. In contrast, traversal may occur frequently. Therefore, making it RCU-safe would be beneficial. Signed-off-by: Yafang Shao Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + kernel/cgroup/cgroup-internal.h | 3 ++- kernel/cgroup/cgroup.c | 23 ++++++++++++++++------- 3 files changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4a6b6b77ccb6..4caab0c6b361 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -563,6 +563,7 @@ struct cgroup_root { /* A list running through the active hierarchies */ struct list_head root_list; + struct rcu_head rcu; /* Hierarchy-specific flags */ unsigned int flags; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c56071f150f2..5e17f01ced9f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -170,7 +170,8 @@ extern struct list_head cgroup_roots; /* iterate across the hierarchies */ #define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) + list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ + lockdep_is_held(&cgroup_mutex)) /** * for_each_subsys - iterate all enabled cgroup subsystems diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3a436e4f0da1..19784d44d615 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - kfree(root); + kfree_rcu(root, rcu); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1348,7 +1348,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) spin_unlock_irq(&css_set_lock); WARN_ON_ONCE(list_empty(&root->root_list)); - list_del(&root->root_list); + list_del_rcu(&root->root_list); cgroup_root_count--; if (!have_favordynmods) @@ -1389,7 +1389,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, } } - BUG_ON(!res_cgroup); + /* + * If cgroup_mutex is not held, the cgrp_cset_link will be freed + * before we remove the cgroup root from the root_list. 
Consequently,
+	 * when accessing a cgroup root, the cset_link may have already been
+	 * freed, resulting in a NULL res_cgroup. However, by holding the
+	 * cgroup_mutex, we ensure that res_cgroup can't be NULL.
+	 * If we don't hold cgroup_mutex in the caller, we must do the NULL
+	 * check.
+	 */
 	return res_cgroup;
 }

@@ -1448,7 +1456,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 					    struct cgroup_root *root)
 {
-	lockdep_assert_held(&cgroup_mutex);
 	lockdep_assert_held(&css_set_lock);

 	return __cset_cgroup_from_root(cset, root);
 }
@@ -1456,7 +1463,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,

 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_lock held.
+ * called with css_set_lock held to prevent task's groups from being modified.
+ * Must be called with either cgroup_mutex or rcu read lock to prevent the
+ * cgroup root from being destroyed.
  */
 struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
@@ -2031,7 +2040,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
 	struct cgroup_root *root = ctx->root;
 	struct cgroup *cgrp = &root->cgrp;

-	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD_RCU(&root->root_list);
 	atomic_set(&root->nr_cgrps, 1);
 	cgrp->root = root;
 	init_cgroup_housekeeping(cgrp);
@@ -2114,7 +2123,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	 * care of subsystems' refcounts, which are explicitly dropped in
 	 * the failure exit path.
 	 */
-	list_add(&root->root_list, &cgroup_roots);
+	list_add_rcu(&root->root_list, &cgroup_roots);
 	cgroup_root_count++;

 	/*

-- cgit

From 9067d90006df089b9a1da0d74f0cad232a5d726a Mon Sep 17 00:00:00 2001
From: Yafang Shao
Date: Sun, 29 Oct 2023 06:14:30 +0000
Subject: cgroup: Eliminate the need for cgroup_mutex in proc_cgroup_show()

The cgroup root_list is already RCU-safe. Therefore, we can replace the
cgroup_mutex with the RCU read lock in some particular paths. This
change will be particularly beneficial for frequent operations, such as
`cat /proc/self/cgroup`, in a cgroup1-based container environment.

I did stress tests with this change, as outlined below (with
CONFIG_PROVE_RCU_LIST enabled):

- Continuously mounting and unmounting named cgroups in some tasks,
  for example:

	cgrp_name=$1
	while true
	do
		mount -t cgroup -o none,name=$cgrp_name none /$cgrp_name
		umount /$cgrp_name
	done

- Continuously triggering proc_cgroup_show() in some tasks
  concurrently, for example:

	while true; do cat /proc/self/cgroup > /dev/null; done

They all ran successfully after implementing this change, with no RCU
warnings in dmesg.

Signed-off-by: Yafang Shao
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cgroup.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 19784d44d615..9bb255e41cf2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6285,7 +6285,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 	if (!buf)
 		goto out;

-	cgroup_lock();
+	rcu_read_lock();
 	spin_lock_irq(&css_set_lock);

 	for_each_root(root) {
@@ -6296,6 +6296,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
 			continue;

+		cgrp = task_cgroup_from_root(tsk, root);
+		/* The root has already been unmounted. */
+		if (!cgrp)
+			continue;
+
 		seq_printf(m, "%d:", root->hierarchy_id);
 		if (root != &cgrp_dfl_root)
 			for_each_subsys(ss, ssid)
@@ -6306,9 +6311,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
 		seq_putc(m, ':');
-
-		cgrp = task_cgroup_from_root(tsk, root);
-
 		/*
 		 * On traditional hierarchies, all zombie tasks show up as
 		 * belonging to the root cgroup. On the default hierarchy,
@@ -6340,7 +6342,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 	retval = 0;
 out_unlock:
 	spin_unlock_irq(&css_set_lock);
-	cgroup_unlock();
+	rcu_read_unlock();
 	kfree(buf);
 out:
 	return retval;

-- cgit

From 0008454e8fd30ed0017a9a35b8dd708f168931b8 Mon Sep 17 00:00:00 2001
From: Yafang Shao
Date: Sun, 29 Oct 2023 06:14:31 +0000
Subject: cgroup: Add annotation for holding namespace_sem in current_cgns_cgroup_from_root()

When I initially examined the function current_cgns_cgroup_from_root(),
I was perplexed that it doesn't hold cgroup_mutex. However, after Michal
explained the reason[0] to me, I realized that it already holds the
namespace_sem. I believe this intricacy could confuse others as well, so
it is advisable to add an annotation for clarification.

After we replace the cgroup_mutex with the RCU read lock, if current
doesn't hold the namespace_sem, the root cgroup will be NULL. So let's
add a WARN_ON_ONCE() for it.

[0]. https://lore.kernel.org/bpf/afdnpo3jz2ic2ampud7swd6so5carkilts2mkygcaw67vbw6yh@5b5mncf7qyet

Signed-off-by: Yafang Shao
Cc: Michal Koutny
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cgroup.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 9bb255e41cf2..4e610863cc37 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1420,6 +1420,11 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)

 	rcu_read_unlock();

+	/*
+	 * The namespace_sem is held by current, so the root cgroup can't
+	 * be umounted. Therefore, we can ensure that the res is non-NULL.
+	 */
+	WARN_ON_ONCE(!res);
 	return res;
 }

-- cgit

From aecd408b7e50742868b3305c24325a89024e2a30 Mon Sep 17 00:00:00 2001
From: Yafang Shao
Date: Sun, 29 Oct 2023 06:14:32 +0000
Subject: cgroup: Add a new helper for cgroup1 hierarchy

A new helper is added for the cgroup1 hierarchy:

- task_get_cgroup1
  Acquires the associated cgroup of a task within a specific cgroup1
  hierarchy. The cgroup1 hierarchy is identified by its hierarchy ID.

This helper function is added to facilitate the tracing of tasks within
a particular container or cgroup directory in BPF programs. It's
important to note that this helper is designed specifically for cgroup1
only.

tj: Use irqsave/restore as suggested by Hou Tao.
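For reference, here is a minimal usage sketch (illustrative only, not
part of the patch; the caller and the use of cgroup_id() are
hypothetical). Since task_get_cgroup1() returns the cgroup with a
reference taken via cgroup_tryget(), or an ERR_PTR() on failure, a
caller is expected to pair it with cgroup_put():

	/* Hypothetical caller: look up the task's cgroup ID in one
	 * cgroup1 hierarchy. Returns 0 if the hierarchy or cgroup is gone.
	 */
	static u64 get_task_cgroup1_id(struct task_struct *task, int hierarchy_id)
	{
		struct cgroup *cgrp;
		u64 id;

		cgrp = task_get_cgroup1(task, hierarchy_id);
		if (IS_ERR(cgrp))
			return 0;
		id = cgroup_id(cgrp);	/* safe: we hold a reference */
		cgroup_put(cgrp);
		return id;
	}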
Suggested-by: Tejun Heo
Signed-off-by: Yafang Shao
Cc: Hou Tao
Signed-off-by: Tejun Heo
---
 include/linux/cgroup.h          |  4 +++-
 kernel/cgroup/cgroup-internal.h |  1 -
 kernel/cgroup/cgroup-v1.c       | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0ef0af66080e..34aaf0e87def 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -69,6 +69,7 @@ struct css_task_iter {
 extern struct file_system_type cgroup_fs_type;
 extern struct cgroup_root cgrp_dfl_root;
 extern struct css_set init_css_set;
+extern spinlock_t css_set_lock;

 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
 #include <linux/cgroup_subsys.h>
@@ -386,7 +387,6 @@ static inline void cgroup_unlock(void)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
-extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c)			\
 	rcu_dereference_check((task)->cgroups,		\
 		rcu_read_lock_sched_held() ||		\
@@ -853,4 +853,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}

 #endif /* CONFIG_CGROUP_BPF */

+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
+
 #endif /* _LINUX_CGROUP_H */

diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 5e17f01ced9f..520b90dd97ec 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -164,7 +164,6 @@ struct cgroup_mgctx {
 #define DEFINE_CGROUP_MGCTX(name)				\
 	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)

-extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 76db6c67e39a..04d11a7dd95f 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1262,6 +1262,40 @@ int cgroup1_get_tree(struct fs_context *fc)
 	return ret;
 }

+/**
+ * task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @tsk: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returned. On failure, ERR_PTR is returned.
+ * We limit it to cgroup1 only.
+ */
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
+{
+	struct cgroup *cgrp = ERR_PTR(-ENOENT);
+	struct cgroup_root *root;
+	unsigned long flags;
+
+	rcu_read_lock();
+	for_each_root(root) {
+		/* cgroup1 only*/
+		if (root == &cgrp_dfl_root)
+			continue;
+		if (root->hierarchy_id != hierarchy_id)
+			continue;
+		spin_lock_irqsave(&css_set_lock, flags);
+		cgrp = task_cgroup_from_root(tsk, root);
+		if (!cgrp || !cgroup_tryget(cgrp))
+			cgrp = ERR_PTR(-ENOENT);
+		spin_unlock_irqrestore(&css_set_lock, flags);
+		break;
+	}
+	rcu_read_unlock();
+	return cgrp;
+}
+
 static int __init cgroup1_wq_init(void)
 {
 	/*

-- cgit

From 93f7378734b595fb61e89b802002fb7e3a1267d2 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 1 Nov 2023 20:37:45 -0700
Subject: bpf: derive smin/smax from umin/umax bounds

Add smin/smax derivation from appropriate umin/umax values. Previously
the logic was surprisingly asymmetric, trying to derive umin/umax from
smin/smax (if possible), but not trying to do the same in the other
direction. A simple addition to __reg64_deduce_bounds() fixes this.

Also add a generic comment about u64/s64 ranges and their relationship.
Hopefully that helps readers to understand all the bounds deductions
a bit better.
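To see the new deduction in isolation (my example, not from the patch):
a u64 range is also a valid s64 range whenever both of its ends have the
same sign bit, which is exactly what casting the unsigned bounds to s64
and comparing them checks.

	/* Standalone userspace sketch of the sign-agreement check that
	 * the hunk below adds to __reg64_deduce_bounds(). The sample u64
	 * range sits entirely in the upper ("negative" as s64) half, so
	 * it maps to the s64 range [-256, -240].
	 */
	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t umin = 0xffffffffffffff00ULL;	/* -256 as s64 */
		uint64_t umax = 0xffffffffffffff10ULL;	/* -240 as s64 */

		if ((int64_t)umin <= (int64_t)umax)	/* same sign bit on both ends */
			printf("s64 bounds: [%lld, %lld]\n",
			       (long long)(int64_t)umin, (long long)(int64_t)umax);
		return 0;
	}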
Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bd1c42eb540f..1a5c389b951a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2358,6 +2358,77 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) static void __reg64_deduce_bounds(struct bpf_reg_state *reg) { + /* If u64 range forms a valid s64 range (due to matching sign bit), + * try to learn from that. Let's do a bit of ASCII art to see when + * this is happening. Let's take u64 range first: + * + * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX + * |-------------------------------|--------------------------------| + * + * Valid u64 range is formed when umin and umax are anywhere in the + * range [0, U64_MAX], and umin <= umax. u64 case is simple and + * straightforward. Let's see how s64 range maps onto the same range + * of values, annotated below the line for comparison: + * + * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX + * |-------------------------------|--------------------------------| + * 0 S64_MAX S64_MIN -1 + * + * So s64 values basically start in the middle and they are logically + * contiguous to the right of it, wrapping around from -1 to 0, and + * then finishing as S64_MAX (0x7fffffffffffffff) right before + * S64_MIN. We can try drawing the continuity of u64 vs s64 values + * more visually as mapped to sign-agnostic range of hex values. + * + * u64 start u64 end + * _______________________________________________________________ + * / \ + * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX + * |-------------------------------|--------------------------------| + * 0 S64_MAX S64_MIN -1 + * / \ + * >------------------------------ -------------------------------> + * s64 continues... s64 end s64 start s64 "midpoint" + * + * What this means is that, in general, we can't always derive + * something new about u64 from any random s64 range, and vice versa. + * + * But we can do that in two particular cases. One is when entire + * u64/s64 range is *entirely* contained within left half of the above + * diagram or when it is *entirely* contained in the right half. I.e.: + * + * |-------------------------------|--------------------------------| + * ^ ^ ^ ^ + * A B C D + * + * [A, B] and [C, D] are contained entirely in their respective halves + * and form valid contiguous ranges as both u64 and s64 values. [A, B] + * will be non-negative both as u64 and s64 (and in fact it will be + * identical ranges no matter the signedness). [C, D] treated as s64 + * will be a range of negative values, while in u64 it will be + * non-negative range of values larger than 0x8000000000000000. + * + * Now, any other range here can't be represented in both u64 and s64 + * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid + * contiguous u64 ranges, but they are discontinuous in s64. [B, C] + * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX], + * for example. Similarly, valid s64 range [D, A] (going from negative + * to positive values), would be two separate [D, U64_MAX] and [0, A] + * ranges as u64. 
Currently reg_state can't represent two segments per
+	 * numeric domain, so in such situations we can only derive maximal
+	 * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64).
+	 *
+	 * So we use these facts to derive umin/umax from smin/smax and vice
+	 * versa only if they stay within the same "half". This is equivalent
+	 * to checking sign bit: lower half will have sign bit as zero, upper
+	 * half have sign bit 1. Below in code we simplify this by just
+	 * casting umin/umax as smin/smax and checking if they form valid
+	 * range, and vice versa. Those are equivalent checks.
+	 */
+	if ((s64)reg->umin_value <= (s64)reg->umax_value) {
+		reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value);
+		reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value);
+	}
 	/* Learn sign from signed bounds.
 	 * If we cannot cross the sign boundary, then signed and unsigned bounds
 	 * are the same, so combine.  This works even in the negative case, e.g.

-- cgit

From d540517990a9d105bf0312760665964916ac044f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 1 Nov 2023 20:37:46 -0700
Subject: bpf: derive smin32/smax32 from umin32/umax32 bounds

All the logic that applies to u64 vs s64 equally applies for u32 vs s32
relationships (just taken in a smaller 32-bit numeric space). So do the
same deduction of smin32/smax32 from umin32/umax32, if we can.

Acked-by: Eduard Zingerman
Acked-by: Shung-Hsi Yu
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20231102033759.2541186-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1a5c389b951a..b53ee72a7d72 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2324,6 +2324,13 @@ static void __update_reg_bounds(struct bpf_reg_state *reg)
 /* Uses signed min/max values to inform unsigned, and vice-versa */
 static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
 {
+	/* if u32 range forms a valid s32 range (due to matching sign bit),
+	 * try to learn from that
+	 */
+	if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) {
+		reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value);
+		reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value);
+	}
 	/* Learn sign from signed bounds.
 	 * If we cannot cross the sign boundary, then signed and unsigned bounds
 	 * are the same, so combine.  This works even in the negative case, e.g.

-- cgit

From c1efab6468fd5ef541d47d81dbb62cca27f8db3b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 1 Nov 2023 20:37:47 -0700
Subject: bpf: derive subreg bounds from full bounds when upper 32 bits are constant

The comments in the code explain the idea behind why this is correct;
please check the code and comments together.
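As a concrete illustration of the idea (my example, not from the patch):
if the upper 32 bits are identical across the whole 64-bit range, then
the lower 32 bits form a valid u32 range of their own.

	/* Standalone userspace sketch of the upper-32-bits check used in
	 * the hunk below: u64 range 0x1'0000000A..0x1'0000000F yields the
	 * u32 range [0xa, 0xf].
	 */
	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t umin = 0x10000000AULL;
		uint64_t umax = 0x10000000FULL;

		if ((umin >> 32) == (umax >> 32))	/* upper 32 bits constant */
			printf("u32 bounds: [%#x, %#x]\n",
			       (uint32_t)umin, (uint32_t)umax);
		return 0;
	}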
Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b53ee72a7d72..9e39f12538f7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2324,6 +2324,51 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) /* Uses signed min/max values to inform unsigned, and vice-versa */ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) { + /* If upper 32 bits of u64/s64 range don't change, we can use lower 32 + * bits to improve our u32/s32 boundaries. + * + * E.g., the case where we have upper 32 bits as zero ([10, 20] in + * u64) is pretty trivial, it's obvious that in u32 we'll also have + * [10, 20] range. But this property holds for any 64-bit range as + * long as upper 32 bits in that entire range of values stay the same. + * + * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311] + * in decimal) has the same upper 32 bits throughout all the values in + * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15]) + * range. + * + * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32, + * following the rules outlined below about u64/s64 correspondence + * (which equally applies to u32 vs s32 correspondence). In general it + * depends on actual hexadecimal values of 32-bit range. They can form + * only valid u32, or only valid s32 ranges in some cases. + * + * So we use all these insights to derive bounds for subregisters here. + */ + if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) { + /* u64 to u32 casting preserves validity of low 32 bits as + * a range, if upper 32 bits are the same + */ + reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value); + reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value); + + if ((s32)reg->umin_value <= (s32)reg->umax_value) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); + } + } + if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) { + /* low 32 bits should form a proper u32 range */ + if ((u32)reg->smin_value <= (u32)reg->smax_value) { + reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value); + reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value); + } + /* low 32 bits should form a proper s32 range */ + if ((s32)reg->smin_value <= (s32)reg->smax_value) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); + } + } /* if u32 range forms a valid s32 range (due to matching sign bit), * try to learn from that */ -- cgit From 6593f2e6741f03b49bffc9d55ddd4c1c47853c39 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:48 -0700 Subject: bpf: add special smin32/smax32 derivation from 64-bit bounds Add a special case where we can derive valid s32 bounds from umin/umax or smin/smax by stitching together negative s32 subrange and non-negative s32 subrange. That requires upper 32 bits to form a [N, N+1] range in u32 domain (taking into account wrap around, so 0xffffffff to 0x00000000 is a valid [N, N+1] range in this sense). 
See the code comment for concrete examples. Eduard Zingerman also
provided an alternative explanation ([0]) for more mathematically
inclined readers:

Suppose:
. there are numbers a, b, c
. 2**31 <= b < 2**32
. 0 <= c < 2**31
. umin = 2**32 * a + b
. umax = 2**32 * (a + 1) + c

The number of values in the range represented by [umin; umax] is:
. N = umax - umin + 1 = 2**32 + c - b + 1
. min(N) = 2**32 + 0 - (2**32-1) + 1 = 2, with b = 2**32-1, c = 0
. max(N) = 2**32 + (2**31 - 1) - 2**31 + 1 = 2**32, with b = 2**31, c = 2**31-1

Hence [(s32)b; (s32)c] forms a valid range.

[0] https://lore.kernel.org/bpf/d7af631802f0cfae20df77fe70068702d24bbd31.camel@gmail.com/

Acked-by: Eduard Zingerman
Acked-by: Shung-Hsi Yu
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20231102033759.2541186-7-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9e39f12538f7..0fffbf01328e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2369,6 +2369,29 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
 			reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
 		}
 	}
+	/* Special case where upper bits form a small sequence of two
+	 * sequential numbers (in 32-bit unsigned space, so 0xffffffff to
+	 * 0x00000000 is also valid), while lower bits form a proper s32 range
+	 * going from negative numbers to positive numbers. E.g., let's say we
+	 * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]).
+	 * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff,
+	 * 0x0000000000000000, 0x0000000000000001}). Ignoring upper 32 bits,
+	 * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]).
+	 * Note that it doesn't have to be 0xffffffff going to 0x00000000 in
+	 * upper 32 bits. As a random example, s64 range
+	 * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range
+	 * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister.
+	 */
+	if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) &&
+	    (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) {
+		reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
+		reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
+	}
+	if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) &&
+	    (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) {
+		reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
+		reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
+	}
 	/* if u32 range forms a valid s32 range (due to matching sign bit),
 	 * try to learn from that
 	 */

-- cgit

From c51d5ad6543cc36334ef1fcd762d0df767a0bf7e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 1 Nov 2023 20:37:49 -0700
Subject: bpf: improve deduction of 64-bit bounds from 32-bit bounds

Add a few interesting cases in which we can tighten 64-bit bounds based
on newly learnt information about 32-bit bounds. E.g., when full u64/s64
registers are used in a BPF program, and then eventually compared as
u32/s32. The latter comparison doesn't change the value of the full
register, but it does impose new restrictions on the possible lower 32
bits of such full registers. And we can use that to derive additional
full register bounds information.
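To make the "splice in the low 32 bits" trick concrete, here is a
standalone sketch (my numbers mirror the r1 example in the code comment
below; the snippet itself is not part of the patch):

	/* Standalone userspace sketch: splice learned u32 bounds
	 * [1, 0x7fffffff] into the low 32 bits of the u64 bounds
	 * [0x1'00000000, 0x3'80000000], tightening them to
	 * [0x1'00000001, 0x3'7fffffff].
	 */
	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t umin = 0x100000000ULL, umax = 0x380000000ULL;
		uint32_t u32_min = 1, u32_max = 0x7fffffff;
		uint64_t new_umin = (umin & ~0xffffffffULL) | u32_min;
		uint64_t new_umax = (umax & ~0xffffffffULL) | u32_max;

		if (new_umin > umin)	/* i.e. max_t(u64, umin, new_umin) */
			umin = new_umin;
		if (new_umax < umax)	/* i.e. min_t(u64, umax, new_umax) */
			umax = new_umax;
		printf("tightened u64 bounds: [%#llx, %#llx]\n",
		       (unsigned long long)umin, (unsigned long long)umax);
		return 0;
	}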
Acked-by: Eduard Zingerman
Signed-off-by: Andrii Nakryiko
Acked-by: Shung-Hsi Yu
Link: https://lore.kernel.org/r/20231102033759.2541186-8-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0fffbf01328e..969f1ecfe310 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2536,10 +2536,54 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
 	}
 }

+static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
+{
+	/* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
+	 * values on both sides of 64-bit range in hope to have tighter range.
+	 * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
+	 * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
+	 * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
+	 * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of
+	 * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
+	 * better overall bounds for r1 as [0x1'00000001; 0x3'7fffffff].
+	 * We just need to make sure that derived bounds we are intersecting
+	 * with are well-formed ranges in respective s64 or u64 domain, just
+	 * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
+	 */
+	__u64 new_umin, new_umax;
+	__s64 new_smin, new_smax;
+
+	/* u32 -> u64 tightening, it's always well-formed */
+	new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value;
+	new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value;
+	reg->umin_value = max_t(u64, reg->umin_value, new_umin);
+	reg->umax_value = min_t(u64, reg->umax_value, new_umax);
+	/* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */
+	new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value;
+	new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value;
+	reg->smin_value = max_t(s64, reg->smin_value, new_smin);
+	reg->smax_value = min_t(s64, reg->smax_value, new_smax);
+
+	/* if s32 can be treated as valid u32 range, we can use it as well */
+	if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
+		/* s32 -> u64 tightening */
+		new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
+		new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
+		reg->umin_value = max_t(u64, reg->umin_value, new_umin);
+		reg->umax_value = min_t(u64, reg->umax_value, new_umax);
+		/* s32 -> s64 tightening */
+		new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
+		new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
+		reg->smin_value = max_t(s64, reg->smin_value, new_smin);
+		reg->smax_value = min_t(s64, reg->smax_value, new_smax);
+	}
+}
+
 static void __reg_deduce_bounds(struct bpf_reg_state *reg)
 {
 	__reg32_deduce_bounds(reg);
 	__reg64_deduce_bounds(reg);
+	__reg_deduce_mixed_bounds(reg);
 }

 /* Attempts to improve var_off based on unsigned min/max information */

-- cgit

From d7f00873817129e62f8c70891cb13c8eafe9feef Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 1 Nov 2023 20:37:50 -0700
Subject: bpf: try harder to deduce register bounds from different numeric domains

There are cases (caught by subsequent reg_bounds tests in selftests/bpf)
where performing one round of __reg_deduce_bounds() doesn't propagate
all the information from, say, s32 to u32 bounds and then from newly
learned u32 bounds back to u64 and s64.
So perform __reg_deduce_bounds() twice to make sure such derivations are
propagated fully after reg_bounds_sync().

One such example is the test `(s64)[0xffffffff00000001; 0] (u64)<
0xffffffff00000000` from the selftest patch in this patch set. It
demonstrates an intricate dance of u64 -> s64 -> u64 -> u32 bounds
adjustments, which requires two rounds of __reg_deduce_bounds(). Here is
the corresponding refinement log from the selftest, showing the
evolution of knowledge:

REFINING (FALSE R1) (u64)SRC=[0xffffffff00000000; U64_MAX] (u64)DST_OLD=[0; U64_MAX] (u64)DST_NEW=[0xffffffff00000000; U64_MAX]
REFINING (FALSE R1) (u64)SRC=[0xffffffff00000000; U64_MAX] (s64)DST_OLD=[0xffffffff00000001; 0] (s64)DST_NEW=[0xffffffff00000001; -1]
REFINING (FALSE R1) (s64)SRC=[0xffffffff00000001; -1] (u64)DST_OLD=[0xffffffff00000000; U64_MAX] (u64)DST_NEW=[0xffffffff00000001; U64_MAX]
REFINING (FALSE R1) (u64)SRC=[0xffffffff00000001; U64_MAX] (u32)DST_OLD=[0; U32_MAX] (u32)DST_NEW=[1; U32_MAX]

R1 initially has smin/smax set to [0xffffffff00000001; -1], while
umin/umax is unknown. After the (u64)< comparison, in the FALSE branch
we gain the knowledge that umin/umax is [0xffffffff00000000; U64_MAX].
That causes smin/smax to learn that zero can't happen and the upper
bound is -1. Then smin/smax is adjusted from umin/umax, improving the
lower bound from 0xffffffff00000000 to 0xffffffff00000001. And then
eventually umin32/umax32 bounds are derived from umin/umax and become
[1; U32_MAX].

The selftest in the last patch actually implements multi-round
fixed-point convergence logic, but so far all the tests are handled by
two rounds of reg_bounds_sync() on the verifier state, so we keep it
simple for now.

Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20231102033759.2541186-9-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 969f1ecfe310..61f17b63ba00 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2605,6 +2605,7 @@ static void reg_bounds_sync(struct bpf_reg_state *reg)
 	__update_reg_bounds(reg);
 	/* We might have learned something about the sign bit. */
 	__reg_deduce_bounds(reg);
+	__reg_deduce_bounds(reg);
 	/* We might have learned some bits from the bounds. */
 	__reg_bound_offset(reg);
 	/* Intersecting with the old var_off might have improved our bounds

-- cgit

From 9e314f5d8682e1fe6ac214fb34580a238b6fd3c4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 1 Nov 2023 20:37:51 -0700
Subject: bpf: drop knowledge-losing __reg_combine_{32,64}_into_{64,32} logic

When performing a 32-bit conditional operation on the lower 32 bits of
a full 64-bit register, the register's full value isn't changed. We just
potentially gain new knowledge about that register's lower 32 bits.

Unfortunately, the __reg_combine_{32,64}_into_{64,32} logic that
reg_set_min_max() performs as a last step can lose information in some
cases due to __mark_reg64_unbounded() and __reg_assign_32_into_64().
That's bad and completely unnecessary. Especially
__reg_assign_32_into_64() looks completely out of place here, because we
are not performing a zero-extending subregister assignment during a
conditional jump.

So this patch replaces __reg_combine_* with just a normal
reg_bounds_sync(), which will do a proper job of deriving u64/s64 bounds
from u32/s32, and vice versa (among all other combinations).
__reg_combine_64_into_32() is also used in one more place, coerce_reg_to_size(), while handling 1- and 2-byte register loads. Looking into this, it seems like besides marking subregister as unbounded before performing reg_bounds_sync(), we were also performing deduction of smin32/smax32 and umin32/umax32 bounds from respective smin/smax and umin/umax bounds. It's now redundant as reg_bounds_sync() performs all the same logic more generically (e.g., without unnecessary assumption that upper 32 bits of full register should be zero). Long story short, we remove __reg_combine_64_into_32() completely, and coerce_reg_to_size() now only does resetting subreg to unbounded and then performing reg_bounds_sync() to recover as much information as possible from 64-bit umin/umax and smin/smax bounds, set explicitly in coerce_reg_to_size() earlier. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231102033759.2541186-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 60 +++++++-------------------------------------------- 1 file changed, 8 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 61f17b63ba00..b4d6b5a032ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2639,51 +2639,6 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg) } } -static void __reg_combine_32_into_64(struct bpf_reg_state *reg) -{ - /* special case when 64-bit register has upper 32-bit register - * zeroed. Typically happens after zext or <<32, >>32 sequence - * allowing us to use 32-bit bounds directly, - */ - if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) { - __reg_assign_32_into_64(reg); - } else { - /* Otherwise the best we can do is push lower 32bit known and - * unknown bits into register (var_off set from jmp logic) - * then learn as much as possible from the 64-bit tnum - * known and unknown bits. The previous smin/smax bounds are - * invalid here because of jmp32 compare so mark them unknown - * so they do not impact tnum bounds calculation. - */ - __mark_reg64_unbounded(reg); - } - reg_bounds_sync(reg); -} - -static bool __reg64_bound_s32(s64 a) -{ - return a >= S32_MIN && a <= S32_MAX; -} - -static bool __reg64_bound_u32(u64 a) -{ - return a >= U32_MIN && a <= U32_MAX; -} - -static void __reg_combine_64_into_32(struct bpf_reg_state *reg) -{ - __mark_reg32_unbounded(reg); - if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) { - reg->s32_min_value = (s32)reg->smin_value; - reg->s32_max_value = (s32)reg->smax_value; - } - if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) { - reg->u32_min_value = (u32)reg->umin_value; - reg->u32_max_value = (u32)reg->umax_value; - } - reg_bounds_sync(reg); -} - /* Mark a register as having a completely unknown (scalar) value. */ static void __mark_reg_unknown(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) @@ -6387,9 +6342,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) * values are also truncated so we push 64-bit bounds into * 32-bit bounds. Above were truncated < 32-bits already. 
*/ - if (size >= 4) - return; - __reg_combine_64_into_32(reg); + if (size < 4) { + __mark_reg32_unbounded(reg); + reg_bounds_sync(reg); + } } static void set_sext64_default_val(struct bpf_reg_state *reg, int size) @@ -14642,13 +14598,13 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, tnum_subreg(false_32off)); true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off), tnum_subreg(true_32off)); - __reg_combine_32_into_64(false_reg); - __reg_combine_32_into_64(true_reg); + reg_bounds_sync(false_reg); + reg_bounds_sync(true_reg); } else { false_reg->var_off = false_64off; true_reg->var_off = true_64off; - __reg_combine_64_into_32(false_reg); - __reg_combine_64_into_32(true_reg); + reg_bounds_sync(false_reg); + reg_bounds_sync(true_reg); } } -- cgit From c2a3ab094683ddc154879a1364fc7cb0228f96a6 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:53 -0700 Subject: bpf: rename is_branch_taken reg arguments to prepare for the second one Just taking mundane refactoring bits out into a separate patch. No functional changes. Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231102033759.2541186-12-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 108 +++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b4d6b5a032ce..770d5bd54ff3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14188,26 +14188,26 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, })); } -static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) +static int is_branch32_taken(struct bpf_reg_state *reg1, u32 val, u8 opcode) { - struct tnum subreg = tnum_subreg(reg->var_off); + struct tnum subreg = tnum_subreg(reg1->var_off); s32 sval = (s32)val; switch (opcode) { case BPF_JEQ: if (tnum_is_const(subreg)) return !!tnum_equals_const(subreg, val); - else if (val < reg->u32_min_value || val > reg->u32_max_value) + else if (val < reg1->u32_min_value || val > reg1->u32_max_value) return 0; - else if (sval < reg->s32_min_value || sval > reg->s32_max_value) + else if (sval < reg1->s32_min_value || sval > reg1->s32_max_value) return 0; break; case BPF_JNE: if (tnum_is_const(subreg)) return !tnum_equals_const(subreg, val); - else if (val < reg->u32_min_value || val > reg->u32_max_value) + else if (val < reg1->u32_min_value || val > reg1->u32_max_value) return 1; - else if (sval < reg->s32_min_value || sval > reg->s32_max_value) + else if (sval < reg1->s32_min_value || sval > reg1->s32_max_value) return 1; break; case BPF_JSET: @@ -14217,51 +14217,51 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) return 0; break; case BPF_JGT: - if (reg->u32_min_value > val) + if (reg1->u32_min_value > val) return 1; - else if (reg->u32_max_value <= val) + else if (reg1->u32_max_value <= val) return 0; break; case BPF_JSGT: - if (reg->s32_min_value > sval) + if (reg1->s32_min_value > sval) return 1; - else if (reg->s32_max_value <= sval) + else if (reg1->s32_max_value <= sval) return 0; break; case BPF_JLT: - if (reg->u32_max_value < val) + if (reg1->u32_max_value < val) return 1; - else if (reg->u32_min_value >= val) + else if (reg1->u32_min_value >= val) return 0; break; case BPF_JSLT: - if (reg->s32_max_value < sval) + if (reg1->s32_max_value < sval) return 1; - else if (reg->s32_min_value >= sval) + else if (reg1->s32_min_value >= sval) 
return 0; break; case BPF_JGE: - if (reg->u32_min_value >= val) + if (reg1->u32_min_value >= val) return 1; - else if (reg->u32_max_value < val) + else if (reg1->u32_max_value < val) return 0; break; case BPF_JSGE: - if (reg->s32_min_value >= sval) + if (reg1->s32_min_value >= sval) return 1; - else if (reg->s32_max_value < sval) + else if (reg1->s32_max_value < sval) return 0; break; case BPF_JLE: - if (reg->u32_max_value <= val) + if (reg1->u32_max_value <= val) return 1; - else if (reg->u32_min_value > val) + else if (reg1->u32_min_value > val) return 0; break; case BPF_JSLE: - if (reg->s32_max_value <= sval) + if (reg1->s32_max_value <= sval) return 1; - else if (reg->s32_min_value > sval) + else if (reg1->s32_min_value > sval) return 0; break; } @@ -14270,79 +14270,79 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) } -static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +static int is_branch64_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode) { s64 sval = (s64)val; switch (opcode) { case BPF_JEQ: - if (tnum_is_const(reg->var_off)) - return !!tnum_equals_const(reg->var_off, val); - else if (val < reg->umin_value || val > reg->umax_value) + if (tnum_is_const(reg1->var_off)) + return !!tnum_equals_const(reg1->var_off, val); + else if (val < reg1->umin_value || val > reg1->umax_value) return 0; - else if (sval < reg->smin_value || sval > reg->smax_value) + else if (sval < reg1->smin_value || sval > reg1->smax_value) return 0; break; case BPF_JNE: - if (tnum_is_const(reg->var_off)) - return !tnum_equals_const(reg->var_off, val); - else if (val < reg->umin_value || val > reg->umax_value) + if (tnum_is_const(reg1->var_off)) + return !tnum_equals_const(reg1->var_off, val); + else if (val < reg1->umin_value || val > reg1->umax_value) return 1; - else if (sval < reg->smin_value || sval > reg->smax_value) + else if (sval < reg1->smin_value || sval > reg1->smax_value) return 1; break; case BPF_JSET: - if ((~reg->var_off.mask & reg->var_off.value) & val) + if ((~reg1->var_off.mask & reg1->var_off.value) & val) return 1; - if (!((reg->var_off.mask | reg->var_off.value) & val)) + if (!((reg1->var_off.mask | reg1->var_off.value) & val)) return 0; break; case BPF_JGT: - if (reg->umin_value > val) + if (reg1->umin_value > val) return 1; - else if (reg->umax_value <= val) + else if (reg1->umax_value <= val) return 0; break; case BPF_JSGT: - if (reg->smin_value > sval) + if (reg1->smin_value > sval) return 1; - else if (reg->smax_value <= sval) + else if (reg1->smax_value <= sval) return 0; break; case BPF_JLT: - if (reg->umax_value < val) + if (reg1->umax_value < val) return 1; - else if (reg->umin_value >= val) + else if (reg1->umin_value >= val) return 0; break; case BPF_JSLT: - if (reg->smax_value < sval) + if (reg1->smax_value < sval) return 1; - else if (reg->smin_value >= sval) + else if (reg1->smin_value >= sval) return 0; break; case BPF_JGE: - if (reg->umin_value >= val) + if (reg1->umin_value >= val) return 1; - else if (reg->umax_value < val) + else if (reg1->umax_value < val) return 0; break; case BPF_JSGE: - if (reg->smin_value >= sval) + if (reg1->smin_value >= sval) return 1; - else if (reg->smax_value < sval) + else if (reg1->smax_value < sval) return 0; break; case BPF_JLE: - if (reg->umax_value <= val) + if (reg1->umax_value <= val) return 1; - else if (reg->umin_value > val) + else if (reg1->umin_value > val) return 0; break; case BPF_JSLE: - if (reg->smax_value <= sval) + if (reg1->smax_value <= sval) return 1; - else if 
(reg->smin_value > sval) + else if (reg1->smin_value > sval) return 0; break; } @@ -14357,11 +14357,11 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) * -1 - unknown. Example: "if (reg < 5)" is unknown when register value * range [0,10] */ -static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, +static int is_branch_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode, bool is_jmp32) { - if (__is_pointer_value(false, reg)) { - if (!reg_not_null(reg)) + if (__is_pointer_value(false, reg1)) { + if (!reg_not_null(reg1)) return -1; /* If pointer is valid tests against zero will fail so we can @@ -14381,8 +14381,8 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, } if (is_jmp32) - return is_branch32_taken(reg, val, opcode); - return is_branch64_taken(reg, val, opcode); + return is_branch32_taken(reg1, val, opcode); + return is_branch64_taken(reg1, val, opcode); } static int flip_opcode(u32 opcode) -- cgit From c31534267c180f7ed00288d239a501b554885300 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:54 -0700 Subject: bpf: generalize is_branch_taken() to work with two registers While still assuming that second register is a constant, generalize is_branch_taken-related code to accept two registers instead of register plus explicit constant value. This also, as a side effect, allows to simplify check_cond_jmp_op() by unifying BPF_K case with BPF_X case, for which we use a fake register to represent BPF_K's imm constant as a register. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231102033759.2541186-13-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 770d5bd54ff3..79d01445093b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14188,9 +14188,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, })); } -static int is_branch32_taken(struct bpf_reg_state *reg1, u32 val, u8 opcode) +/* + * , currently assuming reg2 is a constant + */ +static int is_branch32_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode) { struct tnum subreg = tnum_subreg(reg1->var_off); + u32 val = (u32)tnum_subreg(reg2->var_off).value; s32 sval = (s32)val; switch (opcode) { @@ -14270,8 +14274,12 @@ static int is_branch32_taken(struct bpf_reg_state *reg1, u32 val, u8 opcode) } -static int is_branch64_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode) +/* + * , currently assuming reg2 is a constant + */ +static int is_branch64_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode) { + u64 val = reg2->var_off.value; s64 sval = (s64)val; switch (opcode) { @@ -14350,16 +14358,23 @@ static int is_branch64_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode) return -1; } -/* compute branch direction of the expression "if (reg opcode val) goto target;" +/* compute branch direction of the expression "if ( opcode ) goto target;" * and return: * 1 - branch will be taken and "goto target" will be executed * 0 - branch will not be taken and fall-through to next insn - * -1 - unknown. Example: "if (reg < 5)" is unknown when register value + * -1 - unknown. 
Example: "if (reg1 < 5)" is unknown when register value * range [0,10] */ -static int is_branch_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode, - bool is_jmp32) +static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) { + struct tnum reg2_tnum = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; + u64 val; + + if (!tnum_is_const(reg2_tnum)) + return -1; + val = reg2_tnum.value; + if (__is_pointer_value(false, reg1)) { if (!reg_not_null(reg1)) return -1; @@ -14381,8 +14396,8 @@ static int is_branch_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode, } if (is_jmp32) - return is_branch32_taken(reg1, val, opcode); - return is_branch64_taken(reg1, val, opcode); + return is_branch32_taken(reg1, reg2, opcode); + return is_branch64_taken(reg1, reg2, opcode); } static int flip_opcode(u32 opcode) @@ -14853,6 +14868,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; + struct bpf_reg_state fake_reg = {}; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -14893,36 +14909,27 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } + src_reg = &fake_reg; + src_reg->type = SCALAR_VALUE; + __mark_reg_known(src_reg, insn->imm); } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; if (BPF_SRC(insn->code) == BPF_K) { - pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32); + pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); } else if (src_reg->type == SCALAR_VALUE && is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) { - pred = is_branch_taken(dst_reg, - tnum_subreg(src_reg->var_off).value, - opcode, - is_jmp32); + pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); } else if (src_reg->type == SCALAR_VALUE && !is_jmp32 && tnum_is_const(src_reg->var_off)) { - pred = is_branch_taken(dst_reg, - src_reg->var_off.value, - opcode, - is_jmp32); + pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); } else if (dst_reg->type == SCALAR_VALUE && is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) { - pred = is_branch_taken(src_reg, - tnum_subreg(dst_reg->var_off).value, - flip_opcode(opcode), - is_jmp32); + pred = is_branch_taken(src_reg, dst_reg, flip_opcode(opcode), is_jmp32); } else if (dst_reg->type == SCALAR_VALUE && !is_jmp32 && tnum_is_const(dst_reg->var_off)) { - pred = is_branch_taken(src_reg, - dst_reg->var_off.value, - flip_opcode(opcode), - is_jmp32); + pred = is_branch_taken(src_reg, dst_reg, flip_opcode(opcode), is_jmp32); } else if (reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg) && !is_jmp32) { -- cgit From c697289efe4ef38bc5c62f119cb74433f784b826 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:55 -0700 Subject: bpf: move is_branch_taken() down Move is_branch_taken() slightly down. In subsequent patched we'll need both flip_opcode() and is_pkt_ptr_branch_taken() for is_branch_taken(), but instead of sprinkling forward declarations around, it makes more sense to move is_branch_taken() lower below is_pkt_ptr_branch_taken(), and also keep it closer to very tightly related reg_set_min_max(), as they are two critical parts of the same SCALAR range tracking logic. 
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-14-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 84 +++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 79d01445093b..414a7c58b4a4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14358,48 +14358,6 @@ static int is_branch64_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *r return -1; } -/* compute branch direction of the expression "if ( opcode ) goto target;" - * and return: - * 1 - branch will be taken and "goto target" will be executed - * 0 - branch will not be taken and fall-through to next insn - * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value - * range [0,10] - */ -static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, - u8 opcode, bool is_jmp32) -{ - struct tnum reg2_tnum = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; - u64 val; - - if (!tnum_is_const(reg2_tnum)) - return -1; - val = reg2_tnum.value; - - if (__is_pointer_value(false, reg1)) { - if (!reg_not_null(reg1)) - return -1; - - /* If pointer is valid tests against zero will fail so we can - * use this to direct branch taken. - */ - if (val != 0) - return -1; - - switch (opcode) { - case BPF_JEQ: - return 0; - case BPF_JNE: - return 1; - default: - return -1; - } - } - - if (is_jmp32) - return is_branch32_taken(reg1, reg2, opcode); - return is_branch64_taken(reg1, reg2, opcode); -} - static int flip_opcode(u32 opcode) { /* How can we transform "a b" into "b a"? */ @@ -14461,6 +14419,48 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, return -1; } +/* compute branch direction of the expression "if ( opcode ) goto target;" + * and return: + * 1 - branch will be taken and "goto target" will be executed + * 0 - branch will not be taken and fall-through to next insn + * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value + * range [0,10] + */ +static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) +{ + struct tnum reg2_tnum = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; + u64 val; + + if (!tnum_is_const(reg2_tnum)) + return -1; + val = reg2_tnum.value; + + if (__is_pointer_value(false, reg1)) { + if (!reg_not_null(reg1)) + return -1; + + /* If pointer is valid tests against zero will fail so we can + * use this to direct branch taken. + */ + if (val != 0) + return -1; + + switch (opcode) { + case BPF_JEQ: + return 0; + case BPF_JNE: + return 1; + default: + return -1; + } + } + + if (is_jmp32) + return is_branch32_taken(reg1, reg2, opcode); + return is_branch64_taken(reg1, reg2, opcode); +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. -- cgit From b74c2a842bba941945279027083fcee1e9aaa73f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:56 -0700 Subject: bpf: generalize is_branch_taken to handle all conditional jumps in one place Make is_branch_taken() a single entry point for branch pruning decision making, handling both pointer vs pointer, pointer vs scalar, and scalar vs scalar cases in one place. This also nicely cleans up check_cond_jmp_op(). 
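The resulting normalization is worth spelling out (an illustrative
reading of the hunk below, not new code): whenever the constant ends up
on the left-hand side, is_branch_taken() flips the comparison and swaps
the registers, so only the "reg <op> const" form has to be handled.

	/* From the hunk below: normalize "const <op> reg" into
	 * "reg <flipped op> const". E.g. "5 < r1" becomes "r1 > 5"
	 * (BPF_JLT -> BPF_JGT); JEQ/JNE/JSET map to themselves.
	 */
	if (!is_reg_const(reg2, is_jmp32)) {
		opcode = flip_opcode(opcode);
		swap(reg1, reg2);
	}
	/* if neither side is a known constant, no decision can be made */
	if (!is_reg_const(reg2, is_jmp32))
		return -1;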
Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-15-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 414a7c58b4a4..17bbff33e0e8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14188,6 +14188,19 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, })); } +/* check if register is a constant scalar value */ +static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32) +{ + return reg->type == SCALAR_VALUE && + tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off); +} + +/* assuming is_reg_const() is true, return constant value of a register */ +static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32) +{ + return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value; +} + /* * , currently assuming reg2 is a constant */ @@ -14429,12 +14442,20 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32) { - struct tnum reg2_tnum = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; u64 val; - if (!tnum_is_const(reg2_tnum)) + if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32) + return is_pkt_ptr_branch_taken(reg1, reg2, opcode); + + /* try to make sure reg2 is a constant SCALAR_VALUE */ + if (!is_reg_const(reg2, is_jmp32)) { + opcode = flip_opcode(opcode); + swap(reg1, reg2); + } + /* for now we expect reg2 to be a constant to make any useful decisions */ + if (!is_reg_const(reg2, is_jmp32)) return -1; - val = reg2_tnum.value; + val = reg_const_value(reg2, is_jmp32); if (__is_pointer_value(false, reg1)) { if (!reg_not_null(reg1)) @@ -14915,27 +14936,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - - if (BPF_SRC(insn->code) == BPF_K) { - pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); - } else if (src_reg->type == SCALAR_VALUE && - is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) { - pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); - } else if (src_reg->type == SCALAR_VALUE && - !is_jmp32 && tnum_is_const(src_reg->var_off)) { - pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); - } else if (dst_reg->type == SCALAR_VALUE && - is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) { - pred = is_branch_taken(src_reg, dst_reg, flip_opcode(opcode), is_jmp32); - } else if (dst_reg->type == SCALAR_VALUE && - !is_jmp32 && tnum_is_const(dst_reg->var_off)) { - pred = is_branch_taken(src_reg, dst_reg, flip_opcode(opcode), is_jmp32); - } else if (reg_is_pkt_pointer_any(dst_reg) && - reg_is_pkt_pointer_any(src_reg) && - !is_jmp32) { - pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode); - } - + pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { /* If we get here with a dst_reg pointer type it is because * above is_branch_taken() special cased the 0 comparison. -- cgit From 4d345887d2e5a1915600cb5d37b16c4088c6ee1c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:57 -0700 Subject: bpf: unify 32-bit and 64-bit is_branch_taken logic Combine 32-bit and 64-bit is_branch_taken logic for SCALAR_VALUE registers. 
It makes it easier to see parallels between two domains (32-bit and 64-bit), and makes subsequent refactoring more straightforward. No functional changes. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-16-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 200 +++++++++++++++----------------------------------- 1 file changed, 59 insertions(+), 141 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17bbff33e0e8..c77cca5c4461 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14204,166 +14204,86 @@ static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32) /* * , currently assuming reg2 is a constant */ -static int is_branch32_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode) +static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) { - struct tnum subreg = tnum_subreg(reg1->var_off); - u32 val = (u32)tnum_subreg(reg2->var_off).value; - s32 sval = (s32)val; + struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off; + u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value; + u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value; + s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value; + s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value; + u64 uval = is_jmp32 ? (u32)tnum_subreg(reg2->var_off).value : reg2->var_off.value; + s64 sval = is_jmp32 ? (s32)uval : (s64)uval; switch (opcode) { case BPF_JEQ: - if (tnum_is_const(subreg)) - return !!tnum_equals_const(subreg, val); - else if (val < reg1->u32_min_value || val > reg1->u32_max_value) + if (tnum_is_const(t1)) + return !!tnum_equals_const(t1, uval); + else if (uval < umin1 || uval > umax1) return 0; - else if (sval < reg1->s32_min_value || sval > reg1->s32_max_value) + else if (sval < smin1 || sval > smax1) return 0; break; case BPF_JNE: - if (tnum_is_const(subreg)) - return !tnum_equals_const(subreg, val); - else if (val < reg1->u32_min_value || val > reg1->u32_max_value) + if (tnum_is_const(t1)) + return !tnum_equals_const(t1, uval); + else if (uval < umin1 || uval > umax1) return 1; - else if (sval < reg1->s32_min_value || sval > reg1->s32_max_value) + else if (sval < smin1 || sval > smax1) return 1; break; case BPF_JSET: - if ((~subreg.mask & subreg.value) & val) + if ((~t1.mask & t1.value) & uval) return 1; - if (!((subreg.mask | subreg.value) & val)) + if (!((t1.mask | t1.value) & uval)) return 0; break; case BPF_JGT: - if (reg1->u32_min_value > val) + if (umin1 > uval ) return 1; - else if (reg1->u32_max_value <= val) + else if (umax1 <= uval) return 0; break; case BPF_JSGT: - if (reg1->s32_min_value > sval) + if (smin1 > sval) return 1; - else if (reg1->s32_max_value <= sval) + else if (smax1 <= sval) return 0; break; case BPF_JLT: - if (reg1->u32_max_value < val) + if (umax1 < uval) return 1; - else if (reg1->u32_min_value >= val) + else if (umin1 >= uval) return 0; break; case BPF_JSLT: - if (reg1->s32_max_value < sval) + if (smax1 < sval) return 1; - else if (reg1->s32_min_value >= sval) + else if (smin1 >= sval) return 0; break; case BPF_JGE: - if (reg1->u32_min_value >= val) + if (umin1 >= uval) return 1; - else if (reg1->u32_max_value < val) + else if (umax1 < uval) return 0; break; case BPF_JSGE: - if (reg1->s32_min_value >= sval) + if (smin1 >= sval) return 1; - else if (reg1->s32_max_value < sval) + else if 
(smax1 < sval) return 0; break; case BPF_JLE: - if (reg1->u32_max_value <= val) + if (umax1 <= uval) return 1; - else if (reg1->u32_min_value > val) + else if (umin1 > uval) return 0; break; case BPF_JSLE: - if (reg1->s32_max_value <= sval) + if (smax1 <= sval) return 1; - else if (reg1->s32_min_value > sval) - return 0; - break; - } - - return -1; -} - - -/* - * , currently assuming reg2 is a constant - */ -static int is_branch64_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode) -{ - u64 val = reg2->var_off.value; - s64 sval = (s64)val; - - switch (opcode) { - case BPF_JEQ: - if (tnum_is_const(reg1->var_off)) - return !!tnum_equals_const(reg1->var_off, val); - else if (val < reg1->umin_value || val > reg1->umax_value) - return 0; - else if (sval < reg1->smin_value || sval > reg1->smax_value) - return 0; - break; - case BPF_JNE: - if (tnum_is_const(reg1->var_off)) - return !tnum_equals_const(reg1->var_off, val); - else if (val < reg1->umin_value || val > reg1->umax_value) - return 1; - else if (sval < reg1->smin_value || sval > reg1->smax_value) - return 1; - break; - case BPF_JSET: - if ((~reg1->var_off.mask & reg1->var_off.value) & val) - return 1; - if (!((reg1->var_off.mask | reg1->var_off.value) & val)) - return 0; - break; - case BPF_JGT: - if (reg1->umin_value > val) - return 1; - else if (reg1->umax_value <= val) - return 0; - break; - case BPF_JSGT: - if (reg1->smin_value > sval) - return 1; - else if (reg1->smax_value <= sval) - return 0; - break; - case BPF_JLT: - if (reg1->umax_value < val) - return 1; - else if (reg1->umin_value >= val) - return 0; - break; - case BPF_JSLT: - if (reg1->smax_value < sval) - return 1; - else if (reg1->smin_value >= sval) - return 0; - break; - case BPF_JGE: - if (reg1->umin_value >= val) - return 1; - else if (reg1->umax_value < val) - return 0; - break; - case BPF_JSGE: - if (reg1->smin_value >= sval) - return 1; - else if (reg1->smax_value < sval) - return 0; - break; - case BPF_JLE: - if (reg1->umax_value <= val) - return 1; - else if (reg1->umin_value > val) - return 0; - break; - case BPF_JSLE: - if (reg1->smax_value <= sval) - return 1; - else if (reg1->smin_value > sval) + else if (smin1 > sval) return 0; break; } @@ -14477,9 +14397,7 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg } } - if (is_jmp32) - return is_branch32_taken(reg1, reg2, opcode); - return is_branch64_taken(reg1, reg2, opcode); + return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32); } /* Adjusts the register min/max values in the case that the dst_reg is the @@ -14489,15 +14407,15 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg */ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, - u64 val, u32 val32, + u64 uval, u32 uval32, u8 opcode, bool is_jmp32) { struct tnum false_32off = tnum_subreg(false_reg->var_off); struct tnum false_64off = false_reg->var_off; struct tnum true_32off = tnum_subreg(true_reg->var_off); struct tnum true_64off = true_reg->var_off; - s64 sval = (s64)val; - s32 sval32 = (s32)val32; + s64 sval = (s64)uval; + s32 sval32 = (s32)uval32; /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into @@ -14520,49 +14438,49 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, */ case BPF_JEQ: if (is_jmp32) { - __mark_reg32_known(true_reg, val32); + __mark_reg32_known(true_reg, uval32); true_32off = tnum_subreg(true_reg->var_off); } 
else { - ___mark_reg_known(true_reg, val); + ___mark_reg_known(true_reg, uval); true_64off = true_reg->var_off; } break; case BPF_JNE: if (is_jmp32) { - __mark_reg32_known(false_reg, val32); + __mark_reg32_known(false_reg, uval32); false_32off = tnum_subreg(false_reg->var_off); } else { - ___mark_reg_known(false_reg, val); + ___mark_reg_known(false_reg, uval); false_64off = false_reg->var_off; } break; case BPF_JSET: if (is_jmp32) { - false_32off = tnum_and(false_32off, tnum_const(~val32)); - if (is_power_of_2(val32)) + false_32off = tnum_and(false_32off, tnum_const(~uval32)); + if (is_power_of_2(uval32)) true_32off = tnum_or(true_32off, - tnum_const(val32)); + tnum_const(uval32)); } else { - false_64off = tnum_and(false_64off, tnum_const(~val)); - if (is_power_of_2(val)) + false_64off = tnum_and(false_64off, tnum_const(~uval)); + if (is_power_of_2(uval)) true_64off = tnum_or(true_64off, - tnum_const(val)); + tnum_const(uval)); } break; case BPF_JGE: case BPF_JGT: { if (is_jmp32) { - u32 false_umax = opcode == BPF_JGT ? val32 : val32 - 1; - u32 true_umin = opcode == BPF_JGT ? val32 + 1 : val32; + u32 false_umax = opcode == BPF_JGT ? uval32 : uval32 - 1; + u32 true_umin = opcode == BPF_JGT ? uval32 + 1 : uval32; false_reg->u32_max_value = min(false_reg->u32_max_value, false_umax); true_reg->u32_min_value = max(true_reg->u32_min_value, true_umin); } else { - u64 false_umax = opcode == BPF_JGT ? val : val - 1; - u64 true_umin = opcode == BPF_JGT ? val + 1 : val; + u64 false_umax = opcode == BPF_JGT ? uval : uval - 1; + u64 true_umin = opcode == BPF_JGT ? uval + 1 : uval; false_reg->umax_value = min(false_reg->umax_value, false_umax); true_reg->umin_value = max(true_reg->umin_value, true_umin); @@ -14591,16 +14509,16 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JLT: { if (is_jmp32) { - u32 false_umin = opcode == BPF_JLT ? val32 : val32 + 1; - u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32; + u32 false_umin = opcode == BPF_JLT ? uval32 : uval32 + 1; + u32 true_umax = opcode == BPF_JLT ? uval32 - 1 : uval32; false_reg->u32_min_value = max(false_reg->u32_min_value, false_umin); true_reg->u32_max_value = min(true_reg->u32_max_value, true_umax); } else { - u64 false_umin = opcode == BPF_JLT ? val : val + 1; - u64 true_umax = opcode == BPF_JLT ? val - 1 : val; + u64 false_umin = opcode == BPF_JLT ? uval : uval + 1; + u64 true_umax = opcode == BPF_JLT ? uval - 1 : uval; false_reg->umin_value = max(false_reg->umin_value, false_umin); true_reg->umax_value = min(true_reg->umax_value, true_umax); @@ -14649,7 +14567,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, */ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, - u64 val, u32 val32, + u64 uval, u32 uval32, u8 opcode, bool is_jmp32) { opcode = flip_opcode(opcode); @@ -14657,7 +14575,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, * BPF_JA, can't get here. */ if (opcode) - reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32); + reg_set_min_max(true_reg, false_reg, uval, uval32, opcode, is_jmp32); } /* Regs are known to be equal, so intersect their min/max/var_off */ -- cgit From 811476e9cc578cb6c776627ac069dc45a8431791 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:58 -0700 Subject: bpf: prepare reg_set_min_max for second set of registers Similarly to is_branch_taken()-related refactorings, start preparing reg_set_min_max() to handle more generic case of two non-const registers. 
Start with renaming arguments to accommodate later addition of second register as an input argument. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-17-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 80 +++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c77cca5c4461..40ed261d3489 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14405,25 +14405,25 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg * simply doing a BPF_K check. * In JEQ/JNE cases we also adjust the var_off values. */ -static void reg_set_min_max(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, +static void reg_set_min_max(struct bpf_reg_state *true_reg1, + struct bpf_reg_state *false_reg1, u64 uval, u32 uval32, u8 opcode, bool is_jmp32) { - struct tnum false_32off = tnum_subreg(false_reg->var_off); - struct tnum false_64off = false_reg->var_off; - struct tnum true_32off = tnum_subreg(true_reg->var_off); - struct tnum true_64off = true_reg->var_off; + struct tnum false_32off = tnum_subreg(false_reg1->var_off); + struct tnum false_64off = false_reg1->var_off; + struct tnum true_32off = tnum_subreg(true_reg1->var_off); + struct tnum true_64off = true_reg1->var_off; s64 sval = (s64)uval; s32 sval32 = (s32)uval32; /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into * the same object, but we don't bother with that. - * Since false_reg and true_reg have the same type by construction, we + * Since false_reg1 and true_reg1 have the same type by construction, we * only need to check one of them for pointerness. */ - if (__is_pointer_value(false, false_reg)) + if (__is_pointer_value(false, false_reg1)) return; switch (opcode) { @@ -14438,20 +14438,20 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, */ case BPF_JEQ: if (is_jmp32) { - __mark_reg32_known(true_reg, uval32); - true_32off = tnum_subreg(true_reg->var_off); + __mark_reg32_known(true_reg1, uval32); + true_32off = tnum_subreg(true_reg1->var_off); } else { - ___mark_reg_known(true_reg, uval); - true_64off = true_reg->var_off; + ___mark_reg_known(true_reg1, uval); + true_64off = true_reg1->var_off; } break; case BPF_JNE: if (is_jmp32) { - __mark_reg32_known(false_reg, uval32); - false_32off = tnum_subreg(false_reg->var_off); + __mark_reg32_known(false_reg1, uval32); + false_32off = tnum_subreg(false_reg1->var_off); } else { - ___mark_reg_known(false_reg, uval); - false_64off = false_reg->var_off; + ___mark_reg_known(false_reg1, uval); + false_64off = false_reg1->var_off; } break; case BPF_JSET: @@ -14474,16 +14474,16 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, u32 false_umax = opcode == BPF_JGT ? uval32 : uval32 - 1; u32 true_umin = opcode == BPF_JGT ? uval32 + 1 : uval32; - false_reg->u32_max_value = min(false_reg->u32_max_value, + false_reg1->u32_max_value = min(false_reg1->u32_max_value, false_umax); - true_reg->u32_min_value = max(true_reg->u32_min_value, + true_reg1->u32_min_value = max(true_reg1->u32_min_value, true_umin); } else { u64 false_umax = opcode == BPF_JGT ? uval : uval - 1; u64 true_umin = opcode == BPF_JGT ? 
uval + 1 : uval; - false_reg->umax_value = min(false_reg->umax_value, false_umax); - true_reg->umin_value = max(true_reg->umin_value, true_umin); + false_reg1->umax_value = min(false_reg1->umax_value, false_umax); + true_reg1->umin_value = max(true_reg1->umin_value, true_umin); } break; } @@ -14494,14 +14494,14 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1; s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32; - false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax); - true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin); + false_reg1->s32_max_value = min(false_reg1->s32_max_value, false_smax); + true_reg1->s32_min_value = max(true_reg1->s32_min_value, true_smin); } else { s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; - false_reg->smax_value = min(false_reg->smax_value, false_smax); - true_reg->smin_value = max(true_reg->smin_value, true_smin); + false_reg1->smax_value = min(false_reg1->smax_value, false_smax); + true_reg1->smin_value = max(true_reg1->smin_value, true_smin); } break; } @@ -14512,16 +14512,16 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, u32 false_umin = opcode == BPF_JLT ? uval32 : uval32 + 1; u32 true_umax = opcode == BPF_JLT ? uval32 - 1 : uval32; - false_reg->u32_min_value = max(false_reg->u32_min_value, + false_reg1->u32_min_value = max(false_reg1->u32_min_value, false_umin); - true_reg->u32_max_value = min(true_reg->u32_max_value, + true_reg1->u32_max_value = min(true_reg1->u32_max_value, true_umax); } else { u64 false_umin = opcode == BPF_JLT ? uval : uval + 1; u64 true_umax = opcode == BPF_JLT ? uval - 1 : uval; - false_reg->umin_value = max(false_reg->umin_value, false_umin); - true_reg->umax_value = min(true_reg->umax_value, true_umax); + false_reg1->umin_value = max(false_reg1->umin_value, false_umin); + true_reg1->umax_value = min(true_reg1->umax_value, true_umax); } break; } @@ -14532,14 +14532,14 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1; s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32; - false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin); - true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax); + false_reg1->s32_min_value = max(false_reg1->s32_min_value, false_smin); + true_reg1->s32_max_value = min(true_reg1->s32_max_value, true_smax); } else { s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; s64 true_smax = opcode == BPF_JSLT ? 
sval - 1 : sval; - false_reg->smin_value = max(false_reg->smin_value, false_smin); - true_reg->smax_value = min(true_reg->smax_value, true_smax); + false_reg1->smin_value = max(false_reg1->smin_value, false_smin); + true_reg1->smax_value = min(true_reg1->smax_value, true_smax); } break; } @@ -14548,17 +14548,17 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg1, } if (is_jmp32) { - false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off), + false_reg1->var_off = tnum_or(tnum_clear_subreg(false_64off), tnum_subreg(false_32off)); - true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off), + true_reg1->var_off = tnum_or(tnum_clear_subreg(true_64off), tnum_subreg(true_32off)); - reg_bounds_sync(false_reg); - reg_bounds_sync(true_reg); + reg_bounds_sync(false_reg1); + reg_bounds_sync(true_reg1); } else { - false_reg->var_off = false_64off; - true_reg->var_off = true_64off; - reg_bounds_sync(false_reg); - reg_bounds_sync(true_reg); + false_reg1->var_off = false_64off; + true_reg1->var_off = true_64off; + reg_bounds_sync(false_reg1); + reg_bounds_sync(true_reg1); } } -- cgit From 4621202adc5bc0d1006af37fe8b9aca131387d3c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Nov 2023 20:37:59 -0700 Subject: bpf: generalize reg_set_min_max() to handle two sets of two registers Change reg_set_min_max() to take FALSE/TRUE sets of two registers each, instead of assuming that we are always comparing to a constant. For now we still assume that right-hand side registers are constants (and make sure that's the case by swapping src/dst regs, if necessary), but subsequent patches will remove this limitation. reg_set_min_max() is now called unconditionally for any register comparison, so that might include pointer vs pointer. This makes it consistent with is_branch_taken() generality. But we currently only support adjustments based on SCALAR vs SCALAR comparisons, so reg_set_min_max() has to guard itself against pointers. Taking two by two registers allows us to further unify and simplify check_cond_jmp_op() logic. We utilize a fake register for the BPF_K conditional jump case, just like with the is_branch_taken() part. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231102033759.2541186-18-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 131 +++++++++++++++++++++----------------------------- 1 file changed, 56 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 40ed261d3489..e801c50d3857 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14400,32 +14400,50 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32); } -/* Adjusts the register min/max values in the case that the dst_reg is the - * variable register that we are working on, and src_reg is a constant or we're - * simply doing a BPF_K check. - * In JEQ/JNE cases we also adjust the var_off values. +/* Adjusts the register min/max values in the case that the dst_reg and + * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K + * check, in which case we have a fake SCALAR_VALUE representing insn->imm). + * Technically we can do similar adjustments for pointers to the same object, + * but we don't support that right now.
*/ static void reg_set_min_max(struct bpf_reg_state *true_reg1, + struct bpf_reg_state *true_reg2, struct bpf_reg_state *false_reg1, - u64 uval, u32 uval32, + struct bpf_reg_state *false_reg2, u8 opcode, bool is_jmp32) { - struct tnum false_32off = tnum_subreg(false_reg1->var_off); - struct tnum false_64off = false_reg1->var_off; - struct tnum true_32off = tnum_subreg(true_reg1->var_off); - struct tnum true_64off = true_reg1->var_off; - s64 sval = (s64)uval; - s32 sval32 = (s32)uval32; - - /* If the dst_reg is a pointer, we can't learn anything about its - * variable offset from the compare (unless src_reg were a pointer into - * the same object, but we don't bother with that. - * Since false_reg1 and true_reg1 have the same type by construction, we - * only need to check one of them for pointerness. + struct tnum false_32off, false_64off; + struct tnum true_32off, true_64off; + u64 uval; + u32 uval32; + s64 sval; + s32 sval32; + + /* If either register is a pointer, we can't learn anything about its + * variable offset from the compare (unless they were a pointer into + * the same object, but we don't bother with that). */ - if (__is_pointer_value(false, false_reg1)) + if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) + return; + + /* we expect right-hand registers (src ones) to be constants, for now */ + if (!is_reg_const(false_reg2, is_jmp32)) { + opcode = flip_opcode(opcode); + swap(true_reg1, true_reg2); + swap(false_reg1, false_reg2); + } + if (!is_reg_const(false_reg2, is_jmp32)) return; + false_32off = tnum_subreg(false_reg1->var_off); + false_64off = false_reg1->var_off; + true_32off = tnum_subreg(true_reg1->var_off); + true_64off = true_reg1->var_off; + uval = false_reg2->var_off.value; + uval32 = (u32)tnum_subreg(false_reg2->var_off).value; + sval = (s64)uval; + sval32 = (s32)uval32; + switch (opcode) { /* JEQ/JNE comparison doesn't change the register equivalence. * @@ -14562,22 +14580,6 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg1, } } -/* Same as above, but for the case that dst_reg holds a constant and src_reg is - * the variable reg. - */ -static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, - u64 uval, u32 uval32, - u8 opcode, bool is_jmp32) -{ - opcode = flip_opcode(opcode); - /* This uses zero as "not present in table"; luckily the zero opcode, - * BPF_JA, can't get here. - */ - if (opcode) - reg_set_min_max(true_reg, false_reg, uval, uval32, opcode, is_jmp32); -} - /* Regs are known to be equal, so intersect their min/max/var_off */ static void __reg_combine_min_max(struct bpf_reg_state *src_reg, struct bpf_reg_state *dst_reg) @@ -14902,53 +14904,32 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EFAULT; other_branch_regs = other_branch->frame[other_branch->curframe]->regs; - /* detect if we are comparing against a constant value so we can adjust - * our min/max values for our dst register. - * this is only legit if both are scalars (or pointers to the same - * object, I suppose, see the PTR_MAYBE_NULL related if block below), - * because otherwise the different base pointers mean the offsets aren't - * comparable. 
- */ if (BPF_SRC(insn->code) == BPF_X) { - struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + reg_set_min_max(&other_branch_regs[insn->dst_reg], + &other_branch_regs[insn->src_reg], + dst_reg, src_reg, opcode, is_jmp32); if (dst_reg->type == SCALAR_VALUE && - src_reg->type == SCALAR_VALUE) { - if (tnum_is_const(src_reg->var_off) || - (is_jmp32 && - tnum_is_const(tnum_subreg(src_reg->var_off)))) - reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, - src_reg->var_off.value, - tnum_subreg(src_reg->var_off).value, - opcode, is_jmp32); - else if (tnum_is_const(dst_reg->var_off) || - (is_jmp32 && - tnum_is_const(tnum_subreg(dst_reg->var_off)))) - reg_set_min_max_inv(&other_branch_regs[insn->src_reg], - src_reg, - dst_reg->var_off.value, - tnum_subreg(dst_reg->var_off).value, - opcode, is_jmp32); - else if (!is_jmp32 && - (opcode == BPF_JEQ || opcode == BPF_JNE)) - /* Comparing for equality, we can combine knowledge */ - reg_combine_min_max(&other_branch_regs[insn->src_reg], - &other_branch_regs[insn->dst_reg], - src_reg, dst_reg, opcode); - if (src_reg->id && - !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { - find_equal_scalars(this_branch, src_reg); - find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); - } - - } - } else if (dst_reg->type == SCALAR_VALUE) { + src_reg->type == SCALAR_VALUE && + !is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE)) { + /* Comparing for equality, we can combine knowledge */ + reg_combine_min_max(&other_branch_regs[insn->src_reg], + &other_branch_regs[insn->dst_reg], + src_reg, dst_reg, opcode); + } + } else /* BPF_SRC(insn->code) == BPF_K */ { reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, insn->imm, (u32)insn->imm, - opcode, is_jmp32); + src_reg /* fake one */, + dst_reg, src_reg /* same fake one */, + opcode, is_jmp32); } + if (BPF_SRC(insn->code) == BPF_X && + src_reg->type == SCALAR_VALUE && src_reg->id && + !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { + find_equal_scalars(this_branch, src_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { find_equal_scalars(this_branch, dst_reg); -- cgit From 9b75dbeb36fcd9fc7ed51d370310d0518a387769 Mon Sep 17 00:00:00 2001 From: Florian Lehner Date: Sun, 5 Nov 2023 09:58:01 +0100 Subject: bpf, lpm: Fix check prefixlen before walking trie When looking up an element in LPM trie, the condition 'matchlen == trie->max_prefixlen' will never return true, if key->prefixlen is larger than trie->max_prefixlen. Consequently all elements in the LPM trie will be visited and no element is returned in the end. To resolve this, check key->prefixlen first before walking the LPM trie. 
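The effect of the added guard can be seen from user space; a hedged sketch, assuming libbpf and an illustrative IPv4-keyed trie (ipv4_lpm_key and lookup_oversized are made-up names):

#include <linux/types.h>
#include <bpf/bpf.h>

/* For an IPv4-keyed LPM trie, max_prefixlen is 32. Before the fix, a
 * lookup with prefixlen > 32 walked every node and still found nothing;
 * with the check it bails out immediately, with the same -ENOENT result.
 */
struct ipv4_lpm_key {
	__u32 prefixlen;
	__u32 addr;
};

static int lookup_oversized(int map_fd)
{
	struct ipv4_lpm_key key = { .prefixlen = 64, .addr = 0 };
	__u64 value;

	return bpf_map_lookup_elem(map_fd, &key, &value); /* -ENOENT */
}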
Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Signed-off-by: Florian Lehner Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231105085801.3742-1-dev@der-flo.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/lpm_trie.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 17c7e7782a1f..b32be680da6c 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -231,6 +231,9 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) struct lpm_trie_node *node, *found = NULL; struct bpf_lpm_trie_key *key = _key; + if (key->prefixlen > trie->max_prefixlen) + return NULL; + /* Start walking the trie from the root node ... */ for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held()); -- cgit From 74523c06ae20b83c5508a98af62393ac34913362 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 6 Nov 2023 20:57:23 -0800 Subject: bpf: Add __bpf_dynptr_data* for in kernel use Different types of bpf dynptr have different internal data storage. Specifically, SKB and XDP type of dynptr may have non-continuous data. Therefore, it is not always safe to directly access dynptr->data. Add __bpf_dynptr_data and __bpf_dynptr_data_rw to replace direct access to dynptr->data. Update bpf_verify_pkcs7_signature to use __bpf_dynptr_data instead of dynptr->data. Signed-off-by: Song Liu Signed-off-by: Andrii Nakryiko Acked-by: Vadim Fedorenko Link: https://lore.kernel.org/bpf/20231107045725.2278852-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/helpers.c | 19 +++++++++++++++++++ kernel/trace/bpf_trace.c | 12 ++++++++---- 3 files changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4825d3cdb29..eb84caf133df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1222,6 +1222,8 @@ enum bpf_dynptr_type { int bpf_dynptr_check_size(u32 size); u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 56b0c1f678ee..174f02a9e703 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2618,3 +2618,22 @@ static int __init kfunc_init(void) } late_initcall(kfunc_init); + +/* Get a pointer to dynptr data up to len bytes for read only access. If + * the dynptr doesn't have continuous data up to len bytes, return NULL. + */ +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) +{ + return bpf_dynptr_slice(ptr, 0, NULL, len); +} + +/* Get a pointer to dynptr data up to len bytes for read write access. If + * the dynptr doesn't have continuous data up to len bytes, or the dynptr + * is read only, return NULL. 
+ */ +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len) +{ + if (__bpf_dynptr_is_rdonly(ptr)) + return NULL; + return (void *)__bpf_dynptr_data(ptr, len); +} diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 84e8a0f6e4e0..f0b8b7c29126 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1376,6 +1376,8 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, struct bpf_dynptr_kern *sig_ptr, struct bpf_key *trusted_keyring) { + const void *data, *sig; + u32 data_len, sig_len; int ret; if (trusted_keyring->has_ref) { @@ -1392,10 +1394,12 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, return ret; } - return verify_pkcs7_signature(data_ptr->data, - __bpf_dynptr_size(data_ptr), - sig_ptr->data, - __bpf_dynptr_size(sig_ptr), + data_len = __bpf_dynptr_size(data_ptr); + data = __bpf_dynptr_data(data_ptr, data_len); + sig_len = __bpf_dynptr_size(sig_ptr); + sig = __bpf_dynptr_data(sig_ptr, sig_len); + + return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); -- cgit From 0b51940729150e807fc4b7767164e6bb6cf4f7dd Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 6 Nov 2023 20:57:24 -0800 Subject: bpf: Factor out helper check_reg_const_str() ARG_PTR_TO_CONST_STR is used to specify constant string args for BPF helpers. The logic that verifies a reg is ARG_PTR_TO_CONST_STR is implemented in check_func_arg(). As we introduce kfuncs with constant string args, it is necessary to do the same check for kfuncs (in check_kfunc_args). Factor out the logic for ARG_PTR_TO_CONST_STR to a new check_reg_const_str() so that it can be reused. check_func_arg() ensures check_reg_const_str() is only called with reg of type PTR_TO_MAP_VALUE. Add a redundant type check in check_reg_const_str() to avoid misuse in the future. Other than this redundant check, there is no change in behavior.
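For reference, what the factored-out checks enforce from the BPF program side; a hedged sketch using bpf_snprintf(), whose format argument is ARG_PTR_TO_CONST_STR (section and variable names here are illustrative):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* The verifier demands that `fmt` live in a read-only map (a global
 * `const` array lands in .rodata), sit at a constant offset, and be
 * NUL-terminated - the conditions check_reg_const_str() verifies.
 */
const char fmt[] = "pid=%d";

SEC("tracepoint/syscalls/sys_enter_getpid")
int show_pid(void *ctx)
{
	char buf[32];
	__u64 pid = bpf_get_current_pid_tgid() >> 32;

	bpf_snprintf(buf, sizeof(buf), fmt, &pid, sizeof(pid));
	return 0;
}

char LICENSE[] SEC("license") = "GPL";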
Signed-off-by: Song Liu Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Vadim Fedorenko Link: https://lore.kernel.org/bpf/20231107045725.2278852-3-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 85 +++++++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e801c50d3857..637f34231633 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8725,6 +8725,54 @@ static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, return state->stack[spi].spilled_ptr.dynptr.type; } +static int check_reg_const_str(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno) +{ + struct bpf_map *map = reg->map_ptr; + int err; + int map_off; + u64 map_addr; + char *str_ptr; + + if (reg->type != PTR_TO_MAP_VALUE) + return -EINVAL; + + if (!bpf_map_is_rdonly(map)) { + verbose(env, "R%d does not point to a readonly map'\n", regno); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d is not a constant address'\n", regno); + return -EACCES; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + return -EACCES; + } + + err = check_map_access(env, regno, reg->off, + map->value_size - reg->off, false, + ACCESS_HELPER); + if (err) + return err; + + map_off = reg->off + reg->var_off.value; + err = map->ops->map_direct_value_addr(map, &map_addr, map_off); + if (err) { + verbose(env, "direct value access on string failed\n"); + return err; + } + + str_ptr = (char *)(long)(map_addr); + if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) { + verbose(env, "string is not zero-terminated\n"); + return -EINVAL; + } + return 0; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn, @@ -8969,44 +9017,9 @@ skip_type_check: } case ARG_PTR_TO_CONST_STR: { - struct bpf_map *map = reg->map_ptr; - int map_off; - u64 map_addr; - char *str_ptr; - - if (!bpf_map_is_rdonly(map)) { - verbose(env, "R%d does not point to a readonly map'\n", regno); - return -EACCES; - } - - if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d is not a constant address'\n", regno); - return -EACCES; - } - - if (!map->ops->map_direct_value_addr) { - verbose(env, "no direct value access support for this map type\n"); - return -EACCES; - } - - err = check_map_access(env, regno, reg->off, - map->value_size - reg->off, false, - ACCESS_HELPER); + err = check_reg_const_str(env, reg, regno); if (err) return err; - - map_off = reg->off + reg->var_off.value; - err = map->ops->map_direct_value_addr(map, &map_addr, map_off); - if (err) { - verbose(env, "direct value access on string failed\n"); - return err; - } - - str_ptr = (char *)(long)(map_addr); - if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) { - verbose(env, "string is not zero-terminated\n"); - return -EINVAL; - } break; } case ARG_PTR_TO_KPTR: -- cgit From 045edee19d591e59ed53772bf6dfc9b1ed9577eb Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 6 Nov 2023 20:57:25 -0800 Subject: bpf: Introduce KF_ARG_PTR_TO_CONST_STR Similar to ARG_PTR_TO_CONST_STR for BPF helpers, KF_ARG_PTR_TO_CONST_STR specifies kfunc args that point to const strings. Annotation "__str" is used to specify kfunc arg of type KF_ARG_PTR_TO_CONST_STR. Also, add documentation for the "__str" annotation. 
bpf_get_file_xattr() will be the first kfunc that uses this type. Signed-off-by: Song Liu Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Vadim Fedorenko Link: https://lore.kernel.org/bpf/20231107045725.2278852-4-song@kernel.org Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 24 ++++++++++++++++++++++++ kernel/bpf/verifier.c | 19 +++++++++++++++++++ 2 files changed, 43 insertions(+) (limited to 'kernel') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 723408e399ab..7985c6615f3c 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -135,6 +135,30 @@ Either way, the returned buffer is either NULL, or of size buffer_szk. Without t annotation, the verifier will reject the program if a null pointer is passed in with a nonzero size. +2.2.5 __str Annotation +---------------------------- +This annotation is used to indicate that the argument is a constant string. + +An example is given below:: + + __bpf_kfunc bpf_get_file_xattr(..., const char *name__str, ...) + { + ... + } + +In this case, ``bpf_get_file_xattr()`` can be called as:: + + bpf_get_file_xattr(..., "xattr_name", ...); + +Or:: + + const char name[] = "xattr_name"; /* This need to be global */ + int BPF_PROG(...) + { + ... + bpf_get_file_xattr(..., name, ...); + ... + } .. _BPF_kfunc_nodef: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 637f34231633..9276e0abcb4b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10810,6 +10810,11 @@ static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param return __kfunc_param_match_suffix(btf, arg, "__nullable"); } +static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__str"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -10953,6 +10958,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_RB_ROOT, KF_ARG_PTR_TO_RB_NODE, KF_ARG_PTR_TO_NULL, + KF_ARG_PTR_TO_CONST_STR, }; enum special_kfunc_type { @@ -11103,6 +11109,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno])) return KF_ARG_PTR_TO_RB_NODE; + if (is_kfunc_arg_const_str(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_CONST_STR; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -11734,6 +11743,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_MEM_SIZE: case KF_ARG_PTR_TO_CALLBACK: case KF_ARG_PTR_TO_REFCOUNTED_KPTR: + case KF_ARG_PTR_TO_CONST_STR: /* Trusted by default */ break; default: @@ -12005,6 +12015,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ meta->arg_btf = reg->btf; meta->arg_btf_id = reg->btf_id; break; + case KF_ARG_PTR_TO_CONST_STR: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a const string\n", i); + return -EINVAL; + } + ret = check_reg_const_str(env, reg, regno); + if (ret) + return ret; + break; } } -- cgit From 82ce364c6087e31ff9837380a4641a856284064c Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Wed, 8 Nov 2023 22:00:41 +0800 Subject: bpf: replace register_is_const() with is_reg_const() The addition of is_reg_const() in commit 171de12646d2 ("bpf: generalize is_branch_taken to handle all 
conditional jumps in one place") has made the register_is_const() redundant. Given that the former has more functionality, plus the fact that the latter is only used in one place, replace register_is_const() with is_reg_const(), and remove the definition of register_is_const. This requires moving the definition of is_reg_const() further up. And since the comment of reg_const_value() references is_reg_const(), move it up as well. Signed-off-by: Shung-Hsi Yu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231108140043.12282-1-shung-hsi.yu@suse.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9276e0abcb4b..993e4677bbe9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4690,9 +4690,17 @@ static bool register_is_null(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); } -static bool register_is_const(struct bpf_reg_state *reg) +/* check if register is a constant scalar value */ +static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32) +{ + return reg->type == SCALAR_VALUE && + tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off); +} + +/* assuming is_reg_const() is true, return constant value of a register */ +static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32) { - return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); + return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value; } static bool __is_scalar_unbounded(struct bpf_reg_state *reg) @@ -10050,7 +10058,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, val = reg->var_off.value; max = map->max_entries; - if (!(register_is_const(reg) && val < max)) { + if (!(is_reg_const(reg, false) && val < max)) { bpf_map_key_store(aux, BPF_MAP_KEY_POISON); return 0; } @@ -14220,19 +14228,6 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, })); } -/* check if register is a constant scalar value */ -static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32) -{ - return reg->type == SCALAR_VALUE && - tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off); -} - -/* assuming is_reg_const() is true, return constant value of a register */ -static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32) -{ - return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value; -} - /* * <reg1> <op> <reg2>, currently assuming reg2 is a constant */ -- cgit From 1500a5d9f49cb66906d3ea1c9158df25cc41dd40 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 7 Nov 2023 00:56:34 -0800 Subject: bpf: Add KF_RCU flag to bpf_refcount_acquire_impl Refcounted local kptrs are kptrs to user-defined types with a bpf_refcount field. Recent commits ([0], [1]) modified the lifetime of refcounted local kptrs such that the underlying memory is not reused until RCU grace period has elapsed. Separately, verification of bpf_refcount_acquire calls currently succeeds for MAYBE_NULL non-owning reference input, which is a problem as bpf_refcount_acquire_impl has no handling for this case. This patch takes advantage of aforementioned lifetime changes to tag bpf_refcount_acquire_impl kfunc KF_RCU, thereby preventing MAYBE_NULL input to the kfunc. The KF_RCU flag applies to all kfunc params; it's fine for it to apply to the void *meta__ign param as that's populated by the verifier and is tagged __ign regardless.
[0]: commit 7e26cd12ad1c ("bpf: Use bpf_mem_free_rcu when bpf_obj_dropping refcounted nodes") is the actual change to allocation behavior [1]: commit 0816b8c6bf7f ("bpf: Consider non-owning refs to refcounted nodes RCU protected") modified verifier understanding of refcounted local kptrs to match [0]'s changes Signed-off-by: Dave Marchevsky Fixes: 7c50b1cb76ac ("bpf: Add bpf_refcount_acquire kfunc") Link: https://lore.kernel.org/r/20231107085639.3016113-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 174f02a9e703..b84d8cb8d239 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2520,7 +2520,7 @@ BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) -- cgit From 649924b76ab151a96bdd22a97a993fb0421f134c Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 7 Nov 2023 00:56:36 -0800 Subject: bpf: Use bpf_mem_free_rcu when bpf_obj_dropping non-refcounted nodes The use of bpf_mem_free_rcu to free refcounted local kptrs was added in commit 7e26cd12ad1c ("bpf: Use bpf_mem_free_rcu when bpf_obj_dropping refcounted nodes"). In the cover letter for the series containing that patch [0] I commented: Perhaps it makes sense to move to mem_free_rcu for _all_ non-owning refs in the future, not just refcounted. This might allow custom non-owning ref lifetime + invalidation logic to be entirely subsumed by MEM_RCU handling. IMO this needs a bit more thought and should be tackled outside of a fix series, so it's not attempted here. It's time to start moving in the "non-owning refs have MEM_RCU lifetime" direction. As mentioned in that comment, using bpf_mem_free_rcu for all local kptrs - not just refcounted - is necessarily the first step towards that goal. This patch does so. After this patch the memory pointed to by all local kptrs will not be reused until RCU grace period elapses. The verifier's understanding of non-owning ref validity and the clobbering logic it uses to enforce that understanding are not changed here, that'll happen gradually in future work, including further patches in the series.
[0]: https://lore.kernel.org/all/20230821193311.3290257-1-davemarchevsky@fb.com/ Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20231107085639.3016113-4-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b84d8cb8d239..03517db5cfb3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1937,10 +1937,7 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) ma = &bpf_global_percpu_ma; else ma = &bpf_global_ma; - if (rec && rec->refcount_off >= 0) - bpf_mem_free_rcu(ma, p); - else - bpf_mem_free(ma, p); + bpf_mem_free_rcu(ma, p); } __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) -- cgit From 790ce3cfefb1b768dccd4eee324ddef0f0ce3db4 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 7 Nov 2023 00:56:37 -0800 Subject: bpf: Move GRAPH_{ROOT,NODE}_MASK macros into btf_field_type enum This refactoring patch removes the unused BPF_GRAPH_NODE_OR_ROOT btf_field_type and moves BPF_GRAPH_{NODE,ROOT} macros into the btf_field_type enum. Further patches in the series will use BPF_GRAPH_NODE, so let's move this useful definition out of btf.c. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20231107085639.3016113-5-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- kernel/bpf/btf.c | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb84caf133df..4001d11be151 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -186,8 +186,8 @@ enum btf_field_type { BPF_LIST_NODE = (1 << 6), BPF_RB_ROOT = (1 << 7), BPF_RB_NODE = (1 << 8), - BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | - BPF_RB_NODE | BPF_RB_ROOT, + BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE, + BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD, BPF_REFCOUNT = (1 << 9), }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 15d71d2986d3..63cf4128fc05 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3840,9 +3840,6 @@ end: return ERR_PTR(ret); } -#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT) -#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE) - int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) { int i; @@ -3855,13 +3852,13 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * Hence we only need to ensure that bpf_{list_head,rb_root} ownership * does not form cycles. */ - if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT)) return 0; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *meta; u32 btf_id; - if (!(rec->fields[i].type & GRAPH_ROOT_MASK)) + if (!(rec->fields[i].type & BPF_GRAPH_ROOT)) continue; btf_id = rec->fields[i].graph_root.value_btf_id; meta = btf_find_struct_meta(btf, btf_id); @@ -3873,7 +3870,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * to check ownership cycle for a type unless it's also a * node type. */ - if (!(rec->field_mask & GRAPH_NODE_MASK)) + if (!(rec->field_mask & BPF_GRAPH_NODE)) continue; /* We need to ensure ownership acyclicity among all types. The @@ -3909,7 +3906,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * - A is both an root and node. * - B is only an node. 
*/ - if (meta->record->field_mask & GRAPH_ROOT_MASK) + if (meta->record->field_mask & BPF_GRAPH_ROOT) return -ELOOP; } return 0; -- cgit From 1b12171533a9bb23cf6fba7262b479028b65e1e8 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 7 Nov 2023 00:56:38 -0800 Subject: bpf: Mark direct ld of stashed bpf_{rb,list}_node as non-owning ref This patch enables the following pattern: /* mapval contains a __kptr pointing to refcounted local kptr */ mapval = bpf_map_lookup_elem(&map, &idx); if (!mapval || !mapval->some_kptr) { /* omitted */ } p = bpf_refcount_acquire(&mapval->some_kptr); Currently this doesn't work because bpf_refcount_acquire expects an owning or non-owning ref. The verifier defines non-owning ref as a type: PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF while mapval->some_kptr is PTR_TO_BTF_ID | PTR_UNTRUSTED. It's possible to do the refcount_acquire by first bpf_kptr_xchg'ing mapval->some_kptr into a temp kptr, refcount_acquiring that, and xchg'ing back into mapval, but this is unwieldy and shouldn't be necessary. This patch modifies btf_ld_kptr_type such that user-allocated types are marked MEM_ALLOC and if those types have a bpf_{rb,list}_node they're marked NON_OWN_REF as well. Additionally, due to changes to bpf_obj_drop_impl earlier in this series, rcu_protected_object now returns true for all user-allocated types, resulting in mapval->some_kptr being marked MEM_RCU. After this patch's changes, mapval->some_kptr is now: PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU which results in it passing the non-owning ref test, and the motivating example passing verification. Future work will likely get rid of special non-owning ref lifetime logic in the verifier, at which point we'll be able to delete the NON_OWN_REF flag entirely. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20231107085639.3016113-6-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 993e4677bbe9..9ae6eae13471 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5557,10 +5557,23 @@ BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) { if (!btf_is_kernel(btf)) - return false; + return true; return btf_id_set_contains(&rcu_protected_types, btf_id); } +static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field) +{ + struct btf_struct_meta *meta; + + if (btf_is_kernel(kptr_field->kptr.btf)) + return NULL; + + meta = btf_find_struct_meta(kptr_field->kptr.btf, + kptr_field->kptr.btf_id); + + return meta ? 
meta->record : NULL; +} + static bool rcu_safe_kptr(const struct btf_field *field) { const struct btf_field_kptr *kptr = &field->kptr; @@ -5571,12 +5584,25 @@ static bool rcu_safe_kptr(const struct btf_field *field) static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field) { + struct btf_record *rec; + u32 ret; + + ret = PTR_MAYBE_NULL; if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) { - if (kptr_field->type != BPF_KPTR_PERCPU) - return PTR_MAYBE_NULL | MEM_RCU; - return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU; + ret |= MEM_RCU; + if (kptr_field->type == BPF_KPTR_PERCPU) + ret |= MEM_PERCPU; + else if (!btf_is_kernel(kptr_field->kptr.btf)) + ret |= MEM_ALLOC; + + rec = kptr_pointee_btf_record(kptr_field); + if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE)) + ret |= NON_OWN_REF; + } else { + ret |= PTR_UNTRUSTED; } - return PTR_MAYBE_NULL | PTR_UNTRUSTED; + + return ret; } static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, -- cgit From b8e3a87a627b575896e448021e5c2f8a3bc19931 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Wed, 8 Nov 2023 03:23:34 -0800 Subject: bpf: Add crosstask check to __bpf_get_stack Currently get_perf_callchain only supports user stack walking for the current task. Passing the correct *crosstask* param will return 0 frames if the task passed to __bpf_get_stack isn't the current one instead of a single incorrect frame/address. This change passes the correct *crosstask* param but also does a preemptive check in __bpf_get_stack if the task is current and returns -EOPNOTSUPP if it is not. This issue was found using bpf_get_task_stack inside a BPF iterator ("iter/task"), which iterates over all tasks. bpf_get_task_stack works fine for fetching kernel stacks but because get_perf_callchain relies on the caller to know if the requested *task* is the current one (via *crosstask*) it was failing in a confusing way. It might be possible to get user stacks for all tasks utilizing something like access_process_vm but that requires the bpf program calling bpf_get_task_stack to be sleepable and would therefore be a breaking change. Fixes: fa28dcb82a38 ("bpf: Introduce helper bpf_get_task_stack()") Signed-off-by: Jordan Rome Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231108112334.3433136-1-jordalgo@meta.com --- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/stackmap.c | 11 ++++++++++- tools/include/uapi/linux/bpf.h | 3 +++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 095ca7238ac2..7cf8bcf9f6a2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4517,6 +4517,8 @@ union bpf_attr { * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. + * Note: the user stack will only be populated if the *task* is + * the current task; all other tasks will return -EOPNOTSUPP. * To achieve this, the helper needs *task*, which is a valid * pointer to **struct task_struct**. To store the stacktrace, the * bpf program provides *buf* with a nonnegative *size*. @@ -4528,6 +4530,7 @@ union bpf_attr { * * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. + * The *task* must be the current task. * **BPF_F_USER_BUILD_ID** * Collect buildid+offset instead of ips for user stack, * only valid if **BPF_F_USER_STACK** is also specified. 
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index d6b277482085..dff7ba539701 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -388,6 +388,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, { u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; + bool crosstask = task && task != current; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; @@ -410,6 +411,14 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, if (task && user && !user_mode(regs)) goto err_fault; + /* get_perf_callchain does not support crosstask user stack walking + * but returns an empty stack instead of NULL. + */ + if (crosstask && user) { + err = -EOPNOTSUPP; + goto clear; + } + num_elem = size / elem_size; max_depth = num_elem + skip; if (sysctl_perf_event_max_stack < max_depth) @@ -421,7 +430,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, trace = get_callchain_entry_for_task(task, max_depth); else trace = get_perf_callchain(regs, 0, kernel, user, max_depth, - false, false); + crosstask, false); if (unlikely(!trace)) goto err_fault; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 095ca7238ac2..7cf8bcf9f6a2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4517,6 +4517,8 @@ union bpf_attr { * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. + * Note: the user stack will only be populated if the *task* is + * the current task; all other tasks will return -EOPNOTSUPP. * To achieve this, the helper needs *task*, which is a valid * pointer to **struct task_struct**. To store the stacktrace, the * bpf program provides *buf* with a nonnegative *size*. @@ -4528,6 +4530,7 @@ union bpf_attr { * * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. + * The *task* must be the current task. * **BPF_F_USER_BUILD_ID** * Collect buildid+offset instead of ips for user stack, * only valid if **BPF_F_USER_STACK** is also specified. -- cgit From fe28f631fa941fba583d1c4f25895284b90af671 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Oct 2023 14:25:52 -0400 Subject: workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask When the "isolcpus" boot command line option is used to add a set of isolated CPUs, those CPUs will be excluded automatically from wq_unbound_cpumask to avoid running work functions from unbound workqueues. Recently cpuset has been extended to allow the creation of partitions of isolated CPUs dynamically. To make it closer to "isolcpus" in functionality, the CPUs in those isolated cpuset partitions should be excluded from wq_unbound_cpumask as well. This can be done currently by explicitly writing to the workqueue's cpumask sysfs file after creating the isolated partitions. However, this process can be error-prone. Ideally, the cpuset code should be allowed to request the workqueue code to exclude those isolated CPUs from wq_unbound_cpumask so that this operation can be done automatically and the isolated CPUs will be returned to wq_unbound_cpumask after the destruction of the isolated cpuset partitions. This patch adds a new workqueue_unbound_exclude_cpumask() function to enable that.
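The intended calling convention can be sketched as follows (the wrapper below is hypothetical; only workqueue_unbound_exclude_cpumask() and its locking requirement come from this patch):

#include <linux/cpu.h>
#include <linux/workqueue.h>

/* Hypothetical caller: exclude the given isolated CPUs from the
 * unbound workqueue cpumask with CPU hotplug read-locked, as the
 * new interface requires. */
static int exclude_isolated_cpus(cpumask_var_t isolcpus)
{
	int ret;

	cpus_read_lock();
	ret = workqueue_unbound_exclude_cpumask(isolcpus);
	cpus_read_unlock();

	return ret;
}

Calling it again later with a smaller or empty mask should return the released CPUs to wq_unbound_cpumask, since the computation always restarts from the user-requested cpumask.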
This new function will exclude the specified isolated CPUs from wq_unbound_cpumask. To be able to restore those isolated CPUs back after the destruction of isolated cpuset partitions, a new wq_requested_unbound_cpumask is added to store the user provided unbound cpumask either from the boot command line options or from writing to the cpumask sysfs file. This new cpumask provides the basis for CPU exclusion. To enable users to understand how the wq_unbound_cpumask is being modified internally, this patch also exposes the newly introduced wq_requested_unbound_cpumask as well as a wq_isolated_cpumask to store the cpumask to be excluded from wq_unbound_cpumask as read-only sysfs files. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- kernel/workqueue.c | 91 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 84 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 24b1e5070f4d..b0b9604b76b8 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -491,7 +491,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); +extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6e578f576a6f..bd9d34eacd78 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -381,6 +381,12 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; +/* PL: user requested unbound cpumask via sysfs */ +static cpumask_var_t wq_requested_unbound_cpumask; + +/* PL: isolated cpumask to be excluded from unbound cpumask */ +static cpumask_var_t wq_isolated_cpumask; + /* for further constrain wq_unbound_cpumask by cmdline parameter*/ static struct cpumask wq_cmdline_cpumask __initdata; @@ -5839,7 +5845,7 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. */ -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) { int ret = -EINVAL; @@ -5850,6 +5856,7 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); + cpumask_copy(wq_requested_unbound_cpumask, cpumask); if (cpumask_equal(cpumask, wq_unbound_cpumask)) { ret = 0; goto out_unlock; @@ -5864,6 +5871,44 @@ out_unlock: return ret; } +/** + * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask + * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask + * + * This function can be called from cpuset code to provide a set of isolated + * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold + * either cpus_read_lock or cpus_write_lock. 
+ */ +int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) +{ + cpumask_var_t cpumask; + int ret = 0; + + if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + lockdep_assert_cpus_held(); + mutex_lock(&wq_pool_mutex); + + /* Save the current isolated cpumask & export it via sysfs */ + cpumask_copy(wq_isolated_cpumask, exclude_cpumask); + + /* + * If the operation fails, it will fall back to + * wq_requested_unbound_cpumask which is initially set to + * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten + * by any subsequent write to workqueue/cpumask sysfs file. + */ + if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask)) + cpumask_copy(cpumask, wq_requested_unbound_cpumask); + if (!cpumask_equal(cpumask, wq_unbound_cpumask)) + ret = workqueue_apply_unbound_cpumask(cpumask); + + mutex_unlock(&wq_pool_mutex); + free_cpumask_var(cpumask); + return ret; +} + static int parse_affn_scope(const char *val) { int i; @@ -6158,19 +6203,36 @@ static struct bus_type wq_subsys = { .dev_groups = wq_sysfs_groups, }; -static ssize_t wq_unbound_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t __wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf, cpumask_var_t mask) { int written; mutex_lock(&wq_pool_mutex); - written = scnprintf(buf, PAGE_SIZE, "%*pb\n", - cpumask_pr_args(wq_unbound_cpumask)); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); mutex_unlock(&wq_pool_mutex); return written; } +static ssize_t wq_unbound_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); +} + +static ssize_t wq_requested_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); +} + +static ssize_t wq_isolated_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); +} + static ssize_t wq_unbound_cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -6188,9 +6250,13 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev, return ret ? 
ret : count; } -static struct device_attribute wq_sysfs_cpumask_attr = +static struct device_attribute wq_sysfs_cpumask_attrs[] = { __ATTR(cpumask, 0644, wq_unbound_cpumask_show, - wq_unbound_cpumask_store); + wq_unbound_cpumask_store), + __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL), + __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL), + __ATTR_NULL, +}; static int __init wq_sysfs_init(void) { @@ -6203,7 +6269,13 @@ static int __init wq_sysfs_init(void) dev_root = bus_get_dev_root(&wq_subsys); if (dev_root) { - err = device_create_file(dev_root, &wq_sysfs_cpumask_attr); + struct device_attribute *attr; + + for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) { + err = device_create_file(dev_root, attr); + if (err) + break; + } put_device(dev_root); } return err; @@ -6534,11 +6606,14 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); + cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit From 11e5f407b64a8fa09d1a4b336d15bd285a434c1f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Oct 2023 14:25:54 -0400 Subject: cgroup/cpuset: Keep track of CPUs in isolated partitions Add a new internal isolated_cpus mask to keep track of the CPUs that are in isolated partitions. Expose that new cpumask as a new root-only control file ".cpuset.cpus.isolated". tj: Updated patch description to reflect dropping __DEBUG__ prefix. 
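As a usage sketch (hypothetical userspace code, not part of this patch; paths assume cgroup2 mounted at /sys/fs/cgroup, and since the file carries CFTYPE_DEBUG it is only created, with a .__DEBUG__. prefix, on kernels booted with the cgroup_debug option):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static void write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd;

	/* Carve CPUs 2-3 out into an isolated partition. */
	mkdir("/sys/fs/cgroup/part1", 0755);
	write_str("/sys/fs/cgroup/part1/cpuset.cpus", "2-3");
	write_str("/sys/fs/cgroup/part1/cpuset.cpus.partition", "isolated");

	/* The root-only debug file aggregates all isolated partitions. */
	fd = open("/sys/fs/cgroup/.__DEBUG__.cpuset.cpus.isolated", O_RDONLY);
	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("isolated: %s", buf);	/* expect "2-3" */
		}
		close(fd);
	}
	return 0;
}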
Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 190 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 127 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 615daaf87f1f..19c8779798fd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -204,6 +204,11 @@ struct cpuset { */ static cpumask_var_t subpartitions_cpus; +/* + * Exclusive CPUs in isolated partitions + */ +static cpumask_var_t isolated_cpus; + /* List of remote partition root children */ static struct list_head remote_children; @@ -1317,6 +1322,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus, */ enum partition_cmd { partcmd_enable, /* Enable partition root */ + partcmd_enablei, /* Enable isolated partition root */ partcmd_disable, /* Disable partition root */ partcmd_update, /* Update parent's effective_cpus */ partcmd_invalidate, /* Make partition invalid */ @@ -1418,6 +1424,74 @@ static void reset_partition_data(struct cpuset *cs) } } +/* + * partition_xcpus_newstate - Exclusive CPUs state change + * @old_prs: old partition_root_state + * @new_prs: new partition_root_state + * @xcpus: exclusive CPUs with state change + */ +static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus) +{ + WARN_ON_ONCE(old_prs == new_prs); + if (new_prs == PRS_ISOLATED) + cpumask_or(isolated_cpus, isolated_cpus, xcpus); + else + cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); +} + +/* + * partition_xcpus_add - Add new exclusive CPUs to partition + * @new_prs: new partition_root_state + * @parent: parent cpuset + * @xcpus: exclusive CPUs to be added + * + * Remote partition if parent == NULL + */ +static void partition_xcpus_add(int new_prs, struct cpuset *parent, + struct cpumask *xcpus) +{ + WARN_ON_ONCE(new_prs < 0); + lockdep_assert_held(&callback_lock); + if (!parent) + parent = &top_cpuset; + + if (parent == &top_cpuset) + cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); + + if (new_prs != parent->partition_root_state) + partition_xcpus_newstate(parent->partition_root_state, new_prs, + xcpus); + + cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); +} + +/* + * partition_xcpus_del - Remove exclusive CPUs from partition + * @old_prs: old partition_root_state + * @parent: parent cpuset + * @xcpus: exclusive CPUs to be removed + * + * Remote partition if parent == NULL + */ +static void partition_xcpus_del(int old_prs, struct cpuset *parent, + struct cpumask *xcpus) +{ + WARN_ON_ONCE(old_prs < 0); + lockdep_assert_held(&callback_lock); + if (!parent) + parent = &top_cpuset; + + if (parent == &top_cpuset) + cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); + + if (old_prs != parent->partition_root_state) + partition_xcpus_newstate(old_prs, parent->partition_root_state, + xcpus); + + cpumask_and(xcpus, xcpus, cpu_active_mask); + cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); +} + /* * compute_effective_exclusive_cpumask - compute effective exclusive CPUs * @cs: cpuset @@ -1456,13 +1530,15 @@ static inline bool is_local_partition(struct cpuset *cs) /* * remote_partition_enable - Enable current cpuset as a remote partition root * @cs: the cpuset to update + * @new_prs: new partition_root_state * @tmp: temparary masks * Return: 1 if successful, 0 if error * * Enable the current cpuset to become a remote partition root taking CPUs * directly from the top cpuset. cpuset_mutex must be held by the caller. 
*/ -static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp) +static int remote_partition_enable(struct cpuset *cs, int new_prs, + struct tmpmasks *tmp) { /* * The user must have sysadmin privilege. @@ -1485,18 +1561,14 @@ static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp) return 0; spin_lock_irq(&callback_lock); - cpumask_andnot(top_cpuset.effective_cpus, - top_cpuset.effective_cpus, tmp->new_cpus); - cpumask_or(subpartitions_cpus, - subpartitions_cpus, tmp->new_cpus); - + partition_xcpus_add(new_prs, NULL, tmp->new_cpus); + list_add(&cs->remote_sibling, &remote_children); if (cs->use_parent_ecpus) { struct cpuset *parent = parent_cs(cs); cs->use_parent_ecpus = false; parent->child_ecpus_count--; } - list_add(&cs->remote_sibling, &remote_children); spin_unlock_irq(&callback_lock); /* @@ -1524,13 +1596,8 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); - cpumask_andnot(subpartitions_cpus, - subpartitions_cpus, tmp->new_cpus); - cpumask_and(tmp->new_cpus, - tmp->new_cpus, cpu_active_mask); - cpumask_or(top_cpuset.effective_cpus, - top_cpuset.effective_cpus, tmp->new_cpus); list_del_init(&cs->remote_sibling); + partition_xcpus_del(cs->partition_root_state, NULL, tmp->new_cpus); cs->partition_root_state = -cs->partition_root_state; if (!cs->prs_err) cs->prs_err = PERR_INVCPUS; @@ -1557,6 +1624,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, struct tmpmasks *tmp) { bool adding, deleting; + int prs = cs->partition_root_state; if (WARN_ON_ONCE(!is_remote_partition(cs))) return; @@ -1580,20 +1648,10 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, goto invalidate; spin_lock_irq(&callback_lock); - if (adding) { - cpumask_or(subpartitions_cpus, - subpartitions_cpus, tmp->addmask); - cpumask_andnot(top_cpuset.effective_cpus, - top_cpuset.effective_cpus, tmp->addmask); - } - if (deleting) { - cpumask_andnot(subpartitions_cpus, - subpartitions_cpus, tmp->delmask); - cpumask_and(tmp->delmask, - tmp->delmask, cpu_active_mask); - cpumask_or(top_cpuset.effective_cpus, - top_cpuset.effective_cpus, tmp->delmask); - } + if (adding) + partition_xcpus_add(prs, NULL, tmp->addmask); + if (deleting) + partition_xcpus_del(prs, NULL, tmp->delmask); spin_unlock_irq(&callback_lock); /* @@ -1676,11 +1734,11 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) * @tmp: Temporary addmask and delmask * Return: 0 or a partition root state error code * - * For partcmd_enable, the cpuset is being transformed from a non-partition - * root to a partition root. The effective_xcpus (cpus_allowed if effective_xcpus - * not set) mask of the given cpuset will be taken away from parent's - * effective_cpus. The function will return 0 if all the CPUs listed in - * effective_xcpus can be granted or an error code will be returned. + * For partcmd_enable*, the cpuset is being transformed from a non-partition + * root to a partition root. The effective_xcpus (cpus_allowed if + * effective_xcpus not set) mask of the given cpuset will be taken away from + * parent's effective_cpus. The function will return 0 if all the CPUs listed + * in effective_xcpus can be granted or an error code will be returned. * * For partcmd_disable, the cpuset is being transformed from a partition * root back to a non-partition root. 
Any CPUs in effective_xcpus will be @@ -1695,7 +1753,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) * * For partcmd_invalidate, the current partition will be made invalid. * - * The partcmd_enable and partcmd_disable commands are used by + * The partcmd_enable* and partcmd_disable commands are used by * update_prstate(). An error code may be returned and the caller will check * for error. * @@ -1760,7 +1818,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, nocpu = tasks_nocpu_error(parent, cs, xcpus); - if (cmd == partcmd_enable) { + if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* * Enabling partition root is not allowed if its * effective_xcpus is empty or doesn't overlap with @@ -1783,6 +1841,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, cpumask_copy(tmp->delmask, xcpus); deleting = true; subparts_delta++; + new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* * May need to add cpus to parent's effective_cpus for @@ -1792,6 +1851,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); if (adding) subparts_delta--; + new_prs = PRS_MEMBER; } else if (newmask) { /* * Empty cpumask is not allowed @@ -1940,37 +2000,24 @@ write_error: * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); - if (adding) { - if (parent == &top_cpuset) - cpumask_andnot(subpartitions_cpus, - subpartitions_cpus, tmp->addmask); - /* - * Some of the CPUs in effective_xcpus might have been offlined. - */ - cpumask_or(parent->effective_cpus, - parent->effective_cpus, tmp->addmask); - cpumask_and(parent->effective_cpus, - parent->effective_cpus, cpu_active_mask); - } - if (deleting) { - if (parent == &top_cpuset) - cpumask_or(subpartitions_cpus, - subpartitions_cpus, tmp->delmask); - cpumask_andnot(parent->effective_cpus, - parent->effective_cpus, tmp->delmask); - } - - if (is_partition_valid(parent)) { - parent->nr_subparts += subparts_delta; - WARN_ON_ONCE(parent->nr_subparts < 0); - } - if (old_prs != new_prs) { cs->partition_root_state = new_prs; if (new_prs <= 0) cs->nr_subparts = 0; } + /* + * Adding to parent's effective_cpus means deletion CPUs from cs + * and vice versa. + */ + if (adding) + partition_xcpus_del(old_prs, parent, tmp->addmask); + if (deleting) + partition_xcpus_add(new_prs, parent, tmp->delmask); + if (is_partition_valid(parent)) { + parent->nr_subparts += subparts_delta; + WARN_ON_ONCE(parent->nr_subparts < 0); + } spin_unlock_irq(&callback_lock); if ((old_prs != new_prs) && (cmd == partcmd_update)) @@ -2948,6 +2995,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) int err = PERR_NONE, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; + bool new_xcpus_state = false; if (old_prs == new_prs) return 0; @@ -2977,6 +3025,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) goto out; if (!old_prs) { + enum partition_cmd cmd = (new_prs == PRS_ROOT) + ? partcmd_enable : partcmd_enablei; + /* * cpus_allowed cannot be empty. 
*/ @@ -2985,19 +3036,18 @@ goto out; } - err = update_parent_effective_cpumask(cs, partcmd_enable, - NULL, &tmpmask); + err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); /* * If an attempt to become local partition root fails, * try to become a remote partition root instead. */ - if (err && remote_partition_enable(cs, &tmpmask)) + if (err && remote_partition_enable(cs, new_prs, &tmpmask)) err = 0; } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. */ - ; + new_xcpus_state = true; } else { /* * Switching back to member is always allowed even if it @@ -3029,6 +3079,8 @@ out: WRITE_ONCE(cs->prs_err, err); if (!is_partition_valid(cs)) reset_partition_data(cs); + else if (new_xcpus_state) + partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); /* Force update if switching back to member */ @@ -3386,6 +3438,7 @@ typedef enum { FILE_SUBPARTS_CPULIST, FILE_EXCLUSIVE_CPULIST, FILE_EFFECTIVE_XCPULIST, + FILE_ISOLATED_CPULIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, @@ -3582,6 +3635,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_SUBPARTS_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); break; + case FILE_ISOLATED_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus)); + break; default: ret = -EINVAL; } @@ -3875,6 +3931,13 @@ static struct cftype dfl_files[] = { .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, }, + { + .name = "cpus.isolated", + .seq_show = cpuset_common_seq_show, + .private = FILE_ISOLATED_CPULIST, + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, + }, + { } /* terminate */ }; @@ -4194,6 +4257,7 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); -- cgit From 72c6303acfa1008c542e093bc9f9916fb99e0323 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Oct 2023 14:25:55 -0400 Subject: cgroup/cpuset: Take isolated CPUs out of workqueue unbound cpumask To make CPUs in isolated cpuset partitions closer in isolation to the boot-time isolated CPUs specified in the "isolcpus" boot command line option, we need to take those CPUs out of the workqueue unbound cpumask so that work functions from the unbound workqueues won't run on those CPUs. Otherwise, they will interfere with the user tasks running on those isolated CPUs. With the introduction of the workqueue_unbound_exclude_cpumask() helper function in an earlier commit, those isolated CPUs can now be taken out of the workqueue unbound cpumask. This patch also updates cgroup-v2.rst to mention that isolated CPUs will be excluded from the unbound workqueue cpumask, as well as updating test_cpuset_prs.sh to verify the correctness of the new *cpuset.cpus.isolated file, if available via the cgroup_debug option.
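A quick way to observe the effect (a hypothetical check, not part of this patch; the sysfs paths are assumed from the workqueue commit earlier in this series) is to compare the workqueue cpumask files after creating an isolated partition:

#include <stdio.h>

static void show(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f)
		return;
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	fclose(f);
}

int main(void)
{
	/* CPUs in isolated partitions should appear in cpumask_isolated
	 * and be cleared from the effective unbound cpumask. */
	show("/sys/devices/virtual/workqueue/cpumask");
	show("/sys/devices/virtual/workqueue/cpumask_isolated");
	return 0;
}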
Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 10 +- kernel/cgroup/cpuset.c | 116 ++++++++++++++++++---- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 74 ++++++++++++-- 3 files changed, 166 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 3f85254f3cef..cf5651a11df8 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2358,11 +2358,11 @@ Cpuset Interface Files partition or scheduling domain. The set of exclusive CPUs is determined by the value of its "cpuset.cpus.exclusive.effective". - When set to "isolated", the CPUs in that partition will - be in an isolated state without any load balancing from the - scheduler. Tasks placed in such a partition with multiple - CPUs should be carefully distributed and bound to each of the - individual CPUs for optimal performance. + When set to "isolated", the CPUs in that partition will be in + an isolated state without any load balancing from the scheduler + and excluded from the unbound workqueues. Tasks placed in such + a partition with multiple CPUs should be carefully distributed + and bound to each of the individual CPUs for optimal performance. A partition root ("root" or "isolated") can be in one of the two possible states - valid or invalid. An invalid partition diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 19c8779798fd..1bad4007ff4b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ #include #include #include +#include DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -1444,25 +1446,31 @@ static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *x * @new_prs: new partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be added + * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static void partition_xcpus_add(int new_prs, struct cpuset *parent, +static bool partition_xcpus_add(int new_prs, struct cpuset *parent, struct cpumask *xcpus) { + bool isolcpus_updated; + WARN_ON_ONCE(new_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) parent = &top_cpuset; + if (parent == &top_cpuset) cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); - if (new_prs != parent->partition_root_state) + isolcpus_updated = (new_prs != parent->partition_root_state); + if (isolcpus_updated) partition_xcpus_newstate(parent->partition_root_state, new_prs, xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); + return isolcpus_updated; } /* @@ -1470,12 +1478,15 @@ static void partition_xcpus_add(int new_prs, struct cpuset *parent, * @old_prs: old partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be removed + * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static void partition_xcpus_del(int old_prs, struct cpuset *parent, +static bool partition_xcpus_del(int old_prs, struct cpuset *parent, struct cpumask *xcpus) { + bool isolcpus_updated; + WARN_ON_ONCE(old_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) @@ -1484,12 +1495,27 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent, if (parent == &top_cpuset) cpumask_andnot(subpartitions_cpus, 
subpartitions_cpus, xcpus); - if (old_prs != parent->partition_root_state) + isolcpus_updated = (old_prs != parent->partition_root_state); + if (isolcpus_updated) partition_xcpus_newstate(old_prs, parent->partition_root_state, xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); + return isolcpus_updated; +} + +static void update_unbound_workqueue_cpumask(bool isolcpus_updated) +{ + int ret; + + lockdep_assert_cpus_held(); + + if (!isolcpus_updated) + return; + + ret = workqueue_unbound_exclude_cpumask(isolated_cpus); + WARN_ON_ONCE(ret < 0); } /* @@ -1540,6 +1566,8 @@ static inline bool is_local_partition(struct cpuset *cs) static int remote_partition_enable(struct cpuset *cs, int new_prs, struct tmpmasks *tmp) { + bool isolcpus_updated; + /* * The user must have sysadmin privilege. */ @@ -1561,7 +1589,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, return 0; spin_lock_irq(&callback_lock); - partition_xcpus_add(new_prs, NULL, tmp->new_cpus); + isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); if (cs->use_parent_ecpus) { struct cpuset *parent = parent_cs(cs); @@ -1570,13 +1598,13 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, parent->child_ecpus_count--; } spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); - return 1; } @@ -1591,18 +1619,22 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, */ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { + bool isolcpus_updated; + compute_effective_exclusive_cpumask(cs, tmp->new_cpus); WARN_ON_ONCE(!is_remote_partition(cs)); WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); list_del_init(&cs->remote_sibling); - partition_xcpus_del(cs->partition_root_state, NULL, tmp->new_cpus); + isolcpus_updated = partition_xcpus_del(cs->partition_root_state, + NULL, tmp->new_cpus); cs->partition_root_state = -cs->partition_root_state; if (!cs->prs_err) cs->prs_err = PERR_INVCPUS; reset_partition_data(cs); spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1625,6 +1657,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, { bool adding, deleting; int prs = cs->partition_root_state; + int isolcpus_updated = 0; if (WARN_ON_ONCE(!is_remote_partition(cs))) return; @@ -1649,10 +1682,11 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, spin_lock_irq(&callback_lock); if (adding) - partition_xcpus_add(prs, NULL, tmp->addmask); + isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) - partition_xcpus_del(prs, NULL, tmp->delmask); + isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1774,6 +1808,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int part_error = PERR_NONE; /* Partition error? 
*/ int subparts_delta = 0; struct cpumask *xcpus; /* cs effective_xcpus */ + int isolcpus_updated = 0; bool nocpu; lockdep_assert_held(&cpuset_mutex); @@ -2010,15 +2045,18 @@ write_error: * and vice versa. */ if (adding) - partition_xcpus_del(old_prs, parent, tmp->addmask); + isolcpus_updated += partition_xcpus_del(old_prs, parent, + tmp->addmask); if (deleting) - partition_xcpus_add(new_prs, parent, tmp->delmask); + isolcpus_updated += partition_xcpus_add(new_prs, parent, + tmp->delmask); if (is_partition_valid(parent)) { parent->nr_subparts += subparts_delta; WARN_ON_ONCE(parent->nr_subparts < 0); } spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive(cs, new_prs); @@ -3082,6 +3120,7 @@ out: else if (new_xcpus_state) partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(new_xcpus_state); /* Force update if switching back to member */ update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); @@ -4370,6 +4409,30 @@ void cpuset_force_rebuild(void) force_rebuild = true; } +/* + * Attempt to acquire a cpus_read_lock while a hotplug operation may be in + * progress. + * Return: true if successful, false otherwise + * + * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, + * cpus_read_trylock() is used here to acquire the lock. + */ +static bool cpuset_hotplug_cpus_read_trylock(void) +{ + int retries = 0; + + while (!cpus_read_trylock()) { + /* + * CPU hotplug still in progress. Retry 5 times + * with a 10ms wait before bailing out. + */ + if (++retries > 5) + return false; + msleep(10); + } + return true; +} + /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -4386,6 +4449,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) bool cpus_updated; bool mems_updated; bool remote; + int partcmd = -1; struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -4417,11 +4481,13 @@ retry: compute_partition_effective_cpumask(cs, &new_cpus); if (remote && cpumask_empty(&new_cpus) && - partition_is_populated(cs, NULL)) { + partition_is_populated(cs, NULL) && + cpuset_hotplug_cpus_read_trylock()) { remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; cpuset_force_rebuild(); + cpus_read_unlock(); } /* @@ -4432,18 +4498,28 @@ retry: * partitions. */ if (is_local_partition(cs) && (!is_partition_valid(parent) || - tasks_nocpu_error(parent, cs, &new_cpus))) { - update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp); - compute_effective_cpumask(&new_cpus, cs, parent); - cpuset_force_rebuild(); - } + tasks_nocpu_error(parent, cs, &new_cpus))) + partcmd = partcmd_invalidate; /* * On the other hand, an invalid partition root may be transitioned * back to a regular one. */ - else if (is_partition_valid(parent) && is_partition_invalid(cs)) { - update_parent_effective_cpumask(cs, partcmd_update, NULL, tmp); - if (is_partition_valid(cs)) { + else if (is_partition_valid(parent) && is_partition_invalid(cs)) + partcmd = partcmd_update; + + /* + * cpus_read_lock needs to be held before calling + * update_parent_effective_cpumask(). To avoid circular lock + * dependency between cpuset_mutex and cpus_read_lock, + * cpus_read_trylock() is used here to acquire the lock. 
+ */ + if (partcmd >= 0) { + if (!cpuset_hotplug_cpus_read_trylock()) + goto update_tasks; + + update_parent_effective_cpumask(cs, partcmd, NULL, tmp); + cpus_read_unlock(); + if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); } diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 2b825019f806..7b7c4c2b6d85 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -232,11 +232,11 @@ TEST_MATRIX=( " C0-3:S+ C1-3:S+ C2-3 C4-5 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4-5 \ A1:P0,A2:P1,A3:P2,B1:P1 2-3" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4 \ - A1:P0,A2:P1,A3:P2,B1:P1 2-4" + A1:P0,A2:P1,A3:P2,B1:P1 2-4,2-3" " C0-3:S+ C1-3:S+ C3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:2,A3:3,B1:4 \ - A1:P0,A2:P1,A3:P2,B1:P1 2-4" + A1:P0,A2:P1,A3:P2,B1:P1 2-4,3" " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X4:P1 . 0 A1:0-1,A2:2-3,A3:4 \ - A1:P0,A2:P2,A3:P1 2-4" + A1:P0,A2:P2,A3:P1 2-4,2-3" " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \ . . X5 . . 0 A1:0-4,A2:1-4,A3:2-4 \ A1:P0,A2:P-2,A3:P-1" @@ -248,7 +248,7 @@ TEST_MATRIX=( " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 . 0 A1:0-1,A2:1,A3:3 A1:P0,A3:P2 2-3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 O2=1 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3" " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 P2:O3=0 . 0 A1:0-2,A2:1-2,A3: A1:P0,A3:P2 3" - " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 T:P2:O3=0 . 0 A1:0-2,A2:1-2,A3:1-2 A1:P0,A3:P-2 3" + " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 T:P2:O3=0 . 0 A1:0-2,A2:1-2,A3:1-2 A1:P0,A3:P-2 3," # An invalidated remote partition cannot self-recover from hotplug " C0-3:S+ C1-3:S+ C2 . X2-3 X2-3 T:P2:O2=0 O2=1 0 A1:0-3,A2:1-3,A3:2 A1:P0,A3:P-2" @@ -376,7 +376,7 @@ write_cpu_online() } fi echo $VAL > $CPUFILE - pause 0.01 + pause 0.05 } # @@ -508,12 +508,14 @@ dump_states() XECPUS=$DIR/cpuset.cpus.exclusive.effective PRS=$DIR/cpuset.cpus.partition PCPUS=$DIR/.__DEBUG__.cpuset.cpus.subpartitions + ISCPUS=$DIR/.__DEBUG__.cpuset.cpus.isolated [[ -e $CPUS ]] && echo "$CPUS: $(cat $CPUS)" [[ -e $XCPUS ]] && echo "$XCPUS: $(cat $XCPUS)" [[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)" [[ -e $XECPUS ]] && echo "$XECPUS: $(cat $XECPUS)" [[ -e $PRS ]] && echo "$PRS: $(cat $PRS)" [[ -e $PCPUS ]] && echo "$PCPUS: $(cat $PCPUS)" + [[ -e $ISCPUS ]] && echo "$ISCPUS: $(cat $ISCPUS)" done } @@ -591,11 +593,17 @@ check_cgroup_states() # # Get isolated (including offline) CPUs by looking at -# /sys/kernel/debug/sched/domains and compare that with the expected value. +# /sys/kernel/debug/sched/domains and *cpuset.cpus.isolated control file, +# if available, and compare that with the expected value. # -# Note that a sched domain of just 1 CPU will be considered isolated. +# Note that isolated CPUs from the sched/domains context include offline +# CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may +# not be included in the *cpuset.cpus.isolated control file which contains +# only CPUs in isolated partitions. # -# $1 - expected isolated cpu list +# $1 - expected isolated cpu list(s) {,} +# - expected sched/domains value +# - *cpuset.cpus.isolated value = if not defined # check_isolcpus() { @@ -603,8 +611,38 @@ check_isolcpus() ISOLCPUS= LASTISOLCPU= SCHED_DOMAINS=/sys/kernel/debug/sched/domains + ISCPUS=${CGROUP2}/.__DEBUG__.cpuset.cpus.isolated + if [[ $EXPECT_VAL = . 
]] then EXPECT_VAL= EXPECT_VAL2= elif [[ $(expr $EXPECT_VAL : ".*,.*") > 0 ]] then set -- $(echo $EXPECT_VAL | sed -e "s/,/ /g") EXPECT_VAL=$1 EXPECT_VAL2=$2 else EXPECT_VAL2=$EXPECT_VAL fi + + # + # Check the debug isolated cpumask, if present + # + [[ -f $ISCPUS ]] && { + ISOLCPUS=$(cat $ISCPUS) + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { + # Take a 50ms pause and try again + pause 0.05 + ISOLCPUS=$(cat $ISCPUS) + } + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1 + ISOLCPUS= + } + + # + # Use the sched domain in debugfs to check isolated CPUs, if available + # [[ -d $SCHED_DOMAINS ]] || return 0 - [[ $EXPECT_VAL = . ]] && EXPECT_VAL= for ((CPU=0; CPU < $NR_CPUS; CPU++)) do @@ -648,6 +686,22 @@ test_fail() exit 1 } +# +# Check to see if there are unexpected isolated CPUs left +# +null_isolcpus_check() +{ + [[ $VERBOSE -gt 0 ]] || return 0 + pause 0.02 + check_isolcpus "." + if [[ $? -ne 0 ]] + then + echo "Unexpected isolated CPUs: $ISOLCPUS" + dump_states + exit 1 + fi +} + # # Run cpuset state transition test # $1 - test matrix name @@ -733,6 +787,7 @@ run_state_test() echo "Effective cpus changed to $NEWLIST after test $I!" exit 1 } + null_isolcpus_check [[ $VERBOSE -gt 0 ]] && echo "Test $I done." ((I++)) done @@ -802,6 +857,7 @@ test_isolated() console_msg "Cleaning up" echo $$ > $CGROUP2/cgroup.procs [[ -d A1 ]] && rmdir A1 + null_isolcpus_check } -- cgit From e76d28bdf9ba5388b8c4835a5199dc427b603188 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 3 Nov 2023 23:13:01 -0400 Subject: cgroup/rstat: Reduce cpu_lock hold time in cgroup_rstat_flush_locked() When cgroup_rstat_updated() isn't being called concurrently with cgroup_rstat_flush_locked(), its run time is pretty short. When both are called concurrently, the cgroup_rstat_updated() run time can spike to a pretty high value due to high cpu_lock hold time in cgroup_rstat_flush_locked(). This can be problematic if the task calling cgroup_rstat_updated() is a realtime task running on an isolated CPU with a strict latency requirement. The cgroup_rstat_updated() call can happen when there is a page fault even though the task is running in user space most of the time. The percpu cpu_lock is used to protect the update tree - updated_next and updated_children. This protection is only needed when cgroup_rstat_cpu_pop_updated() is being called. The subsequent flushing operation which can take a much longer time does not need that protection as it is already protected by cgroup_rstat_lock. To reduce the cpu_lock hold time, we need to perform all the cgroup_rstat_cpu_pop_updated() calls up front and release the lock before doing any flushing. This patch adds a new cgroup_rstat_updated_list() function to return a singly linked list of cgroups to be flushed. Some instrumentation code is added to measure the cpu_lock hold time from right after lock acquisition to right after the lock is released. A parallel kernel build on a 2-socket x86-64 server is used as the benchmarking tool for measuring the lock hold time. The maximum cpu_lock hold times before and after the patch are 100us and 29us respectively. So the worst-case time is reduced to about 30% of the original. However, there may be some OS or hardware noise like NMI or SMI in the test system that can worsen the worst-case value. Such noise is usually tuned out in a real production environment to get a better result. OTOH, the lock hold time frequency distribution should give a better idea of the performance benefit of the patch.
Below are the frequency distributions before and after the patch:

  Hold time   Before patch   After patch
  ---------   ------------   -----------
   0-01 us       804,139     13,738,708
  01-05 us     9,772,767      1,177,194
  05-10 us     4,595,028          4,984
  10-15 us       303,481          3,562
  15-20 us        78,971          1,314
  20-25 us        24,583             18
  25-30 us         6,908             12
  30-40 us         8,015
  40-50 us         2,192
  50-60 us           316
  60-70 us            43
  70-80 us             7
  80-90 us             2
    >90 us             3

Signed-off-by: Waiman Long Reviewed-by: Yosry Ahmed Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 7 +++++++ kernel/cgroup/rstat.c | 43 ++++++++++++++++++++++++++++--------------- 2 files changed, 35 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4caab0c6b361..37518436cfe7 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -496,6 +496,13 @@ struct cgroup { struct cgroup_rstat_cpu __percpu *rstat_cpu; struct list_head rstat_css_list; + /* + * A singly-linked list of cgroup structures to be rstat flushed. + * This is a scratch field to be used exclusively by + * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock. + */ + struct cgroup *rstat_flush_next; + /* cgroup basic resource statistics */ struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d80d7a608141..1f300bf4dc40 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -145,6 +145,32 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, return pos; } +/* Return a list of updated cgroups to be flushed */ +static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); + struct cgroup *head, *tail, *next; + unsigned long flags; + + /* + * The _irqsave() is needed because cgroup_rstat_lock is + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring + * this lock with the _irq() suffix only disables interrupts on + * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables + * interrupts on both configurations. The _irqsave() ensures + * that interrupts are always disabled and later restored. + */ + raw_spin_lock_irqsave(cpu_lock, flags); + head = tail = cgroup_rstat_cpu_pop_updated(NULL, root, cpu); + while (tail) { + next = cgroup_rstat_cpu_pop_updated(tail, root, cpu); + tail->rstat_flush_next = next; + tail = next; + } + raw_spin_unlock_irqrestore(cpu_lock, flags); + return head; +} + /* * A hook for bpf stat collectors to attach to and flush their stats. * Together with providing bpf kfuncs for cgroup_rstat_updated() and @@ -179,21 +205,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) lockdep_assert_held(&cgroup_rstat_lock); for_each_possible_cpu(cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, - cpu); - struct cgroup *pos = NULL; - unsigned long flags; + struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); - /* - * The _irqsave() is needed because cgroup_rstat_lock is - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring - * this lock with the _irq() suffix only disables interrupts on - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables - * interrupts on both configurations. The _irqsave() ensures - * that interrupts are always disabled and later restored.
- */ - raw_spin_lock_irqsave(cpu_lock, flags); - while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { + for (; pos; pos = pos->rstat_flush_next) { struct cgroup_subsys_state *css; cgroup_base_stat_flush(pos, cpu); @@ -205,7 +219,6 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - raw_spin_unlock_irqrestore(cpu_lock, flags); /* play nice and yield if necessary */ if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { -- cgit From 022732e3d846e197539712e51ecada90ded0572a Mon Sep 17 00:00:00 2001 From: Chris Riches Date: Wed, 18 Oct 2023 09:23:51 +0000 Subject: audit: Send netlink ACK before setting connection in auditd_set When auditd_set sets the auditd_conn pointer, audit messages can immediately be put on the socket by other kernel threads. If the backlog is large or the rate is high, this can immediately fill the socket buffer. If the audit daemon requested an ACK for this operation, a full socket buffer causes the ACK to get dropped, also setting ENOBUFS on the socket. To avoid this race and ensure ACKs get through, fast-track the ACK in this specific case to ensure it is sent before auditd_conn is set. Signed-off-by: Chris Riches [PM: fix some tab vs space damage] Signed-off-by: Paul Moore --- kernel/audit.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 16205dd29843..9c8e5f732c4c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -487,15 +487,19 @@ static void auditd_conn_free(struct rcu_head *rcu) * @pid: auditd PID * @portid: auditd netlink portid * @net: auditd network namespace pointer + * @skb: the netlink command from the audit daemon + * @ack: netlink ack flag, cleared if ack'd here * * Description: * This function will obtain and drop network namespace references as * necessary. Returns zero on success, negative values on failure. */ -static int auditd_set(struct pid *pid, u32 portid, struct net *net) +static int auditd_set(struct pid *pid, u32 portid, struct net *net, + struct sk_buff *skb, bool *ack) { unsigned long flags; struct auditd_connection *ac_old, *ac_new; + struct nlmsghdr *nlh; if (!pid || !net) return -EINVAL; @@ -507,6 +511,13 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net) ac_new->portid = portid; ac_new->net = get_net(net); + /* send the ack now to avoid a race with the queue backlog */ + if (*ack) { + nlh = nlmsg_hdr(skb); + netlink_ack(skb, nlh, 0, NULL); + *ack = false; + } + spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); @@ -1200,7 +1211,8 @@ static int audit_replace(struct pid *pid) return auditd_send_unicast_skb(skb); } -static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + bool *ack) { u32 seq; void *data; @@ -1293,7 +1305,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* register a new auditd connection */ err = auditd_set(req_pid, NETLINK_CB(skb).portid, - sock_net(NETLINK_CB(skb).sk)); + sock_net(NETLINK_CB(skb).sk), + skb, ack); if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, @@ -1538,9 +1551,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) * Parse the provided skb and deal with any messages that may be present, * malformed skbs are discarded. 
*/ -static void audit_receive(struct sk_buff *skb) +static void audit_receive(struct sk_buff *skb) { struct nlmsghdr *nlh; + bool ack; /* * len MUST be signed for nlmsg_next to be able to dec it below 0 * if the nlmsg_len was not aligned @@ -1553,9 +1567,12 @@ static void audit_receive(struct sk_buff *skb) audit_ctl_lock(); while (nlmsg_ok(nlh, len)) { - err = audit_receive_msg(skb, nlh); - /* if err or if this message says it wants a response */ - if (err || (nlh->nlmsg_flags & NLM_F_ACK)) + ack = nlh->nlmsg_flags & NLM_F_ACK; + err = audit_receive_msg(skb, nlh, &ack); + + /* send an ack if the user asked for one and audit_receive_msg + * didn't already do it, or if there was an error. */ + if (ack || err) netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); -- cgit From a04a1198088a1378d0389c250cc684f649bcc91e Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:49 -0700 Subject: LSM: syscalls for current process attributes Create a system call lsm_get_self_attr() to provide the security module maintained attributes of the current process. Create a system call lsm_set_self_attr() to set a security module maintained attribute of the current process. Historically these attributes have been exposed to user space via entries in procfs under /proc/self/attr. The attribute value is provided in a lsm_ctx structure. The structure identifies the size of the attribute, and the attribute value. The format of the attribute value is defined by the security module. A flags field is included for LSM specific information. It is currently unused and must be 0. The total size of the data, including the lsm_ctx structure and any padding, is maintained as well. struct lsm_ctx { __u64 id; __u64 flags; __u64 len; __u64 ctx_len; __u8 ctx[]; }; Two new LSM hooks are used to interface with the LSMs. security_getselfattr() collects the lsm_ctx values from the LSMs that support the hook, accounting for space requirements. security_setselfattr() identifies which LSM the attribute is intended for and passes it along. Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: John Johansen Signed-off-by: Paul Moore --- Documentation/userspace-api/lsm.rst | 70 +++++++++++++++++ include/linux/lsm_hook_defs.h | 4 + include/linux/lsm_hooks.h | 1 + include/linux/security.h | 19 +++++ include/linux/syscalls.h | 5 ++ include/uapi/linux/lsm.h | 36 +++++++++ kernel/sys_ni.c | 2 + security/Makefile | 1 + security/lsm_syscalls.c | 57 ++++++++++++++ security/security.c | 152 ++++++++++++++++++++++++++++++++++++ 10 files changed, 347 insertions(+) create mode 100644 Documentation/userspace-api/lsm.rst create mode 100644 security/lsm_syscalls.c (limited to 'kernel') diff --git a/Documentation/userspace-api/lsm.rst b/Documentation/userspace-api/lsm.rst new file mode 100644 index 000000000000..f8499f3e2826 --- /dev/null +++ b/Documentation/userspace-api/lsm.rst @@ -0,0 +1,70 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. Copyright (C) 2022 Casey Schaufler +.. Copyright (C) 2022 Intel Corporation + +===================================== +Linux Security Modules +===================================== + +:Author: Casey Schaufler +:Date: July 2023 + +Linux security modules (LSM) provide a mechanism to implement +additional access controls to the Linux security policies. + +The various security modules may support any of these attributes: + +``LSM_ATTR_CURRENT`` is the current, active security context of the +process. 
+The proc filesystem provides this value in ``/proc/self/attr/current``. +This is supported by the SELinux, Smack and AppArmor security modules. +Smack also provides this value in ``/proc/self/attr/smack/current``. +AppArmor also provides this value in ``/proc/self/attr/apparmor/current``. + +``LSM_ATTR_EXEC`` is the security context of the process at the time the +current image was executed. +The proc filesystem provides this value in ``/proc/self/attr/exec``. +This is supported by the SELinux and AppArmor security modules. +AppArmor also provides this value in ``/proc/self/attr/apparmor/exec``. + +``LSM_ATTR_FSCREATE`` is the security context of the process used when +creating file system objects. +The proc filesystem provides this value in ``/proc/self/attr/fscreate``. +This is supported by the SELinux security module. + +``LSM_ATTR_KEYCREATE`` is the security context of the process used when +creating key objects. +The proc filesystem provides this value in ``/proc/self/attr/keycreate``. +This is supported by the SELinux security module. + +``LSM_ATTR_PREV`` is the security context of the process at the time the +current security context was set. +The proc filesystem provides this value in ``/proc/self/attr/prev``. +This is supported by the SELinux and AppArmor security modules. +AppArmor also provides this value in ``/proc/self/attr/apparmor/prev``. + +``LSM_ATTR_SOCKCREATE`` is the security context of the process used when +creating socket objects. +The proc filesystem provides this value in ``/proc/self/attr/sockcreate``. +This is supported by the SELinux security module. + +Kernel interface +================ + +Set a security attribute of the current process +----------------------------------------------- + +.. kernel-doc:: security/lsm_syscalls.c + :identifiers: sys_lsm_set_self_attr + +Get the specified security attributes of the current process +------------------------------------------------------------ + +.. 
kernel-doc:: security/lsm_syscalls.c + :identifiers: sys_lsm_get_self_attr + +Additional documentation +======================== + +* Documentation/security/lsm.rst +* Documentation/security/lsm-development.rst diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ff217a5ce552..c925a0d26edf 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -262,6 +262,10 @@ LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops, LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb) LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry, struct inode *inode) +LSM_HOOK(int, -EOPNOTSUPP, getselfattr, unsigned int attr, + struct lsm_ctx __user *ctx, size_t *size, u32 flags) +LSM_HOOK(int, -EOPNOTSUPP, setselfattr, unsigned int attr, + struct lsm_ctx *ctx, size_t size, u32 flags) LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 7f0adb33caaa..a2ade0ffe9e7 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -25,6 +25,7 @@ #ifndef __LINUX_LSM_HOOKS_H #define __LINUX_LSM_HOOKS_H +#include #include #include #include diff --git a/include/linux/security.h b/include/linux/security.h index c81bca77f4f2..dd1fe487385d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -60,6 +60,7 @@ struct fs_parameter; enum fs_value_type; struct watch; struct watch_notification; +struct lsm_ctx; /* Default (no) options for the capable function */ #define CAP_OPT_NONE 0x0 @@ -472,6 +473,10 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd); int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter); void security_d_instantiate(struct dentry *dentry, struct inode *inode); +int security_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx, + size_t __user *size, u32 flags); +int security_setselfattr(unsigned int attr, struct lsm_ctx __user *ctx, + size_t size, u32 flags); int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value); int security_setprocattr(int lsmid, const char *name, void *value, size_t size); @@ -1338,6 +1343,20 @@ static inline void security_d_instantiate(struct dentry *dentry, struct inode *inode) { } +static inline int security_getselfattr(unsigned int attr, + struct lsm_ctx __user *ctx, + size_t __user *size, u32 flags) +{ + return -EOPNOTSUPP; +} + +static inline int security_setselfattr(unsigned int attr, + struct lsm_ctx __user *ctx, + size_t size, u32 flags) +{ + return -EOPNOTSUPP; +} + static inline int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value) { diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index fd9d12de7e92..4e1e56a24f1e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -71,6 +71,7 @@ struct clone_args; struct open_how; struct mount_attr; struct landlock_ruleset_attr; +struct lsm_ctx; enum landlock_rule_type; struct cachestat_range; struct cachestat; @@ -949,6 +950,10 @@ asmlinkage long sys_cachestat(unsigned int fd, struct cachestat_range __user *cstat_range, struct cachestat __user *cstat, unsigned int flags); asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags); +asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx, + size_t *size, 
__u32 flags); +asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx, + size_t size, __u32 flags); /* * Architecture-specific system calls diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h index f27c9a9cc376..eeda59a77c02 100644 --- a/include/uapi/linux/lsm.h +++ b/include/uapi/linux/lsm.h @@ -9,6 +9,36 @@ #ifndef _UAPI_LINUX_LSM_H #define _UAPI_LINUX_LSM_H +#include +#include + +/** + * struct lsm_ctx - LSM context information + * @id: the LSM id number, see LSM_ID_XXX + * @flags: LSM specific flags + * @len: length of the lsm_ctx struct, @ctx and any other data or padding + * @ctx_len: the size of @ctx + * @ctx: the LSM context value + * + * The @len field MUST be equal to the size of the lsm_ctx struct + * plus any additional padding and/or data placed after @ctx. + * + * In all cases @ctx_len MUST be equal to the length of @ctx. + * If @ctx is a string value it should be nul terminated with + * @ctx_len equal to `strlen(@ctx) + 1`. Binary values are + * supported. + * + * The @flags and @ctx fields SHOULD only be interpreted by the + * LSM specified by @id; they MUST be set to zero/0 when not used. + */ +struct lsm_ctx { + __u64 id; + __u64 flags; + __u64 len; + __u64 ctx_len; + __u8 ctx[]; +}; + /* * ID tokens to identify Linux Security Modules (LSMs) * @@ -51,4 +81,10 @@ #define LSM_ATTR_PREV 104 #define LSM_ATTR_SOCKCREATE 105 +/* + * LSM_FLAG_XXX definitions identify special handling instructions + * for the API. + */ +#define LSM_FLAG_SINGLE 0x0001 + #endif /* _UAPI_LINUX_LSM_H */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e1a6e3c675c0..1f61b8452a6e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -171,6 +171,8 @@ COND_SYSCALL(landlock_add_rule); COND_SYSCALL(landlock_restrict_self); COND_SYSCALL(fadvise64_64); COND_SYSCALL_COMPAT(fadvise64_64); +COND_SYSCALL(lsm_get_self_attr); +COND_SYSCALL(lsm_set_self_attr); /* CONFIG_MMU only */ COND_SYSCALL(swapon); diff --git a/security/Makefile b/security/Makefile index 18121f8f85cd..59f238490665 100644 --- a/security/Makefile +++ b/security/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_KEYS) += keys/ # always enable default capabilities obj-y += commoncap.o +obj-$(CONFIG_SECURITY) += lsm_syscalls.o obj-$(CONFIG_MMU) += min_addr.o # Object file lists diff --git a/security/lsm_syscalls.c b/security/lsm_syscalls.c new file mode 100644 index 000000000000..226ae80d9683 --- /dev/null +++ b/security/lsm_syscalls.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * System calls implementing the Linux Security Module API. + * + * Copyright (C) 2022 Casey Schaufler + * Copyright (C) 2022 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * sys_lsm_set_self_attr - Set current task's security module attribute + * @attr: which attribute to set + * @ctx: the LSM contexts + * @size: size of @ctx + * @flags: reserved for future use + * + * Sets the calling task's LSM context. On success this function + * returns 0. If the attribute specified cannot be set a negative + * value indicating the reason for the error is returned. 
+ */ +SYSCALL_DEFINE4(lsm_set_self_attr, unsigned int, attr, struct lsm_ctx __user *, + ctx, size_t, size, u32, flags) +{ + return security_setselfattr(attr, ctx, size, flags); +} + +/** + * sys_lsm_get_self_attr - Return current task's security module attributes + * @attr: which attribute to return + * @ctx: the user-space destination for the information, or NULL + * @size: pointer to the size of space available to receive the data + * @flags: special handling options. LSM_FLAG_SINGLE indicates that only + * attributes associated with the LSM identified in the passed @ctx be + * reported. + * + * Returns the calling task's LSM contexts. On success this + * function returns the number of @ctx array elements. This value + * may be zero if there are no LSM contexts assigned. If @size is + * insufficient to contain the return data -E2BIG is returned and + * @size is set to the minimum required size. In all other cases + * a negative value indicating the error is returned. + */ +SYSCALL_DEFINE4(lsm_get_self_attr, unsigned int, attr, struct lsm_ctx __user *, + ctx, size_t __user *, size, u32, flags) +{ + return security_getselfattr(attr, ctx, size, flags); +} diff --git a/security/security.c b/security/security.c index c66f9faefa40..9757d009113f 100644 --- a/security/security.c +++ b/security/security.c @@ -3837,6 +3837,158 @@ void security_d_instantiate(struct dentry *dentry, struct inode *inode) } EXPORT_SYMBOL(security_d_instantiate); +/* + * Please keep this in sync with its counterpart in security/lsm_syscalls.c + */ + +/** + * security_getselfattr - Read an LSM attribute of the current process. + * @attr: which attribute to return + * @uctx: the user-space destination for the information, or NULL + * @size: pointer to the size of space available to receive the data + * @flags: special handling options. LSM_FLAG_SINGLE indicates that only + * attributes associated with the LSM identified in the passed @ctx be + * reported. + * + * A NULL value for @uctx can be used to get both the number of attributes + * and the size of the data. + * + * Returns the number of attributes found on success, negative value + * on error. @size is reset to the total size of the data. + * If @size is insufficient to contain the data -E2BIG is returned. + */ +int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx, + size_t __user *size, u32 flags) +{ + struct security_hook_list *hp; + struct lsm_ctx lctx = { .id = LSM_ID_UNDEF, }; + u8 __user *base = (u8 __user *)uctx; + size_t total = 0; + size_t entrysize; + size_t left; + bool toobig = false; + bool single = false; + int count = 0; + int rc; + + if (attr == LSM_ATTR_UNDEF) + return -EINVAL; + if (size == NULL) + return -EINVAL; + if (get_user(left, size)) + return -EFAULT; + + if (flags) { + /* + * Only flag supported is LSM_FLAG_SINGLE + */ + if (flags != LSM_FLAG_SINGLE) + return -EINVAL; + if (uctx && copy_from_user(&lctx, uctx, sizeof(lctx))) + return -EFAULT; + /* + * If the LSM ID isn't specified it is an error. + */ + if (lctx.id == LSM_ID_UNDEF) + return -EINVAL; + single = true; + } + + /* + * In the usual case gather all the data from the LSMs. + * In the single case only get the data from the LSM specified. 
+ */ + hlist_for_each_entry(hp, &security_hook_heads.getselfattr, list) { + if (single && lctx.id != hp->lsmid->id) + continue; + entrysize = left; + if (base) + uctx = (struct lsm_ctx __user *)(base + total); + rc = hp->hook.getselfattr(attr, uctx, &entrysize, flags); + if (rc == -EOPNOTSUPP) { + rc = 0; + continue; + } + if (rc == -E2BIG) { + toobig = true; + left = 0; + } else if (rc < 0) + return rc; + else + left -= entrysize; + + total += entrysize; + count += rc; + if (single) + break; + } + if (put_user(total, size)) + return -EFAULT; + if (toobig) + return -E2BIG; + if (count == 0) + return LSM_RET_DEFAULT(getselfattr); + return count; +} + +/* + * Please keep this in sync with its counterpart in security/lsm_syscalls.c + */ + +/** + * security_setselfattr - Set an LSM attribute on the current process. + * @attr: which attribute to set + * @uctx: the user-space source for the information + * @size: the size of the data + * @flags: reserved for future use, must be 0 + * + * Set an LSM attribute for the current process. The LSM, attribute + * and new value are included in @uctx. + * + * Returns 0 on success, -EINVAL if the input is inconsistent, -EFAULT + * if the user buffer is inaccessible, -E2BIG if size is too big, or an + * LSM specific failure. + */ +int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx, + size_t size, u32 flags) +{ + struct security_hook_list *hp; + struct lsm_ctx *lctx; + int rc = LSM_RET_DEFAULT(setselfattr); + + if (flags) + return -EINVAL; + if (size < sizeof(*lctx)) + return -EINVAL; + if (size > PAGE_SIZE) + return -E2BIG; + + lctx = kmalloc(size, GFP_KERNEL); + if (lctx == NULL) + return -ENOMEM; + + if (copy_from_user(lctx, uctx, size)) { + rc = -EFAULT; + goto free_out; + } + + if (size < lctx->len || size < lctx->ctx_len + sizeof(*lctx) || + lctx->len < lctx->ctx_len + sizeof(*lctx)) { + rc = -EINVAL; + goto free_out; + } + + hlist_for_each_entry(hp, &security_hook_heads.setselfattr, list) + if ((hp->lsmid->id) == lctx->id) { + rc = hp->hook.setselfattr(attr, lctx, size, flags); + break; + } + +free_out: + kfree(lctx); + return rc; +} + /** * security_getprocattr() - Read an attribute for a task * @p: the task -- cgit From ad4aff9ec25f400608283c10d634cc4eeda83a02 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:50 -0700 Subject: LSM: Create lsm_list_modules system call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a system call to report the list of Linux Security Modules that are active on the system. The list is provided as an array of LSM ID numbers. The calling application can use this list to determine what LSM specific actions it might take. That might include choosing an output format, determining required privilege or bypassing security module specific behavior. Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: John Johansen Reviewed-by: Mickaël Salaün Signed-off-by: Paul Moore --- Documentation/userspace-api/lsm.rst | 3 +++ include/linux/syscalls.h | 1 + kernel/sys_ni.c | 1 + security/lsm_syscalls.c | 39 +++++++++++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+) (limited to 'kernel') diff --git a/Documentation/userspace-api/lsm.rst b/Documentation/userspace-api/lsm.rst index f8499f3e2826..a76da373841b 100644 --- a/Documentation/userspace-api/lsm.rst +++ b/Documentation/userspace-api/lsm.rst @@ -63,6 +63,9 @@ Get the specified security attributes of the current process .. 
kernel-doc:: security/lsm_syscalls.c :identifiers: sys_lsm_get_self_attr +.. kernel-doc:: security/lsm_syscalls.c + :identifiers: sys_lsm_list_modules + Additional documentation ======================== diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 4e1e56a24f1e..feec5719750b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -954,6 +954,7 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx, size_t *size, __u32 flags); asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx, size_t size, __u32 flags); +asmlinkage long sys_lsm_list_modules(u64 *ids, size_t *size, u32 flags); /* * Architecture-specific system calls diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1f61b8452a6e..9fa5989bf2ce 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -173,6 +173,7 @@ COND_SYSCALL(fadvise64_64); COND_SYSCALL_COMPAT(fadvise64_64); COND_SYSCALL(lsm_get_self_attr); COND_SYSCALL(lsm_set_self_attr); +COND_SYSCALL(lsm_list_modules); /* CONFIG_MMU only */ COND_SYSCALL(swapon); diff --git a/security/lsm_syscalls.c b/security/lsm_syscalls.c index 226ae80d9683..329aaca5efc0 100644 --- a/security/lsm_syscalls.c +++ b/security/lsm_syscalls.c @@ -55,3 +55,42 @@ SYSCALL_DEFINE4(lsm_get_self_attr, unsigned int, attr, struct lsm_ctx __user *, { return security_getselfattr(attr, ctx, size, flags); } + +/** + * sys_lsm_list_modules - Return a list of the active security modules + * @ids: the LSM module ids + * @size: pointer to size of @ids, updated on return + * @flags: reserved for future use, must be zero + * + * Returns a list of the active LSM ids. On success this function + * returns the number of @ids array elements. This value may be zero + * if there are no LSMs active. If @size is insufficient to contain + * the return data -E2BIG is returned and @size is set to the minimum + * required size. In all other cases a negative value indicating the + * error is returned. + */ +SYSCALL_DEFINE3(lsm_list_modules, u64 __user *, ids, size_t __user *, size, + u32, flags) +{ + size_t total_size = lsm_active_cnt * sizeof(*ids); + size_t usize; + int i; + + if (flags) + return -EINVAL; + + if (get_user(usize, size)) + return -EFAULT; + + if (put_user(total_size, size) != 0) + return -EFAULT; + + if (usize < total_size) + return -E2BIG; + + for (i = 0; i < lsm_active_cnt; i++) + if (put_user(lsm_idlist[i]->id, ids++)) + return -EFAULT; + + return lsm_active_cnt; +} -- cgit From fe977716b40cb98cf9c91a66454adf3dc2f8c59a Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 11 Nov 2023 09:00:29 +0000 Subject: bpf: Add a new kfunc for cgroup1 hierarchy A new kfunc is added to acquire cgroup1 of a task: - bpf_task_get_cgroup1 Acquires the associated cgroup of a task within a specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its hierarchy ID. This new kfunc enables the tracing of tasks within a designated container or cgroup directory in BPF programs. 
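A rough usage sketch (not part of this patch; the program type, the hierarchy ID value, and the local kfunc prototypes are illustrative assumptions):

/* Illustrative BPF program: log switches to tasks in one cgroup1 hierarchy. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct cgroup *bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) __ksym;
void bpf_cgroup_release(struct cgroup *cgrp) __ksym;

SEC("tp_btf/sched_switch")
int BPF_PROG(on_switch, bool preempt, struct task_struct *prev,
	     struct task_struct *next)
{
	/* Hierarchy ID 1 is an example; it must match a mounted cgroup1 hierarchy. */
	struct cgroup *cgrp = bpf_task_get_cgroup1(next, 1);

	if (!cgrp)
		return 0;
	bpf_printk("pid %d runs in cgroup %llu", next->pid, cgrp->kn->id);
	bpf_cgroup_release(cgrp);	/* the kfunc is KF_ACQUIRE, drop the reference */
	return 0;
}

char LICENSE[] SEC("license") = "GPL";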
Suggested-by: Tejun Heo Signed-off-by: Yafang Shao Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20231111090034.4248-2-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 03517db5cfb3..b45a8381f9bd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2228,6 +2228,25 @@ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, rcu_read_unlock(); return ret; } + +/** + * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a + * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its + * hierarchy ID. + * @task: The target task + * @hierarchy_id: The ID of a cgroup1 hierarchy + * + * On success, the cgroup is returned. On failure, NULL is returned. + */ +__bpf_kfunc struct cgroup * +bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) +{ + struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id); + + if (IS_ERR(cgrp)) + return NULL; + return cgrp; +} #endif /* CONFIG_CGROUPS */ /** @@ -2534,6 +2553,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) +BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) -- cgit From d6111cf45c5787282b2e20d77bdb6b28881d516a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Oct 2023 11:12:01 -0700 Subject: sched: Use WRITE_ONCE() for p->on_rq Since RCU-tasks uses READ_ONCE(p->on_rq), ensure the write-side matches with WRITE_ONCE(). Signed-off-by: "Paul E. McKenney" Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/e4896e0b-eacc-45a2-a7a8-de2280a51ecc@paulmck-laptop --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a708d225c28e..9d5099d02dbc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2124,12 +2124,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) enqueue_task(rq, p, flags); - p->on_rq = TASK_ON_RQ_QUEUED; + WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); } void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; + WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); dequeue_task(rq, p, flags); } -- cgit From 84db47ca7146d7bd00eb5cf2b93989a971c84650 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Fri, 20 Oct 2023 21:27:46 +0530 Subject: sched/numa: Fix mm numa_scan_seq based unconditional scan Since commit fc137c0ddab2 ("sched/numa: enhance vma scanning logic") NUMA Balancing allows updating PTEs to trap NUMA hinting faults if the task had previously accessed the VMA. However, an unconditional scan of VMAs is allowed during the initial phase of VMA creation, until the process's mm numa_scan_seq reaches 2, even though the current task has not accessed the VMA. Rationale: - Without an initial scan, a subsequent PTE update may never happen. - Give a fair opportunity to all the VMAs to be scanned and subsequently understand the access pattern of all the VMAs. 
But it has a corner case where, if a VMA is created after some time, the process's mm numa_scan_seq could already be greater than 2. For example, the values of mm numa_scan_seq at VMA creation while briefly running the mmtest autonuma benchmark look like: start_seq=0 : 459 start_seq=2 : 138 start_seq=3 : 144 start_seq=4 : 8 start_seq=8 : 1 start_seq=9 : 1 This results in no unconditional PTE updates for those VMAs created after some time. Fix: - Note down the initial value of mm numa_scan_seq in per VMA start_seq. - Allow unconditional scan till start_seq + 2. Result: SUT: AMD EPYC Milan with 2 NUMA nodes, 256 CPUs. base kernel: upstream 6.6-rc6 with Mel's patches [1] applied. kernbench ========== base patched %gain Amean elsp-128 165.09 ( 0.00%) 164.78 * 0.19%* Duration User 41404.28 41375.08 Duration System 9862.22 9768.48 Duration Elapsed 519.87 518.72 Ops NUMA PTE updates 1041416.00 831536.00 Ops NUMA hint faults 263296.00 220966.00 Ops NUMA pages migrated 258021.00 212769.00 Ops AutoNUMA cost 1328.67 1114.69 autonumabench NUMA01_THREADLOCAL ================== Amean elsp-NUMA01_THREADLOCAL 81.79 (0.00%) 67.74 * 17.18%* Duration User 54832.73 47379.67 Duration System 75.00 185.75 Duration Elapsed 576.72 476.09 Ops NUMA PTE updates 394429.00 11121044.00 Ops NUMA hint faults 1001.00 8906404.00 Ops NUMA pages migrated 288.00 2998694.00 Ops AutoNUMA cost 7.77 44666.84 Signed-off-by: Raghavendra K T Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lore.kernel.org/r/2ea7cbce80ac7c62e90cbfb9653a7972f902439f.1697816692.git.raghavendra.kt@amd.com --- include/linux/mm_types.h | 3 +++ kernel/sched/fair.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2..950df415d7de 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -600,6 +600,9 @@ struct vma_numab_state { */ unsigned long pids_active[2]; + /* MM scan sequence ID when scan first started after VMA creation */ + int start_scan_seq; + /* * MM scan sequence ID when the VMA was last completely scanned. * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d7a3c63a2171..44b5262b6657 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3164,7 +3164,7 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) * This is also done to avoid any side effect of task scanning * amplifying the unfairness of disjoint set of VMAs' access. */ - if (READ_ONCE(current->mm->numa_scan_seq) < 2) + if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2) return true; pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; @@ -3307,6 +3307,8 @@ retry_pids: if (!vma->numab_state) continue; + vma->numab_state->start_scan_seq = mm->numa_scan_seq; + vma->numab_state->next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); -- cgit From 2227a957e1d5b1941be4e4207879ec74f4bb37f8 Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Wed, 15 Nov 2023 11:36:45 +0800 Subject: sched/eevdf: Sort the rbtree by virtual deadline Sort the task timeline by virtual deadline and keep the min_vruntime in the augmented tree, so we can avoid doubling the worst case cost and make full use of the cached leftmost node to enable O(1) fastpath picking in the next patch. 
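In short, the augmented tree now provides (an illustrative sketch only; vruntime_eligible() and __node_2_se() come from the patch below, while the helper name is made up):

/*
 * Invariant kept by the augmented callbacks:
 *   se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
 * A subtree whose smallest vruntime is not eligible cannot contain an
 * eligible entity and can be pruned; since nodes are ordered by virtual
 * deadline, the first eligible node found leftmost-first has the
 * earliest deadline.
 */
static bool subtree_has_eligible(struct cfs_rq *cfs_rq, struct rb_node *node)
{
	return node && vruntime_eligible(cfs_rq, __node_2_se(node)->min_vruntime);
}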
Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231115033647.80785-3-wuyun.abel@bytedance.com --- include/linux/sched.h | 2 +- kernel/sched/debug.c | 11 +++- kernel/sched/fair.c | 168 ++++++++++++++++++++------------------------------ kernel/sched/sched.h | 1 + 4 files changed, 77 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c31697248..cd56d4018527 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -553,7 +553,7 @@ struct sched_entity { struct load_weight load; struct rb_node run_node; u64 deadline; - u64 min_deadline; + u64 min_vruntime; struct list_head group_node; unsigned int on_rq; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4580a450700e..168eecc209b4 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -628,8 +628,8 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread; - struct sched_entity *last, *first; + s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread; + struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -644,15 +644,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); + root = __pick_root_entity(cfs_rq); + if (root) + left_vruntime = root->min_vruntime; first = __pick_first_entity(cfs_rq); if (first) - left_vruntime = first->vruntime; + left_deadline = first->deadline; last = __pick_last_entity(cfs_rq); if (last) right_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; raw_spin_rq_unlock_irqrestore(rq, flags); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", + SPLIT_NS(left_deadline)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", SPLIT_NS(left_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 44b5262b6657..31bca05c3612 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -551,7 +551,11 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) static inline bool entity_before(const struct sched_entity *a, const struct sched_entity *b) { - return (s64)(a->vruntime - b->vruntime) < 0; + /* + * Tiebreak on vruntime seems unnecessary since it can + * hardly happen. + */ + return (s64)(a->deadline - b->deadline) < 0; } static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -720,7 +724,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * Note: using 'avg_vruntime() > se->vruntime' is inacurate due * to the loss in precision caused by the division. 
*/ -int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; s64 avg = cfs_rq->avg_vruntime; @@ -733,7 +737,12 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) load += weight; } - return avg >= entity_key(cfs_rq, se) * load; + return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; +} + +int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return vruntime_eligible(cfs_rq, se->vruntime); } static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) @@ -752,9 +761,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) static void update_min_vruntime(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_first_entity(cfs_rq); + struct sched_entity *se = __pick_root_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; - u64 vruntime = cfs_rq->min_vruntime; if (curr) { @@ -766,9 +774,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) if (se) { if (!curr) - vruntime = se->vruntime; + vruntime = se->min_vruntime; else - vruntime = min_vruntime(vruntime, se->vruntime); + vruntime = min_vruntime(vruntime, se->min_vruntime); } /* ensure we never gain time by being placed backwards. */ @@ -781,34 +789,34 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } -#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) +#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) -static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node) +static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node) { if (node) { struct sched_entity *rse = __node_2_se(node); - if (deadline_gt(min_deadline, se, rse)) - se->min_deadline = rse->min_deadline; + if (vruntime_gt(min_vruntime, se, rse)) + se->min_vruntime = rse->min_vruntime; } } /* - * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline) + * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ -static inline bool min_deadline_update(struct sched_entity *se, bool exit) +static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { - u64 old_min_deadline = se->min_deadline; + u64 old_min_vruntime = se->min_vruntime; struct rb_node *node = &se->run_node; - se->min_deadline = se->deadline; - __update_min_deadline(se, node->rb_right); - __update_min_deadline(se, node->rb_left); + se->min_vruntime = se->vruntime; + __min_vruntime_update(se, node->rb_right); + __min_vruntime_update(se, node->rb_left); - return se->min_deadline == old_min_deadline; + return se->min_vruntime == old_min_vruntime; } -RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, - run_node, min_deadline, min_deadline_update); +RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, + run_node, min_vruntime, min_vruntime_update); /* * Enqueue an entity into the rb-tree: @@ -816,18 +824,28 @@ RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { avg_vruntime_add(cfs_rq, se); - se->min_deadline = se->deadline; + se->min_vruntime = se->vruntime; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, - __entity_less, &min_deadline_cb); + __entity_less, &min_vruntime_cb); } static void __dequeue_entity(struct cfs_rq *cfs_rq, 
struct sched_entity *se) { rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, - &min_deadline_cb); + &min_vruntime_cb); avg_vruntime_sub(cfs_rq, se); } +struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node; + + if (!root) + return NULL; + + return __node_2_se(root); +} + struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); @@ -850,23 +868,28 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) * with the earliest virtual deadline. * * We can do this in O(log n) time due to an augmented RB-tree. The - * tree keeps the entries sorted on service, but also functions as a - * heap based on the deadline by keeping: + * tree keeps the entries sorted on deadline, but also functions as a + * heap based on the vruntime by keeping: * - * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline) + * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime) * - * Which allows an EDF like search on (sub)trees. + * Which allows tree pruning through eligibility. */ -static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq) +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *curr = cfs_rq->curr; struct sched_entity *best = NULL; - struct sched_entity *best_left = NULL; + + /* + * We can safely skip eligibility check if there is only one entity + * in this cfs_rq, saving some cycles. + */ + if (cfs_rq->nr_running == 1) + return curr && curr->on_rq ? curr : __node_2_se(node); if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - best = curr; /* * Once selected, run a task until it either becomes non-eligible or @@ -875,95 +898,38 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq) if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) return curr; + /* Heap search for the EEVDF entity */ while (node) { struct sched_entity *se = __node_2_se(node); + struct rb_node *left = node->rb_left; /* - * If this entity is not eligible, try the left subtree. + * Eligible entities in left subtree are always better + * choices, since they have earlier deadlines. */ - if (!entity_eligible(cfs_rq, se)) { - node = node->rb_left; + if (left && vruntime_eligible(cfs_rq, + __node_2_se(left)->min_vruntime)) { + node = left; continue; } /* - * Now we heap search eligible trees for the best (min_)deadline + * The left subtree either is empty or has no eligible + * entity, so check the current node since it is the one + * with earliest deadline that might be eligible. */ - if (!best || deadline_gt(deadline, best, se)) + if (entity_eligible(cfs_rq, se)) { best = se; - - /* - * Every se in a left branch is eligible, keep track of the - * branch with the best min_deadline - */ - if (node->rb_left) { - struct sched_entity *left = __node_2_se(node->rb_left); - - if (!best_left || deadline_gt(min_deadline, best_left, left)) - best_left = left; - - /* - * min_deadline is in the left branch. rb_left and all - * descendants are eligible, so immediately switch to the second - * loop. - */ - if (left->min_deadline == se->min_deadline) - break; - } - - /* min_deadline is at this node, no need to look right */ - if (se->deadline == se->min_deadline) break; - - /* else min_deadline is in the right branch. 
*/ - node = node->rb_right; - } - - /* - * We ran into an eligible node which is itself the best. - * (Or nr_running == 0 and both are NULL) - */ - if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0) - return best; - - /* - * Now best_left and all of its children are eligible, and we are just - * looking for deadline == min_deadline - */ - node = &best_left->run_node; - while (node) { - struct sched_entity *se = __node_2_se(node); - - /* min_deadline is the current node */ - if (se->deadline == se->min_deadline) - return se; - - /* min_deadline is in the left branch */ - if (node->rb_left && - __node_2_se(node->rb_left)->min_deadline == se->min_deadline) { - node = node->rb_left; - continue; } - /* else min_deadline is in the right branch */ node = node->rb_right; } - return NULL; -} -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) -{ - struct sched_entity *se = __pick_eevdf(cfs_rq); - - if (!se) { - struct sched_entity *left = __pick_first_entity(cfs_rq); - if (left) { - pr_err("EEVDF scheduling fail, picking leftmost\n"); - return left; - } - } + if (!best || (curr && entity_before(curr, best))) + best = curr; - return se; + return best; } #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e5a95486a42..539c7e763f15 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2822,6 +2822,7 @@ DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), double_rq_unlock(_T->lock, _T->lock2)) +extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); -- cgit From ee4373dc902c0a403dd084b254ce70a78f95466f Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Wed, 15 Nov 2023 11:36:46 +0800 Subject: sched/eevdf: O(1) fastpath for task selection Since the RB-tree is now sorted by deadline, let's first try the leftmost entity which has the earliest virtual deadline. I've done some benchmarks to see its effectiveness. All the benchmarks are done inside a normal cpu cgroup in a clean environment with cpu turbo disabled, on a dual-CPU Intel Xeon(R) Platinum 8260 with 2 NUMA nodes each of which has 24C/48T. hackbench: process/thread + pipe/socket + 1/2/4/8 groups netperf: TCP/UDP + STREAM/RR + 24/48/72/96/192 threads tbench: loopback 24/48/72/96/192 threads schbench: 1/2/4/8 mthreads direct: cfs_rq has only one entity parity: RUN_TO_PARITY fast: O(1) fastpath slow: heap search (%) direct parity fast slow hackbench 92.95 2.02 4.91 0.12 netperf 68.08 6.60 24.18 1.14 tbench 67.55 11.22 20.61 0.62 schbench 69.91 2.65 25.73 1.71 The above results indicate that this fastpath really makes task selection more efficient. 
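Stripped to its essence, the fastpath added below is (a simplified excerpt, not a literal hunk):

	se = __pick_first_entity(cfs_rq);	/* leftmost == earliest virtual deadline */
	if (se && entity_eligible(cfs_rq, se))
		return se;			/* O(1) pick, no tree walk */
	/* otherwise fall back to the heap search */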
Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231115033647.80785-4-wuyun.abel@bytedance.com --- kernel/sched/fair.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 31bca05c3612..d3e045d80cab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -878,6 +878,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; struct sched_entity *best = NULL; @@ -886,7 +887,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) * in this cfs_rq, saving some cycles. */ if (cfs_rq->nr_running == 1) - return curr && curr->on_rq ? curr : __node_2_se(node); + return curr && curr->on_rq ? curr : se; if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; @@ -898,9 +899,14 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) return curr; + /* Pick the leftmost entity if it's eligible */ + if (se && entity_eligible(cfs_rq, se)) { + best = se; + goto found; + } + /* Heap search for the EEVD entity */ while (node) { - struct sched_entity *se = __node_2_se(node); struct rb_node *left = node->rb_left; /* @@ -913,6 +919,8 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) continue; } + se = __node_2_se(node); + /* * The left subtree either is empty or has no eligible * entity, so check the current node since it is the one @@ -925,7 +933,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) node = node->rb_right; } - +found: if (!best || (curr && entity_before(curr, best))) best = curr; -- cgit From 5d69eca542ee17c618f9a55da52191d5e28b435f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 4 Nov 2023 11:59:18 +0100 Subject: sched: Unify runtime accounting across classes All classes use sched_entity::exec_start to track runtime and have copies of the exact same code around to compute runtime. Collapse all that. 
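With the common helper, each of the other classes reduces to the same pattern (illustrative; compare update_curr_rt() and update_curr_dl() below):

	s64 delta_exec = update_curr_common(rq);

	if (unlikely(delta_exec <= 0))
		return;
	/* class specific bandwidth/throttling handling follows */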
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Reviewed-by: Steven Rostedt (Google) Link: https://lkml.kernel.org/r/54d148a144f26d9559698c4dd82d8859038a7380.1699095159.git.bristot@kernel.org --- include/linux/sched.h | 2 +- kernel/sched/deadline.c | 15 +++---------- kernel/sched/fair.c | 57 ++++++++++++++++++++++++++++++++++++------------ kernel/sched/rt.c | 15 +++---------- kernel/sched/sched.h | 12 ++-------- kernel/sched/stop_task.c | 13 +---------- 6 files changed, 53 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index cd56d4018527..44b46d9743bf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -523,7 +523,7 @@ struct sched_statistics { u64 block_max; s64 sum_block_runtime; - u64 exec_max; + s64 exec_max; u64 slice_max; u64 nr_migrations_cold; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b28114478b82..de79719c63c0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1275,9 +1275,8 @@ static void update_curr_dl(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_dl_entity *dl_se = &curr->dl; - u64 delta_exec, scaled_delta_exec; + s64 delta_exec, scaled_delta_exec; int cpu = cpu_of(rq); - u64 now; if (!dl_task(curr) || !on_dl_rq(dl_se)) return; @@ -1290,21 +1289,13 @@ static void update_curr_dl(struct rq *rq) * natural solution, but the full ramifications of this * approach need further study. */ - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) { + delta_exec = update_curr_common(rq); + if (unlikely(delta_exec <= 0)) { if (unlikely(dl_se->dl_yielded)) goto throttle; return; } - schedstat_set(curr->stats.exec_max, - max(curr->stats.exec_max, delta_exec)); - - trace_sched_stat_runtime(curr, delta_exec, 0); - - update_current_exec_runtime(curr, now, delta_exec); - if (dl_entity_is_special(dl_se)) return; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d3e045d80cab..11073cf00134 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1103,23 +1103,17 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ -/* - * Update the current task's runtime statistics. - */ -static void update_curr(struct cfs_rq *cfs_rq) +static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) { - struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_clock_task(rq_of(cfs_rq)); - u64 delta_exec; - - if (unlikely(!curr)) - return; + u64 now = rq_clock_task(rq); + s64 delta_exec; delta_exec = now - curr->exec_start; - if (unlikely((s64)delta_exec <= 0)) - return; + if (unlikely(delta_exec <= 0)) + return delta_exec; curr->exec_start = now; + curr->sum_exec_runtime += delta_exec; if (schedstat_enabled()) { struct sched_statistics *stats; @@ -1129,8 +1123,43 @@ static void update_curr(struct cfs_rq *cfs_rq) max(delta_exec, stats->exec_max)); } - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq->exec_clock, delta_exec); + return delta_exec; +} + +/* + * Used by other classes to account runtime. 
+ */ +s64 update_curr_common(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + s64 delta_exec; + + delta_exec = update_curr_se(rq, &curr->se); + if (unlikely(delta_exec <= 0)) + return delta_exec; + + trace_sched_stat_runtime(curr, delta_exec, 0); + + account_group_exec_runtime(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); + + return delta_exec; +} + +/* + * Update the current task's runtime statistics. + */ +static void update_curr(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + s64 delta_exec; + + if (unlikely(!curr)) + return; + + delta_exec = update_curr_se(rq_of(cfs_rq), curr); + if (unlikely(delta_exec <= 0)) + return; curr->vruntime += calc_delta_fair(delta_exec, curr); update_deadline(cfs_rq, curr); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6aaf0a3d6081..3261b067b67e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1002,24 +1002,15 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; - u64 delta_exec; - u64 now; + s64 delta_exec; if (curr->sched_class != &rt_sched_class) return; - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) + delta_exec = update_curr_common(rq); + if (unlikely(delta_exec <= 0)) return; - schedstat_set(curr->stats.exec_max, - max(curr->stats.exec_max, delta_exec)); - - trace_sched_stat_runtime(curr, delta_exec, 0); - - update_current_exec_runtime(curr, now, delta_exec); - if (!rt_bandwidth_enabled()) return; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 539c7e763f15..6703e9e81b1d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2212,6 +2212,8 @@ struct affinity_context { unsigned int flags; }; +extern s64 update_curr_common(struct rq *rq); + struct sched_class { #ifdef CONFIG_UCLAMP_TASK @@ -3262,16 +3264,6 @@ extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif -static inline void update_current_exec_runtime(struct task_struct *curr, - u64 now, u64 delta_exec) -{ - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); -} - #ifdef CONFIG_SCHED_MM_CID #define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 6cf7304e6449..b1b8fe61c532 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { - struct task_struct *curr = rq->curr; - u64 now, delta_exec; - - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; - - schedstat_set(curr->stats.exec_max, - max(curr->stats.exec_max, delta_exec)); - - update_current_exec_runtime(curr, now, delta_exec); + update_curr_common(rq); } /* -- cgit From 5fe6ec8f6ab549b6422e41551abb51802bd48bc7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Nov 2023 13:41:43 +0100 Subject: sched: Remove vruntime from trace_sched_stat_runtime() Tracing the runtime delta makes sense, observer can sum over time. Tracing the absolute vruntime makes less sense, inconsistent: absolute-vs-delta, but also vruntime delta can be computed from runtime delta. 
Removing the vruntime argument also makes the two tracepoint sites identical, which allows the code to be unified in a later patch. Signed-off-by: Peter Zijlstra (Intel) --- include/trace/events/sched.h | 15 ++++++--------- kernel/sched/fair.c | 5 ++--- 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 6188ad0d9e0d..dbb01b4b7451 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -493,33 +493,30 @@ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked, */ DECLARE_EVENT_CLASS(sched_stat_runtime, - TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime), + TP_PROTO(struct task_struct *tsk, u64 runtime), - TP_ARGS(tsk, __perf_count(runtime), vruntime), + TP_ARGS(tsk, __perf_count(runtime)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( u64, runtime ) - __field( u64, vruntime ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->runtime = runtime; - __entry->vruntime = vruntime; ), - TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]", + TP_printk("comm=%s pid=%d runtime=%Lu [ns]", __entry->comm, __entry->pid, - (unsigned long long)__entry->runtime, - (unsigned long long)__entry->vruntime) + (unsigned long long)__entry->runtime) ); DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime, - TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime), - TP_ARGS(tsk, runtime, vruntime)); + TP_PROTO(struct task_struct *tsk, u64 runtime), + TP_ARGS(tsk, runtime)); /* * Tracepoint for showing priority inheritance modifying a tasks diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11073cf00134..33db70c6b582 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1138,8 +1138,7 @@ s64 update_curr_common(struct rq *rq) if (unlikely(delta_exec <= 0)) return delta_exec; - trace_sched_stat_runtime(curr, delta_exec, 0); - + trace_sched_stat_runtime(curr, delta_exec); account_group_exec_runtime(curr, delta_exec); cgroup_account_cputime(curr, delta_exec); @@ -1168,7 +1167,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); - trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); + trace_sched_stat_runtime(curtask, delta_exec); cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } -- cgit From c708a4dc5ab547edc3d6537233ca9e79ea30ce47 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Nov 2023 14:04:01 +0100 Subject: sched: Unify more update_curr*() Now that trace_sched_stat_runtime() no longer takes a vruntime argument, the task specific bits are identical between update_curr_common() and update_curr(). Signed-off-by: Peter Zijlstra (Intel) --- kernel/sched/fair.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 33db70c6b582..1cd92b11b289 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1126,6 +1126,13 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) return delta_exec; } +static inline void update_curr_task(struct task_struct *p, s64 delta_exec) +{ + trace_sched_stat_runtime(p, delta_exec); + account_group_exec_runtime(p, delta_exec); + cgroup_account_cputime(p, delta_exec); +} + /* * Used by other classes to account runtime. 
*/ @@ -1135,12 +1142,8 @@ s64 update_curr_common(struct rq *rq) s64 delta_exec; delta_exec = update_curr_se(rq, &curr->se); - if (unlikely(delta_exec <= 0)) - return delta_exec; - - trace_sched_stat_runtime(curr, delta_exec); - account_group_exec_runtime(curr, delta_exec); - cgroup_account_cputime(curr, delta_exec); + if (likely(delta_exec > 0)) + update_curr_task(curr, delta_exec); return delta_exec; } @@ -1164,13 +1167,8 @@ static void update_curr(struct cfs_rq *cfs_rq) update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); - if (entity_is_task(curr)) { - struct task_struct *curtask = task_of(curr); - - trace_sched_stat_runtime(curtask, delta_exec); - cgroup_account_cputime(curtask, delta_exec); - account_group_exec_runtime(curtask, delta_exec); - } + if (entity_is_task(curr)) + update_curr_task(task_of(curr), delta_exec); account_cfs_rq_runtime(cfs_rq, delta_exec); } -- cgit From 9e07d45c5210f5dd6701c00d55791983db7320fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 4 Nov 2023 11:59:19 +0100 Subject: sched/deadline: Collect sched_dl_entity initialization Create a single function that initializes a sched_dl_entity. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/51acc695eecf0a1a2f78f9a044e11ffd9b316bcf.1699095159.git.bristot@kernel.org --- kernel/sched/core.c | 5 +---- kernel/sched/deadline.c | 22 +++++++++++++++------- kernel/sched/sched.h | 5 +---- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d5099d02dbc..966631f05d71 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4511,10 +4511,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) memset(&p->stats, 0, sizeof(p->stats)); #endif - RB_CLEAR_NODE(&p->dl.rb_node); - init_dl_task_timer(&p->dl); - init_dl_inactive_task_timer(&p->dl); - __dl_clear_params(p); + init_dl_entity(&p->dl); INIT_LIST_HEAD(&p->rt.run_list); p->rt.timeout = 0; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index de79719c63c0..e80bb884262d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -335,6 +335,8 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) __add_rq_bw(new_bw, &rq->dl); } +static void __dl_clear_params(struct sched_dl_entity *dl_se); + /* * The utilization of a task cannot be immediately removed from * the rq active utilization (running_bw) when the task blocks. 
@@ -434,7 +436,7 @@ static void task_non_contending(struct task_struct *p) raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); - __dl_clear_params(p); + __dl_clear_params(dl_se); } return; @@ -1183,7 +1185,7 @@ unlock: return HRTIMER_NORESTART; } -void init_dl_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; @@ -1389,7 +1391,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); - __dl_clear_params(p); + __dl_clear_params(dl_se); goto unlock; } @@ -1405,7 +1407,7 @@ unlock: return HRTIMER_NORESTART; } -void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; @@ -2957,10 +2959,8 @@ bool __checkparam_dl(const struct sched_attr *attr) /* * This function clears the sched_dl_entity static params. */ -void __dl_clear_params(struct task_struct *p) +static void __dl_clear_params(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; - dl_se->dl_runtime = 0; dl_se->dl_deadline = 0; dl_se->dl_period = 0; @@ -2978,6 +2978,14 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se) #endif } +void init_dl_entity(struct sched_dl_entity *dl_se) +{ + RB_CLEAR_NODE(&dl_se->rb_node); + init_dl_task_timer(dl_se); + init_dl_inactive_task_timer(dl_se); + __dl_clear_params(dl_se); +} + bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6703e9e81b1d..3c62df1511e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -273,8 +273,6 @@ struct rt_bandwidth { unsigned int rt_period_active; }; -void __dl_clear_params(struct task_struct *p); - static inline int dl_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -2427,8 +2425,7 @@ extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -extern void init_dl_task_timer(struct sched_dl_entity *dl_se); -extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_entity(struct sched_dl_entity *dl_se); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) -- cgit From 2f7a0f58948d8231236e2facecc500f1930fb996 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 4 Nov 2023 11:59:20 +0100 Subject: sched/deadline: Move bandwidth accounting into {en,de}queue_dl_entity In preparation for introducing a !task sched_dl_entity, move the bandwidth accounting into {en,de}queue_dl_entity(). 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/a86dccbbe44e021b8771627e1dae01a69b73466d.1699095159.git.bristot@kernel.org --- kernel/sched/deadline.c | 130 +++++++++++++++++++++++++++--------------------- kernel/sched/sched.h | 6 +++ 2 files changed, 78 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e80bb884262d..81810f67df7a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -391,12 +391,12 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se); * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). */ -static void task_non_contending(struct task_struct *p) +static void task_non_contending(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; struct hrtimer *timer = &dl_se->inactive_timer; struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + struct task_struct *p = dl_task_of(dl_se); s64 zerolag_time; /* @@ -428,13 +428,14 @@ static void task_non_contending(struct task_struct *p) if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se, dl_rq); + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (READ_ONCE(p->__state) == TASK_DEAD) - sub_rq_bw(&p->dl, &rq->dl); + sub_rq_bw(dl_se, &rq->dl); raw_spin_lock(&dl_b->lock); - __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); __dl_clear_params(dl_se); } @@ -1601,6 +1602,41 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); + /* + * Check if a constrained deadline task was activated + * after the deadline but before the next period. + * If that is the case, the task will be throttled and + * the replenishment timer will be set to the next period. + */ + if (!dl_se->dl_throttled && !dl_is_implicit(dl_se)) + dl_check_constrained_dl(dl_se); + + if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + add_rq_bw(dl_se, dl_rq); + add_running_bw(dl_se, dl_rq); + } + + /* + * If p is throttled, we do not enqueue it. In fact, if it exhausted + * its budget it needs a replenishment and, since it now is on + * its rq, the bandwidth timer callback (which clearly has not + * run yet) will take care of this. + * However, the active utilization does not depend on the fact + * that the task is on the runqueue or not (but depends on the + * task's state - in GRUB parlance, "inactive" vs "active contending"). + * In other words, even if a task is throttled its utilization must + * be counted in the active utilization; hence, we need to call + * add_running_bw(). + */ + if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (flags & ENQUEUE_WAKEUP) + task_contending(dl_se, flags); + + return; + } + /* * If this is a wakeup or a new instance, the scheduling * parameters of the task might need updating. 
Otherwise, @@ -1620,9 +1656,28 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) __enqueue_dl_entity(dl_se); } -static void dequeue_dl_entity(struct sched_dl_entity *dl_se) +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) { __dequeue_dl_entity(dl_se); + + if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + sub_running_bw(dl_se, dl_rq); + sub_rq_bw(dl_se, dl_rq); + } + + /* + * This check allows to start the inactive timer (or to immediately + * decrease the active utilization, if needed) in two cases: + * when the task blocks and when it is terminating + * (p->state == TASK_DEAD). We can handle the two cases in the same + * way, because from GRUB's point of view the same thing is happening + * (the task moves from "active contending" to "active non contending" + * or "inactive") + */ + if (flags & DEQUEUE_SLEEP) + task_non_contending(dl_se); } static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -1667,76 +1722,35 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } - /* - * Check if a constrained deadline task was activated - * after the deadline but before the next period. - * If that is the case, the task will be throttled and - * the replenishment timer will be set to the next period. - */ - if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) - dl_check_constrained_dl(&p->dl); - - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { - add_rq_bw(&p->dl, &rq->dl); - add_running_bw(&p->dl, &rq->dl); - } - - /* - * If p is throttled, we do not enqueue it. In fact, if it exhausted - * its budget it needs a replenishment and, since it now is on - * its rq, the bandwidth timer callback (which clearly has not - * run yet) will take care of this. - * However, the active utilization does not depend on the fact - * that the task is on the runqueue or not (but depends on the - * task's state - in GRUB parlance, "inactive" vs "active contending"). - * In other words, even if a task is throttled its utilization must - * be counted in the active utilization; hence, we need to call - * add_running_bw(). - */ - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { - if (flags & ENQUEUE_WAKEUP) - task_contending(&p->dl, flags); - - return; - } - check_schedstat_required(); update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl); + if (p->on_rq == TASK_ON_RQ_MIGRATING) + flags |= ENQUEUE_MIGRATING; + enqueue_dl_entity(&p->dl, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_stats_dequeue_dl(&rq->dl, &p->dl, flags); - dequeue_dl_entity(&p->dl); - dequeue_pushable_dl_task(rq, p); + dequeue_dl_entity(&p->dl, flags); + + if (!p->dl.dl_throttled) + dequeue_pushable_dl_task(rq, p); } static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); - __dequeue_task_dl(rq, p, flags); - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { - sub_running_bw(&p->dl, &rq->dl); - sub_rq_bw(&p->dl, &rq->dl); - } + if (p->on_rq == TASK_ON_RQ_MIGRATING) + flags |= DEQUEUE_MIGRATING; - /* - * This check allows to start the inactive timer (or to immediately - * decrease the active utilization, if needed) in two cases: - * when the task blocks and when it is terminating - * (p->state == TASK_DEAD). 
We can handle the two cases in the same - * way, because from GRUB's point of view the same thing is happening - * (the task moves from "active contending" to "active non contending" - * or "inactive") - */ - if (flags & DEQUEUE_SLEEP) - task_non_contending(p); + __dequeue_task_dl(rq, p, flags); } /* @@ -2551,7 +2565,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * will reset the task parameters. */ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(p); + task_non_contending(&p->dl); /* * In case a task is setscheduled out from SCHED_DEADLINE we need to diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3c62df1511e7..1cda787172f0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2177,6 +2177,10 @@ extern const u32 sched_prio_to_wmult[40]; * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location * in the runqueue. * + * NOCLOCK - skip the update_rq_clock() (avoids double updates) + * + * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE) + * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup @@ -2187,6 +2191,7 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2201,6 +2206,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_MIGRATED 0x00 #endif #define ENQUEUE_INITIAL 0x80 +#define ENQUEUE_MIGRATING 0x100 #define RETRY_TASK ((void *)-1UL) -- cgit From 63ba8422f876e32ee564ea95da9a7313b13ff0a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 4 Nov 2023 11:59:21 +0100 Subject: sched/deadline: Introduce deadline servers Low priority tasks (e.g., SCHED_OTHER) can suffer starvation if tasks with higher priority (e.g., SCHED_FIFO) monopolize CPU(s). RT Throttling was introduced a while ago as a (mostly debug) countermeasure one can utilize to reserve some CPU time for low priority tasks (usually background type of work, e.g. workqueues, timers, etc.). It however has its own problems (see documentation) and the undesired effect of unconditionally throttling FIFO tasks even when no lower priority activity needs to run (there are mechanisms to fix this issue as well, but, again, with their own problems). Introduce deadline servers to service low priority tasks' needs under starvation conditions. Deadline servers are built by extending the SCHED_DEADLINE implementation to allow 2-level scheduling (a sched_deadline entity becomes a container for lower priority scheduling entities).
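To make the new interface concrete, here is a minimal sketch (illustrative only, not part of this patch) of how a served class could be wired to a server using the dl_server_*() API introduced below. The rq->fair_server field and the fair_server_*() helpers are hypothetical names; the actual fair-class wiring lands separately.

	/* Illustrative sketch -- fair_server_* and rq->fair_server are
	 * hypothetical; only the dl_server_*() calls come from this patch.
	 */
	static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
	{
		/* Keep the server running while any fair task is runnable. */
		return !!dl_se->rq->cfs.h_nr_running;
	}

	static struct task_struct *fair_server_pick(struct sched_dl_entity *dl_se)
	{
		/* Nested pick_next_task(); returning NULL yields the period. */
		return pick_next_task_fair(dl_se->rq, NULL, NULL);
	}

	void fair_server_init(struct rq *rq)
	{
		/* Once per runqueue: bind the server to its rq and callbacks. */
		dl_server_init(&rq->fair_server, rq,
			       fair_server_has_tasks, fair_server_pick);
	}

The served class would then call dl_server_start() when its first task is enqueued and dl_server_stop() when its last task leaves, while the runtime consumed by a task picked through the server is charged back via dl_server_update() from the update_curr path (see the fair.c hunk in the diff below).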
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/4968601859d920335cf85822eb573a5f179f04b8.1699095159.git.bristot@kernel.org --- include/linux/sched.h | 22 +++- kernel/sched/core.c | 17 +++ kernel/sched/deadline.c | 332 ++++++++++++++++++++++++++++++++---------------- kernel/sched/fair.c | 2 + kernel/sched/sched.h | 27 ++++ 5 files changed, 292 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 44b46d9743bf..8d258162deb0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -63,11 +63,13 @@ struct robust_list_head; struct root_domain; struct rq; struct sched_attr; +struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; +struct task_struct; struct user_event_mm; /* @@ -607,6 +609,9 @@ struct sched_rt_entity { #endif } __randomize_layout; +typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); + struct sched_dl_entity { struct rb_node rb_node; @@ -654,6 +659,7 @@ struct sched_dl_entity { unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; + unsigned int dl_server : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -668,7 +674,20 @@ struct sched_dl_entity { * timer is needed to decrease the active utilization at the correct * time. */ - struct hrtimer inactive_timer; + struct hrtimer inactive_timer; + + /* + * Bits for DL-server functionality. Also see the comment near + * dl_server_update(). + * + * @rq the runqueue this server is for + * + * @server_has_tasks() returns true if @server_pick return a + * runnable task. + */ + struct rq *rq; + dl_server_has_tasks_f server_has_tasks; + dl_server_pick_f server_pick; #ifdef CONFIG_RT_MUTEXES /* @@ -795,6 +814,7 @@ struct task_struct { struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; + struct sched_dl_entity *dl_server; const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 966631f05d71..f5f4495d1768 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3797,6 +3797,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } #endif + + p->dl_server = NULL; } /* @@ -6003,12 +6005,27 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = pick_next_task_idle(rq); } + /* + * This is the fast path; it cannot be a DL server pick; + * therefore even if @p == @prev, ->dl_server must be NULL. + */ + if (p->dl_server) + p->dl_server = NULL; + return p; } restart: put_prev_task_balance(rq, prev, rf); + /* + * We've updated @prev and no longer need the server link, clear it. + * Must be done before ->pick_next_task() because that can (re)set + * ->dl_server. 
+ */ + if (prev->dl_server) + prev->dl_server = NULL; + for_each_class(class) { p = class->pick_next_task(rq); if (p) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 81810f67df7a..a04a436af8cc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -54,8 +54,14 @@ static int __init sched_dl_sysctl_init(void) late_initcall(sched_dl_sysctl_init); #endif +static bool dl_server(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server; +} + static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { + BUG_ON(dl_server(dl_se)); return container_of(dl_se, struct task_struct, dl); } @@ -64,12 +70,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) return container_of(dl_rq, struct rq, dl); } -static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se) { - struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = task_rq(p); + struct rq *rq = dl_se->rq; + + if (!dl_server(dl_se)) + rq = task_rq(dl_task_of(dl_se)); - return &rq->dl; + return rq; +} + +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +{ + return &rq_of_dl_se(dl_se)->dl; } static inline int on_dl_rq(struct sched_dl_entity *dl_se) @@ -394,9 +407,8 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se); static void task_non_contending(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); - struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = rq_of_dl_se(dl_se); + struct dl_rq *dl_rq = &rq->dl; s64 zerolag_time; /* @@ -426,25 +438,33 @@ static void task_non_contending(struct sched_dl_entity *dl_se) * utilization now, instead of starting a timer */ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { - if (dl_task(p)) + if (dl_server(dl_se)) { sub_running_bw(dl_se, dl_rq); + } else { + struct task_struct *p = dl_task_of(dl_se); - if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + if (dl_task(p)) + sub_running_bw(dl_se, dl_rq); - if (READ_ONCE(p->__state) == TASK_DEAD) - sub_rq_bw(dl_se, &rq->dl); - raw_spin_lock(&dl_b->lock); - __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); - raw_spin_unlock(&dl_b->lock); - __dl_clear_params(dl_se); + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + if (READ_ONCE(p->__state) == TASK_DEAD) + sub_rq_bw(dl_se, &rq->dl); + raw_spin_lock(&dl_b->lock); + __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); + raw_spin_unlock(&dl_b->lock); + __dl_clear_params(dl_se); + } } return; } dl_se->dl_non_contending = 1; - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD); } @@ -471,8 +491,10 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * will not touch the rq's active utilization, * so we are still safe. 
*/ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) - put_task_struct(dl_task_of(dl_se)); + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { + if (!dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); + } } else { /* * Since "dl_non_contending" is not set, the @@ -485,10 +507,8 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) } } -static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - struct sched_dl_entity *dl_se = &p->dl; - return rb_first_cached(&dl_rq->root) == &dl_se->rb_node; } @@ -740,8 +760,10 @@ static inline void deadline_queue_pull_task(struct rq *rq) } #endif /* CONFIG_SMP */ +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags); static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, @@ -989,8 +1011,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) */ static void update_dl_entity(struct sched_dl_entity *dl_se) { - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct rq *rq = rq_of_dl_se(dl_se); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, rq_clock(rq))) { @@ -1021,11 +1042,11 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se) * actually started or not (i.e., the replenishment instant is in * the future or in the past). */ -static int start_dl_timer(struct task_struct *p) +static int start_dl_timer(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; struct hrtimer *timer = &dl_se->dl_timer; - struct rq *rq = task_rq(p); + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); ktime_t now, act; s64 delta; @@ -1059,13 +1080,33 @@ static int start_dl_timer(struct task_struct *p) * and observe our state. */ if (!hrtimer_is_queued(timer)) { - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD); } return 1; } +static void __push_dl_task(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, check if we need + * to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) { + /* + * Nothing relies on rq->lock after this, so its safe to drop + * rq->lock. + */ + rq_unpin_lock(rq, rf); + push_dl_task(rq); + rq_repin_lock(rq, rf); + } +#endif +} + /* * This is the bandwidth enforcement timer callback. 
If here, we know * a task is not on its dl_rq, since the fact that the timer was running @@ -1084,10 +1125,34 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity *dl_se = container_of(timer, struct sched_dl_entity, dl_timer); - struct task_struct *p = dl_task_of(dl_se); + struct task_struct *p; struct rq_flags rf; struct rq *rq; + if (dl_server(dl_se)) { + struct rq *rq = rq_of_dl_se(dl_se); + struct rq_flags rf; + + rq_lock(rq, &rf); + if (dl_se->dl_throttled) { + sched_clock_tick(); + update_rq_clock(rq); + + if (dl_se->server_has_tasks(dl_se)) { + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + resched_curr(rq); + __push_dl_task(rq, &rf); + } else { + replenish_dl_entity(dl_se); + } + + } + rq_unlock(rq, &rf); + + return HRTIMER_NORESTART; + } + + p = dl_task_of(dl_se); rq = task_rq_lock(p, &rf); /* @@ -1158,21 +1223,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) else resched_curr(rq); -#ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, check if we need - * to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) { - /* - * Nothing relies on rq->lock after this, so its safe to drop - * rq->lock. - */ - rq_unpin_lock(rq, &rf); - push_dl_task(rq); - rq_repin_lock(rq, &rf); - } -#endif + __push_dl_task(rq, &rf); unlock: task_rq_unlock(rq, p, &rf); @@ -1214,12 +1265,11 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se) */ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) { - struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se)); + struct rq *rq = rq_of_dl_se(dl_se); if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1270,29 +1320,13 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) return (delta * u_act) >> BW_SHIFT; } -/* - * Update the current task's runtime statistics (provided it is still - * a -deadline task and has not been removed from the dl_rq). - */ -static void update_curr_dl(struct rq *rq) +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags); +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { - struct task_struct *curr = rq->curr; - struct sched_dl_entity *dl_se = &curr->dl; - s64 delta_exec, scaled_delta_exec; - int cpu = cpu_of(rq); - - if (!dl_task(curr) || !on_dl_rq(dl_se)) - return; + s64 scaled_delta_exec; - /* - * Consumed budget is computed considering the time as - * observed by schedulable tasks (excluding time spent - * in hardirq context, etc.). Deadlines are instead - * computed using hard walltime. This seems to be the more - * natural solution, but the full ramifications of this - * approach need further study. - */ - delta_exec = update_curr_common(rq); if (unlikely(delta_exec <= 0)) { if (unlikely(dl_se->dl_yielded)) goto throttle; @@ -1310,10 +1344,9 @@ static void update_curr_dl(struct rq *rq) * according to current frequency and CPU maximum capacity. 
*/ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { - scaled_delta_exec = grub_reclaim(delta_exec, - rq, - &curr->dl); + scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se); } else { + int cpu = cpu_of(rq); unsigned long scale_freq = arch_scale_freq_capacity(cpu); unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); @@ -1332,11 +1365,20 @@ throttle: (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) dl_se->dl_overrun = 1; - __dequeue_task_dl(rq, curr, 0); - if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) - enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + dequeue_dl_entity(dl_se, 0); + if (!dl_server(dl_se)) { + update_stats_dequeue_dl(&rq->dl, dl_se, 0); + dequeue_pushable_dl_task(rq, dl_task_of(dl_se)); + } - if (!is_leftmost(curr, &rq->dl)) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) { + if (dl_server(dl_se)) + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + else + enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH); + } + + if (!is_leftmost(dl_se, &rq->dl)) resched_curr(rq); } @@ -1366,20 +1408,82 @@ throttle: } } +void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) +{ + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); +} + +void dl_server_start(struct sched_dl_entity *dl_se) +{ + if (!dl_server(dl_se)) { + dl_se->dl_server = 1; + setup_new_dl_entity(dl_se); + } + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); +} + +void dl_server_stop(struct sched_dl_entity *dl_se) +{ + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); +} + +void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, + dl_server_pick_f pick) +{ + dl_se->rq = rq; + dl_se->server_has_tasks = has_tasks; + dl_se->server_pick = pick; +} + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). + */ +static void update_curr_dl(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_dl_entity *dl_se = &curr->dl; + s64 delta_exec; + + if (!dl_task(curr) || !on_dl_rq(dl_se)) + return; + + /* + * Consumed budget is computed considering the time as + * observed by schedulable tasks (excluding time spent + * in hardirq context, etc.). Deadlines are instead + * computed using hard walltime. This seems to be the more + * natural solution, but the full ramifications of this + * approach need further study. 
+ */ + delta_exec = update_curr_common(rq); + update_curr_dl_se(rq, dl_se, delta_exec); +} + static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) { struct sched_dl_entity *dl_se = container_of(timer, struct sched_dl_entity, inactive_timer); - struct task_struct *p = dl_task_of(dl_se); + struct task_struct *p = NULL; struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(p, &rf); + if (!dl_server(dl_se)) { + p = dl_task_of(dl_se); + rq = task_rq_lock(p, &rf); + } else { + rq = dl_se->rq; + rq_lock(rq, &rf); + } sched_clock_tick(); update_rq_clock(rq); + if (dl_server(dl_se)) + goto no_task; + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); @@ -1396,14 +1500,21 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) goto unlock; } + +no_task: if (dl_se->dl_non_contending == 0) goto unlock; sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: - task_rq_unlock(rq, p, &rf); - put_task_struct(p); + + if (!dl_server(dl_se)) { + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + } else { + rq_unlock(rq, &rf); + } return HRTIMER_NORESTART; } @@ -1466,10 +1577,8 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio = dl_task_of(dl_se)->prio; u64 deadline = dl_se->deadline; - WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); @@ -1479,9 +1588,6 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static inline void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio = dl_task_of(dl_se)->prio; - - WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); @@ -1648,8 +1754,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && - dl_time_before(dl_se->deadline, - rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { + dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { setup_new_dl_entity(dl_se); } @@ -1730,19 +1835,13 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_dl_entity(&p->dl, flags); + if (dl_server(&p->dl)) + return; + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) -{ - update_stats_dequeue_dl(&rq->dl, &p->dl, flags); - dequeue_dl_entity(&p->dl, flags); - - if (!p->dl.dl_throttled) - dequeue_pushable_dl_task(rq, p); -} - static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); @@ -1750,7 +1849,9 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (p->on_rq == TASK_ON_RQ_MIGRATING) flags |= DEQUEUE_MIGRATING; - __dequeue_task_dl(rq, p, flags); + dequeue_dl_entity(&p->dl, flags); + if (!p->dl.dl_throttled && !dl_server(&p->dl)) + dequeue_pushable_dl_task(rq, p); } /* @@ -1940,12 +2041,12 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SCHED_HRTICK -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, dl_se->runtime); } #else /* !CONFIG_SCHED_HRTICK */ -static void 
start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } #endif @@ -1965,9 +2066,6 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (hrtick_enabled_dl(rq)) - start_hrtick_dl(rq, p); - if (rq->curr->sched_class != &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); @@ -1990,12 +2088,25 @@ static struct task_struct *pick_task_dl(struct rq *rq) struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; +again: if (!sched_dl_runnable(rq)) return NULL; dl_se = pick_next_dl_entity(dl_rq); WARN_ON_ONCE(!dl_se); - p = dl_task_of(dl_se); + + if (dl_server(dl_se)) { + p = dl_se->server_pick(dl_se); + if (!p) { + WARN_ON_ONCE(1); + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + goto again; + } + p->dl_server = dl_se; + } else { + p = dl_task_of(dl_se); + } return p; } @@ -2005,9 +2116,15 @@ static struct task_struct *pick_next_task_dl(struct rq *rq) struct task_struct *p; p = pick_task_dl(rq); - if (p) + if (!p) + return p; + + if (!p->dl_server) set_next_task_dl(rq, p, true); + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, &p->dl); + return p; } @@ -2045,8 +2162,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) * be set and schedule() will start a new hrtick for the next task. */ if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && - is_leftmost(p, &rq->dl)) - start_hrtick_dl(rq, p); + is_leftmost(&p->dl, &rq->dl)) + start_hrtick_dl(rq, &p->dl); } static void task_fork_dl(struct task_struct *p) @@ -2986,6 +3103,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se) dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + dl_se->dl_server = 0; #ifdef CONFIG_RT_MUTEXES dl_se->pi_se = dl_se; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1cd92b11b289..07f555857698 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1131,6 +1131,8 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) trace_sched_stat_runtime(p, delta_exec); account_group_exec_runtime(p, delta_exec); cgroup_account_cputime(p, delta_exec); + if (p->dl_server) + dl_server_update(p->dl_server, delta_exec); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1cda787172f0..8a70d51ffa33 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -313,6 +313,33 @@ extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *att extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int dl_bw_check_overflow(int cpu); +/* + * SCHED_DEADLINE supports servers (nested scheduling) with the following + * interface: + * + * dl_se::rq -- runqueue we belong to. + * + * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the + * server when it runs out of tasks to run. + * + * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this + * returns NULL. + * + * dl_server_update() -- called from update_curr_common(), propagates runtime + * to the server. + * + * dl_server_start() + * dl_server_stop() -- start/stop the server when it has (no) tasks. + * + * dl_server_init() -- initializes the server. 
+ */ +extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); +extern void dl_server_start(struct sched_dl_entity *dl_se); +extern void dl_server_stop(struct sched_dl_entity *dl_se); +extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, + dl_server_pick_f pick); + #ifdef CONFIG_CGROUP_SCHED struct cfs_rq; -- cgit From dd5403869a40595eb953f12e8cd2bb57bb88bb67 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 14 Nov 2023 14:38:39 -0500 Subject: sched/cpuidle: Comment about timers requirements VS idle handler Add missing explanation concerning IRQ re-enablement constraints in the cpuidle path against timers. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/20231114193840.4041-2-frederic@kernel.org --- kernel/sched/idle.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 565f8374ddbb..31231925f1ec 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -258,6 +258,36 @@ static void do_idle(void) while (!need_resched()) { rmb(); + /* + * Interrupts shouldn't be re-enabled from that point on until + * the CPU sleeping instruction is reached. Otherwise an interrupt + * may fire and queue a timer that would be ignored until the CPU + * wakes from the sleeping instruction. And testing need_resched() + * doesn't tell about pending needed timer reprogram. + * + * Several cases to consider: + * + * - SLEEP-UNTIL-PENDING-INTERRUPT based instructions such as + * "wfi" or "mwait" are fine because they can be entered with + * interrupt disabled. + * + * - sti;mwait() couple is fine because the interrupts are + * re-enabled only upon the execution of mwait, leaving no gap + * in-between. + * + * - ROLLBACK based idle handlers with the sleeping instruction + * called with interrupts enabled are NOT fine. In this scheme + * when the interrupt detects it has interrupted an idle handler, + * it rolls back to its beginning which performs the + * need_resched() check before re-executing the sleeping + * instruction. This can leak a pending needed timer reprogram. + * If such a scheme is really mandatory due to the lack of an + * appropriate CPU sleeping instruction, then a FAST-FORWARD + * must instead be applied: when the interrupt detects it has + * interrupted an idle handler, it must resume to the end of + * this idle handler so that the generic idle loop is iterated + * again to reprogram the tick. + */ local_irq_disable(); if (cpu_is_offline(cpu)) { -- cgit From 194600008d5c43b5a4ba98c4b81633397e34ffad Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 14 Nov 2023 14:38:40 -0500 Subject: sched/timers: Explain why idle task schedules out on remote timer enqueue Trying to avoid that didn't bring much value after testing; add a comment about this. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J.
Wysocki Link: https://lkml.kernel.org/r/20231114193840.4041-3-frederic@kernel.org --- kernel/sched/core.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5f4495d1768..2de77a6d5ef8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1131,6 +1131,28 @@ static void wake_up_idle_cpu(int cpu) if (cpu == smp_processor_id()) return; + /* + * Set TIF_NEED_RESCHED and send an IPI if in the non-polling + * part of the idle loop. This forces an exit from the idle loop + * and a round trip to schedule(). Now this could be optimized + * because a simple new idle loop iteration is enough to + * re-evaluate the next tick. Provided some re-ordering of tick + * nohz functions that would need to follow TIF_NR_POLLING + * clearing: + * + * - On most archs, a simple fetch_or on ti::flags with a + * "0" value would be enough to know if an IPI needs to be sent. + * + * - x86 needs to perform a last need_resched() check between + * monitor and mwait which doesn't take timers into account. + * There a dedicated TIF_TIMER flag would be required to + * fetch_or here and be checked along with TIF_NEED_RESCHED + * before mwait(). + * + * However, remote timer enqueue is not such a frequent event + * and testing of the above solutions didn't appear to report + * much benefits. + */ if (set_nr_and_not_polling(rq->idle)) smp_send_reschedule(cpu); else -- cgit From 652ffc2104ec1f69dd4a46313888c33527145ccf Mon Sep 17 00:00:00 2001 From: Greg KH Date: Mon, 12 Jun 2023 15:09:09 +0200 Subject: perf/core: Fix narrow startup race when creating the perf nr_addr_filters sysfs file Signed-off-by: Greg Kroah-Hartman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/2023061204-decal-flyable-6090@gregkh --- kernel/events/core.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 08250981d9f4..4f0c45ab8d7d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11408,9 +11408,32 @@ static DEVICE_ATTR_RW(perf_event_mux_interval_ms); static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, + &dev_attr_nr_addr_filters.attr, + NULL, +}; + +static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pmu *pmu = dev_get_drvdata(dev); + + if (!pmu->nr_addr_filters) + return 0; + + return a->mode; + + return 0; +} + +static struct attribute_group pmu_dev_attr_group = { + .is_visible = pmu_dev_is_visible, + .attrs = pmu_dev_attrs, +}; + +static const struct attribute_group *pmu_dev_groups[] = { + &pmu_dev_attr_group, NULL, }; -ATTRIBUTE_GROUPS(pmu_dev); static int pmu_bus_running; static struct bus_type pmu_bus = { @@ -11447,18 +11470,11 @@ static int pmu_dev_alloc(struct pmu *pmu) if (ret) goto free_dev; - /* For PMUs with address filters, throw in an extra attribute: */ - if (pmu->nr_addr_filters) - ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); - - if (ret) - goto del_dev; - - if (pmu->attr_update) + if (pmu->attr_update) { ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); - - if (ret) - goto del_dev; + if (ret) + goto del_dev; + } out: return ret; -- cgit From 67420501e8681ae18f9f0ea0a69cd2f432100e70 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:05:57 -0800 Subject: bpf: generalize 
reg_set_min_max() to handle non-const register comparisons Generalize bounds adjustment logic of reg_set_min_max() to handle not just register vs constant case, but in general any register vs any register cases. For most of the operations it's a trivial extension based on range vs range comparison logic; we just need to properly pick min/max of a range to compare against min/max of the other range. For BPF_JSET we keep the original capabilities, just make sure JSET is integrated in the common framework. This is manifested in the internal-only BPF_JSET + BPF_X "opcode" to allow for simpler and more uniform rev_opcode() handling. See the code for details. This allows reusing the same code exactly both for TRUE and FALSE branches without explicitly handling both conditions with custom code. Note also that now we don't need special handling of the BPF_JEQ/BPF_JNE case when none of the registers are constants. This is now just a normal generic case handled by reg_set_min_max(). To make tnum handling cleaner, tnum_with_subreg() helper is added, as that's a common operator when dealing with 32-bit subregister bounds. This keeps the overall logic much less noisy when it comes to tnums. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231112010609.848406-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 4 + kernel/bpf/tnum.c | 7 +- kernel/bpf/verifier.c | 314 ++++++++++++++++++++++---------------------- 3 files changed, 146 insertions(+), 179 deletions(-) (limited to 'kernel') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 1c3948a1d6ad..3c13240077b8 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -106,6 +106,10 @@ int tnum_sbin(char *str, size_t size, struct tnum a); struct tnum tnum_subreg(struct tnum a); /* Returns the tnum with the lower 32-bit subreg cleared */ struct tnum tnum_clear_subreg(struct tnum a); +/* Returns the tnum with the lower 32-bit subreg in *reg* set to the lower + * 32-bit subreg in *subreg* + */ +struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg); /* Returns the tnum with the lower 32-bit subreg set to value */ struct tnum tnum_const_subreg(struct tnum a, u32 value); /* Returns true if 32-bit subreg @a is a known constant*/ diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 3d7127f439a1..f4c91c9b27d7 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -208,7 +208,12 @@ struct tnum tnum_clear_subreg(struct tnum a) return tnum_lshift(tnum_rshift(a, 32), 32); } +struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg) +{ + return tnum_or(tnum_clear_subreg(reg), tnum_subreg(subreg)); +} + struct tnum tnum_const_subreg(struct tnum a, u32 value) { - return tnum_or(tnum_clear_subreg(a), tnum_const(value)); + return tnum_with_subreg(a, tnum_const(value)); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9ae6eae13471..39ce141c55d3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14453,218 +14453,186 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32); } -/* Adjusts the register min/max values in the case that the dst_reg and - * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K - * check, in which case we havea fake SCALAR_VALUE representing insn->imm). - * Technically we can do similar adjustments for pointers to the same object, - * but we don't support that right now.
+/* Opcode that corresponds to a *false* branch condition. + * E.g., if r1 < r2, then reverse (false) condition is r1 >= r2 */ -static void reg_set_min_max(struct bpf_reg_state *true_reg1, - struct bpf_reg_state *true_reg2, - struct bpf_reg_state *false_reg1, - struct bpf_reg_state *false_reg2, - u8 opcode, bool is_jmp32) +static u8 rev_opcode(u8 opcode) { - struct tnum false_32off, false_64off; - struct tnum true_32off, true_64off; - u64 uval; - u32 uval32; - s64 sval; - s32 sval32; - - /* If either register is a pointer, we can't learn anything about its - * variable offset from the compare (unless they were a pointer into - * the same object, but we don't bother with that). + switch (opcode) { + case BPF_JEQ: return BPF_JNE; + case BPF_JNE: return BPF_JEQ; + /* JSET doesn't have it's reverse opcode in BPF, so add + * BPF_X flag to denote the reverse of that operation */ - if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) - return; - - /* we expect right-hand registers (src ones) to be constants, for now */ - if (!is_reg_const(false_reg2, is_jmp32)) { - opcode = flip_opcode(opcode); - swap(true_reg1, true_reg2); - swap(false_reg1, false_reg2); + case BPF_JSET: return BPF_JSET | BPF_X; + case BPF_JSET | BPF_X: return BPF_JSET; + case BPF_JGE: return BPF_JLT; + case BPF_JGT: return BPF_JLE; + case BPF_JLE: return BPF_JGT; + case BPF_JLT: return BPF_JGE; + case BPF_JSGE: return BPF_JSLT; + case BPF_JSGT: return BPF_JSLE; + case BPF_JSLE: return BPF_JSGT; + case BPF_JSLT: return BPF_JSGE; + default: return 0; } - if (!is_reg_const(false_reg2, is_jmp32)) - return; +} - false_32off = tnum_subreg(false_reg1->var_off); - false_64off = false_reg1->var_off; - true_32off = tnum_subreg(true_reg1->var_off); - true_64off = true_reg1->var_off; - uval = false_reg2->var_off.value; - uval32 = (u32)tnum_subreg(false_reg2->var_off).value; - sval = (s64)uval; - sval32 = (s32)uval32; +/* Refine range knowledge for 2 conditional operation. */ +static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) +{ + struct tnum t; + u64 val; +again: switch (opcode) { - /* JEQ/JNE comparison doesn't change the register equivalence. - * - * r1 = r2; - * if (r1 == 42) goto label; - * ... - * label: // here both r1 and r2 are known to be 42. - * - * Hence when marking register as known preserve it's ID. 
- */ case BPF_JEQ: if (is_jmp32) { - __mark_reg32_known(true_reg1, uval32); - true_32off = tnum_subreg(true_reg1->var_off); + reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); + reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); + reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); + reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); + reg2->u32_min_value = reg1->u32_min_value; + reg2->u32_max_value = reg1->u32_max_value; + reg2->s32_min_value = reg1->s32_min_value; + reg2->s32_max_value = reg1->s32_max_value; + + t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); + reg1->var_off = tnum_with_subreg(reg1->var_off, t); + reg2->var_off = tnum_with_subreg(reg2->var_off, t); } else { - ___mark_reg_known(true_reg1, uval); - true_64off = true_reg1->var_off; + reg1->umin_value = max(reg1->umin_value, reg2->umin_value); + reg1->umax_value = min(reg1->umax_value, reg2->umax_value); + reg1->smin_value = max(reg1->smin_value, reg2->smin_value); + reg1->smax_value = min(reg1->smax_value, reg2->smax_value); + reg2->umin_value = reg1->umin_value; + reg2->umax_value = reg1->umax_value; + reg2->smin_value = reg1->smin_value; + reg2->smax_value = reg1->smax_value; + + reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off); + reg2->var_off = reg1->var_off; } break; case BPF_JNE: - if (is_jmp32) { - __mark_reg32_known(false_reg1, uval32); - false_32off = tnum_subreg(false_reg1->var_off); - } else { - ___mark_reg_known(false_reg1, uval); - false_64off = false_reg1->var_off; - } + /* we don't derive any new information for inequality yet */ break; case BPF_JSET: + if (!is_reg_const(reg2, is_jmp32)) + swap(reg1, reg2); + if (!is_reg_const(reg2, is_jmp32)) + break; + val = reg_const_value(reg2, is_jmp32); + /* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X) + * requires single bit to learn something useful. E.g., if we + * know that `r1 & 0x3` is true, then which bits (0, 1, or both) + * are actually set? We can learn something definite only if + * it's a single-bit value to begin with. + * + * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have + * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor + * bit 1 is set, which we can readily use in adjustments. + */ + if (!is_power_of_2(val)) + break; if (is_jmp32) { - false_32off = tnum_and(false_32off, tnum_const(~uval32)); - if (is_power_of_2(uval32)) - true_32off = tnum_or(true_32off, - tnum_const(uval32)); + t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val)); + reg1->var_off = tnum_with_subreg(reg1->var_off, t); } else { - false_64off = tnum_and(false_64off, tnum_const(~uval)); - if (is_power_of_2(uval)) - true_64off = tnum_or(true_64off, - tnum_const(uval)); + reg1->var_off = tnum_or(reg1->var_off, tnum_const(val)); } break; - case BPF_JGE: - case BPF_JGT: - { + case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */ + if (!is_reg_const(reg2, is_jmp32)) + swap(reg1, reg2); + if (!is_reg_const(reg2, is_jmp32)) + break; + val = reg_const_value(reg2, is_jmp32); if (is_jmp32) { - u32 false_umax = opcode == BPF_JGT ? uval32 : uval32 - 1; - u32 true_umin = opcode == BPF_JGT ? uval32 + 1 : uval32; - - false_reg1->u32_max_value = min(false_reg1->u32_max_value, - false_umax); - true_reg1->u32_min_value = max(true_reg1->u32_min_value, - true_umin); + t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val)); + reg1->var_off = tnum_with_subreg(reg1->var_off, t); } else { - u64 false_umax = opcode == BPF_JGT ? 
uval : uval - 1; - u64 true_umin = opcode == BPF_JGT ? uval + 1 : uval; - - false_reg1->umax_value = min(false_reg1->umax_value, false_umax); - true_reg1->umin_value = max(true_reg1->umin_value, true_umin); + reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val)); } break; - } - case BPF_JSGE: - case BPF_JSGT: - { + case BPF_JLE: if (is_jmp32) { - s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1; - s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32; - - false_reg1->s32_max_value = min(false_reg1->s32_max_value, false_smax); - true_reg1->s32_min_value = max(true_reg1->s32_min_value, true_smin); + reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); + reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); } else { - s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; - s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; - - false_reg1->smax_value = min(false_reg1->smax_value, false_smax); - true_reg1->smin_value = max(true_reg1->smin_value, true_smin); + reg1->umax_value = min(reg1->umax_value, reg2->umax_value); + reg2->umin_value = max(reg1->umin_value, reg2->umin_value); } break; - } - case BPF_JLE: case BPF_JLT: - { if (is_jmp32) { - u32 false_umin = opcode == BPF_JLT ? uval32 : uval32 + 1; - u32 true_umax = opcode == BPF_JLT ? uval32 - 1 : uval32; - - false_reg1->u32_min_value = max(false_reg1->u32_min_value, - false_umin); - true_reg1->u32_max_value = min(true_reg1->u32_max_value, - true_umax); + reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1); + reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value); } else { - u64 false_umin = opcode == BPF_JLT ? uval : uval + 1; - u64 true_umax = opcode == BPF_JLT ? uval - 1 : uval; - - false_reg1->umin_value = max(false_reg1->umin_value, false_umin); - true_reg1->umax_value = min(true_reg1->umax_value, true_umax); + reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1); + reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value); } break; - } case BPF_JSLE: + if (is_jmp32) { + reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); + reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); + } else { + reg1->smax_value = min(reg1->smax_value, reg2->smax_value); + reg2->smin_value = max(reg1->smin_value, reg2->smin_value); + } + break; case BPF_JSLT: - { if (is_jmp32) { - s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1; - s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32; - - false_reg1->s32_min_value = max(false_reg1->s32_min_value, false_smin); - true_reg1->s32_max_value = min(true_reg1->s32_max_value, true_smax); + reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1); + reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value); } else { - s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; - s64 true_smax = opcode == BPF_JSLT ? 
sval - 1 : sval; - - false_reg1->smin_value = max(false_reg1->smin_value, false_smin); - true_reg1->smax_value = min(true_reg1->smax_value, true_smax); + reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1); + reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value); } break; - } + case BPF_JGE: + case BPF_JGT: + case BPF_JSGE: + case BPF_JSGT: + /* just reuse LE/LT logic above */ + opcode = flip_opcode(opcode); + swap(reg1, reg2); + goto again; default: return; } - - if (is_jmp32) { - false_reg1->var_off = tnum_or(tnum_clear_subreg(false_64off), - tnum_subreg(false_32off)); - true_reg1->var_off = tnum_or(tnum_clear_subreg(true_64off), - tnum_subreg(true_32off)); - reg_bounds_sync(false_reg1); - reg_bounds_sync(true_reg1); - } else { - false_reg1->var_off = false_64off; - true_reg1->var_off = true_64off; - reg_bounds_sync(false_reg1); - reg_bounds_sync(true_reg1); - } -} - -/* Regs are known to be equal, so intersect their min/max/var_off */ -static void __reg_combine_min_max(struct bpf_reg_state *src_reg, - struct bpf_reg_state *dst_reg) -{ - src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value, - dst_reg->umin_value); - src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value, - dst_reg->umax_value); - src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value, - dst_reg->smin_value); - src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value, - dst_reg->smax_value); - src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off, - dst_reg->var_off); - reg_bounds_sync(src_reg); - reg_bounds_sync(dst_reg); } -static void reg_combine_min_max(struct bpf_reg_state *true_src, - struct bpf_reg_state *true_dst, - struct bpf_reg_state *false_src, - struct bpf_reg_state *false_dst, - u8 opcode) +/* Adjusts the register min/max values in the case that the dst_reg and + * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K + * check, in which case we havea fake SCALAR_VALUE representing insn->imm). + * Technically we can do similar adjustments for pointers to the same object, + * but we don't support that right now. + */ +static void reg_set_min_max(struct bpf_reg_state *true_reg1, + struct bpf_reg_state *true_reg2, + struct bpf_reg_state *false_reg1, + struct bpf_reg_state *false_reg2, + u8 opcode, bool is_jmp32) { - switch (opcode) { - case BPF_JEQ: - __reg_combine_min_max(true_src, true_dst); - break; - case BPF_JNE: - __reg_combine_min_max(false_src, false_dst); - break; - } + /* If either register is a pointer, we can't learn anything about its + * variable offset from the compare (unless they were a pointer into + * the same object, but we don't bother with that). 
+ */ + if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) + return; + + /* fallthrough (FALSE) branch */ + regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32); + reg_bounds_sync(false_reg1); + reg_bounds_sync(false_reg2); + + /* jump (TRUE) branch */ + regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32); + reg_bounds_sync(true_reg1); + reg_bounds_sync(true_reg2); } static void mark_ptr_or_null_reg(struct bpf_func_state *state, @@ -14961,22 +14929,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, reg_set_min_max(&other_branch_regs[insn->dst_reg], &other_branch_regs[insn->src_reg], dst_reg, src_reg, opcode, is_jmp32); - - if (dst_reg->type == SCALAR_VALUE && - src_reg->type == SCALAR_VALUE && - !is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE)) { - /* Comparing for equality, we can combine knowledge */ - reg_combine_min_max(&other_branch_regs[insn->src_reg], - &other_branch_regs[insn->dst_reg], - src_reg, dst_reg, opcode); - } } else /* BPF_SRC(insn->code) == BPF_K */ { reg_set_min_max(&other_branch_regs[insn->dst_reg], src_reg /* fake one */, dst_reg, src_reg /* same fake one */, opcode, is_jmp32); } - if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { -- cgit From 96381879a370425a30b810906946f64c0726450e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:05:58 -0800 Subject: bpf: generalize is_scalar_branch_taken() logic Generalize is_branch_taken logic for SCALAR_VALUE register to handle cases when both registers are not constants. Previously supported range vs constant cases are a natural subset of the more generic range vs range set of cases. Generalized logic relies on straightforward segment intersection checks. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231112010609.848406-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 98 ++++++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 39ce141c55d3..f459ad99256e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14261,82 +14261,99 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta u8 opcode, bool is_jmp32) { struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off; + struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value; u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value; s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value; s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value; - u64 uval = is_jmp32 ? (u32)tnum_subreg(reg2->var_off).value : reg2->var_off.value; - s64 sval = is_jmp32 ? (s32)uval : (s64)uval; + u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value; + u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value; + s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; + s64 smax2 = is_jmp32 ?
(s64)reg2->s32_max_value : reg2->smax_value; switch (opcode) { case BPF_JEQ: - if (tnum_is_const(t1)) - return !!tnum_equals_const(t1, uval); - else if (uval < umin1 || uval > umax1) + /* constants, umin/umax and smin/smax checks would be + * redundant in this case because they all should match + */ + if (tnum_is_const(t1) && tnum_is_const(t2)) + return t1.value == t2.value; + /* non-overlapping ranges */ + if (umin1 > umax2 || umax1 < umin2) return 0; - else if (sval < smin1 || sval > smax1) + if (smin1 > smax2 || smax1 < smin2) return 0; break; case BPF_JNE: - if (tnum_is_const(t1)) - return !tnum_equals_const(t1, uval); - else if (uval < umin1 || uval > umax1) + /* constants, umin/umax and smin/smax checks would be + * redundant in this case because they all should match + */ + if (tnum_is_const(t1) && tnum_is_const(t2)) + return t1.value != t2.value; + /* non-overlapping ranges */ + if (umin1 > umax2 || umax1 < umin2) return 1; - else if (sval < smin1 || sval > smax1) + if (smin1 > smax2 || smax1 < smin2) return 1; break; case BPF_JSET: - if ((~t1.mask & t1.value) & uval) + if (!is_reg_const(reg2, is_jmp32)) { + swap(reg1, reg2); + swap(t1, t2); + } + if (!is_reg_const(reg2, is_jmp32)) + return -1; + if ((~t1.mask & t1.value) & t2.value) return 1; - if (!((t1.mask | t1.value) & uval)) + if (!((t1.mask | t1.value) & t2.value)) return 0; break; case BPF_JGT: - if (umin1 > uval ) + if (umin1 > umax2) return 1; - else if (umax1 <= uval) + else if (umax1 <= umin2) return 0; break; case BPF_JSGT: - if (smin1 > sval) + if (smin1 > smax2) return 1; - else if (smax1 <= sval) + else if (smax1 <= smin2) return 0; break; case BPF_JLT: - if (umax1 < uval) + if (umax1 < umin2) return 1; - else if (umin1 >= uval) + else if (umin1 >= umax2) return 0; break; case BPF_JSLT: - if (smax1 < sval) + if (smax1 < smin2) return 1; - else if (smin1 >= sval) + else if (smin1 >= smax2) return 0; break; case BPF_JGE: - if (umin1 >= uval) + if (umin1 >= umax2) return 1; - else if (umax1 < uval) + else if (umax1 < umin2) return 0; break; case BPF_JSGE: - if (smin1 >= sval) + if (smin1 >= smax2) return 1; - else if (smax1 < sval) + else if (smax1 < smin2) return 0; break; case BPF_JLE: - if (umax1 <= uval) + if (umax1 <= umin2) return 1; - else if (umin1 > uval) + else if (umin1 > umax2) return 0; break; case BPF_JSLE: - if (smax1 <= sval) + if (smax1 <= smin2) return 1; - else if (smin1 > sval) + else if (smin1 > smax2) return 0; break; } @@ -14415,28 +14432,28 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32) { - u64 val; - if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32) return is_pkt_ptr_branch_taken(reg1, reg2, opcode); - /* try to make sure reg2 is a constant SCALAR_VALUE */ - if (!is_reg_const(reg2, is_jmp32)) { - opcode = flip_opcode(opcode); - swap(reg1, reg2); - } - /* for now we expect reg2 to be a constant to make any useful decisions */ - if (!is_reg_const(reg2, is_jmp32)) - return -1; - val = reg_const_value(reg2, is_jmp32); + if (__is_pointer_value(false, reg1) || __is_pointer_value(false, reg2)) { + u64 val; + + /* arrange that reg2 is a scalar, and reg1 is a pointer */ + if (!is_reg_const(reg2, is_jmp32)) { + opcode = flip_opcode(opcode); + swap(reg1, reg2); + } + /* and ensure that reg2 is a constant */ + if (!is_reg_const(reg2, is_jmp32)) + return -1; - if (__is_pointer_value(false, reg1)) { if (!reg_not_null(reg1)) return -1; /* If pointer is 
valid tests against zero will fail so we can * use this to direct branch taken. */ + val = reg_const_value(reg2, is_jmp32); if (val != 0) return -1; @@ -14450,6 +14467,7 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg } } + /* now deal with two scalars, but not necessarily constants */ return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32); } -- cgit From be41a203bb9e0159099e189e510388fe61962eb8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:05:59 -0800 Subject: bpf: enhance BPF_JEQ/BPF_JNE is_branch_taken logic Use 32-bit subranges to prune some 64-bit BPF_JEQ/BPF_JNE conditions that otherwise would be "inconclusive" (i.e., is_branch_taken() would return -1). This can happen, for example, when registers are initialized as 64-bit u64/s64, then compared for inequality as 32-bit subregisters, and then followed by 64-bit equality/inequality check. That 32-bit inequality can establish some pattern for lower 32 bits of a register (e.g., s< 0 condition determines whether the bit #31 is zero or not), while overall 64-bit value could be anything (according to a value range representation). This is not a fancy quirky special case, but actually a handling that's necessary to prevent correctness issue with BPF verifier's range tracking: set_range_min_max() assumes that register ranges are non-overlapping, and if that condition is not guaranteed by is_branch_taken() we can end up with invalid ranges, where min > max. [0] https://lore.kernel.org/bpf/CACkBjsY2q1_fUohD7hRmKGqv1MV=eP2f6XK8kjkYNw7BaiF8iQ@mail.gmail.com/ Acked-by: Shung-Hsi Yu Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231112010609.848406-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f459ad99256e..65570eedfe88 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14283,6 +14283,18 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta return 0; if (smin1 > smax2 || smax1 < smin2) return 0; + if (!is_jmp32) { + /* if 64-bit ranges are inconclusive, see if we can + * utilize 32-bit subrange knowledge to eliminate + * branches that can't be taken a priori + */ + if (reg1->u32_min_value > reg2->u32_max_value || + reg1->u32_max_value < reg2->u32_min_value) + return 0; + if (reg1->s32_min_value > reg2->s32_max_value || + reg1->s32_max_value < reg2->s32_min_value) + return 0; + } break; case BPF_JNE: /* constants, umin/umax and smin/smax checks would be @@ -14295,6 +14307,18 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta return 1; if (smin1 > smax2 || smax1 < smin2) return 1; + if (!is_jmp32) { + /* if 64-bit ranges are inconclusive, see if we can + * utilize 32-bit subrange knowledge to eliminate + * branches that can't be taken a priori + */ + if (reg1->u32_min_value > reg2->u32_max_value || + reg1->u32_max_value < reg2->u32_min_value) + return 1; + if (reg1->s32_min_value > reg2->s32_max_value || + reg1->s32_max_value < reg2->s32_min_value) + return 1; + } break; case BPF_JSET: if (!is_reg_const(reg2, is_jmp32)) { -- cgit From 5f99f312bd3bedb3b266b0d26376a8c500cdc97f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:06:00 -0800 Subject: bpf: add register bounds sanity checks and sanitization Add simple sanity checks that validate well-formed ranges 
(min <= max) across u64, s64, u32, and s32 ranges. Also for cases when the value is constant (either 64-bit or 32-bit), we validate that ranges and tnums are in agreement. These bounds checks are performed at the end of BPF_ALU/BPF_ALU64 operations, on conditional jumps, and for LDX instructions (where subreg zero/sign extension is probably the most important to check). This covers most of the interesting cases. Also, we validate the sanity of the return register when manually adjusting it for some special helpers. By default, sanity violation will trigger a warning in verifier log and resetting register bounds to "unbounded" ones. But to aid development and debugging, BPF_F_TEST_SANITY_STRICT flag is added, which will trigger hard failure of verification with -EFAULT on register bounds violations. This allows selftests to catch such issues. veristat will also gain a CLI option to enable this behavior. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231112010609.848406-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + include/uapi/linux/bpf.h | 3 ++ kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 117 ++++++++++++++++++++++++++++++++--------- tools/include/uapi/linux/bpf.h | 3 ++ 5 files changed, 101 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 24213a99cc79..402b6bc44a1b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -602,6 +602,7 @@ struct bpf_verifier_env { int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ bool test_state_freq; /* test verifier with different pruning frequency */ + bool test_sanity_strict; /* fail verification on sanity violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7cf8bcf9f6a2..8a5855fcee69 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1200,6 +1200,9 @@ enum bpf_perf_event_type { */ #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_SANITY_STRICT (1U << 7) + /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. 
*/ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0ed286b8a0f0..f266e03ba342 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2573,7 +2573,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32 | BPF_F_XDP_HAS_FRAGS | - BPF_F_XDP_DEV_BOUND_ONLY)) + BPF_F_XDP_DEV_BOUND_ONLY | + BPF_F_TEST_SANITY_STRICT)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 65570eedfe88..e7edacf86e0f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2615,6 +2615,56 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) __update_reg_bounds(reg); } +static int reg_bounds_sanity_check(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, const char *ctx) +{ + const char *msg; + + if (reg->umin_value > reg->umax_value || + reg->smin_value > reg->smax_value || + reg->u32_min_value > reg->u32_max_value || + reg->s32_min_value > reg->s32_max_value) { + msg = "range bounds violation"; + goto out; + } + + if (tnum_is_const(reg->var_off)) { + u64 uval = reg->var_off.value; + s64 sval = (s64)uval; + + if (reg->umin_value != uval || reg->umax_value != uval || + reg->smin_value != sval || reg->smax_value != sval) { + msg = "const tnum out of sync with range bounds"; + goto out; + } + } + + if (tnum_subreg_is_const(reg->var_off)) { + u32 uval32 = tnum_subreg(reg->var_off).value; + s32 sval32 = (s32)uval32; + + if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 || + reg->s32_min_value != sval32 || reg->s32_max_value != sval32) { + msg = "const subreg tnum out of sync with range bounds"; + goto out; + } + } + + return 0; +out: + verbose(env, "REG SANITY VIOLATION (%s): %s u64=[%#llx, %#llx] " + "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n", + ctx, msg, reg->umin_value, reg->umax_value, + reg->smin_value, reg->smax_value, + reg->u32_min_value, reg->u32_max_value, + reg->s32_min_value, reg->s32_max_value, + reg->var_off.value, reg->var_off.mask); + if (env->test_sanity_strict) + return -EFAULT; + __mark_reg_unbounded(reg); + return 0; +} + static bool __reg32_bound_s64(s32 a) { return a >= 0 && a <= S32_MAX; @@ -9982,14 +10032,15 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return 0; } -static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, - int func_id, - struct bpf_call_arg_meta *meta) +static int do_refine_retval_range(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, int ret_type, + int func_id, + struct bpf_call_arg_meta *meta) { struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; if (ret_type != RET_INTEGER) - return; + return 0; switch (func_id) { case BPF_FUNC_get_stack: @@ -10015,6 +10066,8 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, reg_bounds_sync(ret_reg); break; } + + return reg_bounds_sanity_check(env, ret_reg, "retval"); } static int @@ -10666,7 +10719,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].ref_obj_id = id; } - do_refine_retval_range(regs, fn->ret_type, func_id, &meta); + err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); + if (err) + return err; err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) @@ -14166,13 +14221,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* check dest operand */ err = check_reg_arg(env, insn->dst_reg, 
DST_OP_NO_MARK); + err = err ?: adjust_reg_min_max_vals(env, insn); if (err) return err; - - return adjust_reg_min_max_vals(env, insn); } - return 0; + return reg_bounds_sanity_check(env, ®s[insn->dst_reg], "alu"); } static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, @@ -14653,18 +14707,21 @@ again: * Technically we can do similar adjustments for pointers to the same object, * but we don't support that right now. */ -static void reg_set_min_max(struct bpf_reg_state *true_reg1, - struct bpf_reg_state *true_reg2, - struct bpf_reg_state *false_reg1, - struct bpf_reg_state *false_reg2, - u8 opcode, bool is_jmp32) +static int reg_set_min_max(struct bpf_verifier_env *env, + struct bpf_reg_state *true_reg1, + struct bpf_reg_state *true_reg2, + struct bpf_reg_state *false_reg1, + struct bpf_reg_state *false_reg2, + u8 opcode, bool is_jmp32) { + int err; + /* If either register is a pointer, we can't learn anything about its * variable offset from the compare (unless they were a pointer into * the same object, but we don't bother with that). */ if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) - return; + return 0; /* fallthrough (FALSE) branch */ regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32); @@ -14675,6 +14732,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg1, regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32); reg_bounds_sync(true_reg1); reg_bounds_sync(true_reg2); + + err = reg_bounds_sanity_check(env, true_reg1, "true_reg1"); + err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2"); + err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1"); + err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2"); + return err; } static void mark_ptr_or_null_reg(struct bpf_func_state *state, @@ -14968,15 +15031,20 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, other_branch_regs = other_branch->frame[other_branch->curframe]->regs; if (BPF_SRC(insn->code) == BPF_X) { - reg_set_min_max(&other_branch_regs[insn->dst_reg], - &other_branch_regs[insn->src_reg], - dst_reg, src_reg, opcode, is_jmp32); + err = reg_set_min_max(env, + &other_branch_regs[insn->dst_reg], + &other_branch_regs[insn->src_reg], + dst_reg, src_reg, opcode, is_jmp32); } else /* BPF_SRC(insn->code) == BPF_K */ { - reg_set_min_max(&other_branch_regs[insn->dst_reg], - src_reg /* fake one */, - dst_reg, src_reg /* same fake one */, - opcode, is_jmp32); + err = reg_set_min_max(env, + &other_branch_regs[insn->dst_reg], + src_reg /* fake one */, + dst_reg, src_reg /* same fake one */, + opcode, is_jmp32); } + if (err) + return err; + if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { @@ -17479,10 +17547,8 @@ static int do_check(struct bpf_verifier_env *env) insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, false, BPF_MODE(insn->code) == BPF_MEMSX); - if (err) - return err; - - err = save_aux_ptr_type(env, src_reg_type, true); + err = err ?: save_aux_ptr_type(env, src_reg_type, true); + err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], "ldx"); if (err) return err; } else if (class == BPF_STX) { @@ -20769,6 +20835,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; + env->test_sanity_strict = attr->prog_flags & BPF_F_TEST_SANITY_STRICT; env->explored_states = 
kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7cf8bcf9f6a2..8a5855fcee69 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1200,6 +1200,9 @@ enum bpf_perf_event_type { */ #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_SANITY_STRICT (1U << 7) + /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. */ -- cgit
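As a rough illustration of the invariants that reg_bounds_sanity_check() above enforces, here is a simplified user-space sketch (toy names, not the kernel's; the kernel additionally checks the u32/s32 subranges and the 32-bit subreg tnum):

#include <stdint.h>

/* Toy tnum: bits set in mask are unknown, the rest are fixed by value. */
struct toy_tnum { uint64_t value, mask; };

struct toy_reg {
	struct toy_tnum var_off;
	uint64_t umin, umax;
	int64_t smin, smax;
};

/* Returns 0 if the register state is self-consistent, -1 otherwise. */
static int toy_bounds_check(const struct toy_reg *reg)
{
	/* Invariant 1: every range must be well formed (min <= max). */
	if (reg->umin > reg->umax || reg->smin > reg->smax)
		return -1;
	/* Invariant 2: a fully-known tnum (a constant) must agree with
	 * the ranges: they all have to pin down that same single value.
	 */
	if (reg->var_off.mask == 0) {
		uint64_t uval = reg->var_off.value;
		int64_t sval = (int64_t)uval;

		if (reg->umin != uval || reg->umax != uval ||
		    reg->smin != sval || reg->smax != sval)
			return -1;
	}
	return 0;
}

Where this sketch returns -1, the kernel either resets the register to unknown (the default) or, with BPF_F_TEST_SANITY_STRICT set, fails verification with -EFAULT, as described in the commit message above.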
From 3cf98cf594ea923b8b1e0385b580d3d8aae68c06 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:06:01 -0800 Subject: bpf: remove redundant s{32,64} -> u{32,64} deduction logic Equivalent checks were recently added in more succinct and, arguably, safer form in: - f188765f23a5 ("bpf: derive smin32/smax32 from umin32/umax32 bounds"); - 2e74aef782d3 ("bpf: derive smin/smax from umin/max bounds"). The checks we are removing in this patch do something similar, detecting whether the entire u32/u64 range has the sign bit set or not set, but do it with two separate checks. Further, they forcefully overwrite either smin or smax (and the 32-bit equivalents) without applying the normal min/max intersection logic. It's not clear why that would be correct in all cases; it seems to work by accident. This logic is also "gated" by the previous signed -> unsigned derivation, which returns early. All this is quite confusing and seems error-prone, while we already have at least equivalent checks happening earlier. So remove this duplicate and error-prone logic to simplify things a bit. Acked-by: Shung-Hsi Yu Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231112010609.848406-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 36 ------------------------------------ 1 file changed, 36 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e7edacf86e0f..53a9e3e79ab4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2411,24 +2411,6 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) min_t(u32, reg->s32_max_value, reg->u32_max_value); return; } - /* Learn sign from unsigned bounds. Signed bounds cross the sign - * boundary, so we must be careful. - */ - if ((s32)reg->u32_max_value >= 0) { - /* Positive. We can't learn anything from the smin, but smax - * is positive, hence safe. - */ - reg->s32_min_value = reg->u32_min_value; - reg->s32_max_value = reg->u32_max_value = - min_t(u32, reg->s32_max_value, reg->u32_max_value); - } else if ((s32)reg->u32_min_value < 0) { - /* Negative. We can't learn anything from the smax, but smin - * is negative, hence safe. - */ - reg->s32_min_value = reg->u32_min_value = - max_t(u32, reg->s32_min_value, reg->u32_min_value); - reg->s32_max_value = reg->u32_max_value; - } } static void __reg64_deduce_bounds(struct bpf_reg_state *reg) @@ -2516,24 +2498,6 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) reg->umax_value); return; } - /* Learn sign from unsigned bounds. Signed bounds cross the sign - * boundary, so we must be careful. - */ - if ((s64)reg->umax_value >= 0) { - /* Positive. We can't learn anything from the smin, but smax - * is positive, hence safe. - */ - reg->smin_value = reg->umin_value; - reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value, - reg->umax_value); - } else if ((s64)reg->umin_value < 0) { - /* Negative. We can't learn anything from the smax, but smin - * is negative, hence safe. - */ - reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value, - reg->umin_value); - reg->smax_value = reg->umax_value; - } } -- cgit
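A worked example of the earlier, succinct derivation that makes the removed branches redundant (assumed numbers, not from the patch): take a register with umin_value = 1 and umax_value = 0x7fffffffffffffff. The whole unsigned range then also forms a valid signed range, so plain min/max intersection is enough; no forceful overwrite of smin/smax is needed:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t umin = 1, umax = 0x7fffffffffffffffULL;
	int64_t smin = INT64_MIN, smax = INT64_MAX;

	/* If the unsigned range does not cross the signed boundary, i.e.
	 * it is a valid signed range too, intersect instead of overwriting.
	 */
	if ((int64_t)umin <= (int64_t)umax) {
		if ((int64_t)umin > smin)
			smin = (int64_t)umin;
		if ((int64_t)umax < smax)
			smax = (int64_t)umax;
	}
	assert(smin == 1 && smax == INT64_MAX);
	return 0;
}

Conversely, for a range like [0, U64_MAX] that does cross the signed boundary ((s64)umax is -1), nothing can be learned, and the guard above correctly does nothing, whereas the removed code relied on two separate sign checks plus forceful assignment to reach the same outcome.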
From cf5fe3c71c5a34ac0108afc550407c672d0a032d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:06:02 -0800 Subject: bpf: make __reg{32,64}_deduce_bounds logic more robust This change doesn't seem to have any effect on selftests and production BPF object files, but we preemptively try to make the logic more robust. First, the "learn sign from signed bounds" comment is misleading, as we are learning not just the sign, but also values. Second, we simplify the check for determining whether the entire range is positive or negative, similarly to other checks added earlier, using an appropriate u32/u64 cast and a single comparison. As explained in the comments in __reg64_deduce_bounds(), the checks are equivalent. Last but not least, the smin/smax and s32_min/s32_max reassignment based on the min/max of both umin/umax and smin/smax (and their 32-bit equivalents) is hard to explain and justify. We are updating unsigned bounds from signed bounds; why would we update signed bounds at the same time? This might be correct, but it's far from obvious why, and neither the code nor the comments try to justify it. Given we've added a separate deduction of signed bounds from unsigned bounds earlier, this seems at least redundant, if not just wrong. In short, we remove the doubtful pieces and streamline the rest to follow the logic and approach of the rest of the reg_bounds_sync() checks. Acked-by: Shung-Hsi Yu Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231112010609.848406-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53a9e3e79ab4..59505881e7a7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2399,17 +2399,13 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value); reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value); } - /* Learn sign from signed bounds. - * If we cannot cross the sign boundary, then signed and unsigned bounds + /* If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. */ - if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) { - reg->s32_min_value = reg->u32_min_value = - max_t(u32, reg->s32_min_value, reg->u32_min_value); - reg->s32_max_value = reg->u32_max_value = - min_t(u32, reg->s32_max_value, reg->u32_max_value); - return; + if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { + reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value); + reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value); } } @@ -2486,17 +2482,13 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value); reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value); } - /* Learn sign from signed bounds. - * If we cannot cross the sign boundary, then signed and unsigned bounds + /* If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. */ - if (reg->smin_value >= 0 || reg->smax_value < 0) { - reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value, - reg->umin_value); - reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value, - reg->umax_value); - return; + if ((u64)reg->smin_value <= (u64)reg->smax_value) { + reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value); + reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value); } } -- cgit From ff8867af01daa7ea770bebf5f91199b7434b74e5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 09:14:04 -0800 Subject: bpf: rename BPF_F_TEST_SANITY_STRICT to BPF_F_TEST_REG_INVARIANTS Rename the verifier-internal flag BPF_F_TEST_SANITY_STRICT to the more neutral BPF_F_TEST_REG_INVARIANTS. This is a follow-up to [0]. A few selftests and veristat need to be adjusted in the same patch as well.
[0] https://patchwork.kernel.org/project/netdevbpf/patch/20231112010609.848406-5-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231117171404.225508-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- include/uapi/linux/bpf.h | 2 +- kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 6 +++--- tools/include/uapi/linux/bpf.h | 2 +- tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c | 2 +- tools/testing/selftests/bpf/prog_tests/reg_bounds.c | 2 +- tools/testing/selftests/bpf/progs/verifier_bounds.c | 4 ++-- tools/testing/selftests/bpf/test_loader.c | 6 +++--- tools/testing/selftests/bpf/test_sock_addr.c | 3 +-- tools/testing/selftests/bpf/test_verifier.c | 2 +- tools/testing/selftests/bpf/testing_helpers.c | 4 ++-- tools/testing/selftests/bpf/veristat.c | 12 ++++++------ 13 files changed, 24 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 402b6bc44a1b..52a4012b8255 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -602,7 +602,7 @@ struct bpf_verifier_env { int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ bool test_state_freq; /* test verifier with different pruning frequency */ - bool test_sanity_strict; /* fail verification on sanity violations */ + bool test_reg_invariants; /* fail verification on register invariants violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8a5855fcee69..7a5498242eaa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1201,7 +1201,7 @@ enum bpf_perf_event_type { #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) /* The verifier internal test flag. Behavior is undefined */ -#define BPF_F_TEST_SANITY_STRICT (1U << 7) +#define BPF_F_TEST_REG_INVARIANTS (1U << 7) /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. 
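Note that the flag keeps its value (1U << 7) in the hunk above, so the rename is binary-compatible; only the symbolic name changes. A sketch of a compatibility shim a loader built against pre-rename UAPI headers might use (hypothetical, not part of the patch):

#include <linux/bpf.h>

#ifndef BPF_F_TEST_REG_INVARIANTS
/* Older headers only know BPF_F_TEST_SANITY_STRICT; it is the same bit. */
#define BPF_F_TEST_REG_INVARIANTS (1U << 7)
#endif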
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f266e03ba342..5e43ddd1b83f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2574,7 +2574,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) BPF_F_TEST_RND_HI32 | BPF_F_XDP_HAS_FRAGS | BPF_F_XDP_DEV_BOUND_ONLY | - BPF_F_TEST_SANITY_STRICT)) + BPF_F_TEST_REG_INVARIANTS)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 59505881e7a7..7c3461b89513 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2608,14 +2608,14 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env, return 0; out: - verbose(env, "REG SANITY VIOLATION (%s): %s u64=[%#llx, %#llx] " + verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n", ctx, msg, reg->umin_value, reg->umax_value, reg->smin_value, reg->smax_value, reg->u32_min_value, reg->u32_max_value, reg->s32_min_value, reg->s32_max_value, reg->var_off.value, reg->var_off.mask); - if (env->test_sanity_strict) + if (env->test_reg_invariants) return -EFAULT; __mark_reg_unbounded(reg); return 0; @@ -20791,7 +20791,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; - env->test_sanity_strict = attr->prog_flags & BPF_F_TEST_SANITY_STRICT; + env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8a5855fcee69..7a5498242eaa 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1201,7 +1201,7 @@ enum bpf_perf_event_type { #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) /* The verifier internal test flag. Behavior is undefined */ -#define BPF_F_TEST_SANITY_STRICT (1U << 7) +#define BPF_F_TEST_REG_INVARIANTS (1U << 7) /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. 
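The selftest and veristat updates below all follow the same pattern when opting in; roughly, with libbpf (error handling omitted; bpf_program__flags() and bpf_program__set_flags() are the same libbpf APIs used in the hunks below):

#include <bpf/libbpf.h>

/* Ask the verifier to fail hard (-EFAULT) on any register invariants
 * violation instead of papering over it by marking the register unknown.
 */
static void enable_reg_invariants(struct bpf_program *prog)
{
	__u32 flags = bpf_program__flags(prog);

	bpf_program__set_flags(prog, flags | BPF_F_TEST_REG_INVARIANTS);
}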
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c index 3f2d70831873..e770912fc1d2 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c @@ -35,7 +35,7 @@ static int check_load(const char *file, enum bpf_prog_type type) } bpf_program__set_type(prog, type); - bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32 | BPF_F_TEST_SANITY_STRICT); + bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS); bpf_program__set_log_level(prog, 4 | extra_prog_load_log_flags); err = bpf_object__load(obj); diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index fe0cb906644b..7a8b0bf0a7f8 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -838,7 +838,7 @@ static int load_range_cmp_prog(struct range x, struct range y, enum op op, .log_level = 2, .log_buf = log_buf, .log_size = log_sz, - .prog_flags = BPF_F_TEST_SANITY_STRICT, + .prog_flags = BPF_F_TEST_REG_INVARIANTS, ); /* ; skip exit block below diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 0c1460936373..ec430b71730b 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -965,7 +965,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("bound check with JMP_JSLT for crossing 64-bit signed boundary") __success __retval(0) -__flag(!BPF_F_TEST_SANITY_STRICT) /* known sanity violation */ +__flag(!BPF_F_TEST_REG_INVARIANTS) /* known invariants violation */ __naked void crossing_64_bit_signed_boundary_2(void) { asm volatile (" \ @@ -1047,7 +1047,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("bound check with JMP32_JSLT for crossing 32-bit signed boundary") __success __retval(0) -__flag(!BPF_F_TEST_SANITY_STRICT) /* known sanity violation */ +__flag(!BPF_F_TEST_REG_INVARIANTS) /* known invariants violation */ __naked void crossing_32_bit_signed_boundary_2(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 57e27b1a73a6..a350ecdfba4a 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -179,7 +179,7 @@ static int parse_test_spec(struct test_loader *tester, memset(spec, 0, sizeof(*spec)); spec->prog_name = bpf_program__name(prog); - spec->prog_flags = BPF_F_TEST_SANITY_STRICT; /* by default be strict */ + spec->prog_flags = BPF_F_TEST_REG_INVARIANTS; /* by default be strict */ btf = bpf_object__btf(obj); if (!btf) { @@ -280,8 +280,8 @@ static int parse_test_spec(struct test_loader *tester, update_flags(&spec->prog_flags, BPF_F_SLEEPABLE, clear); } else if (strcmp(val, "BPF_F_XDP_HAS_FRAGS") == 0) { update_flags(&spec->prog_flags, BPF_F_XDP_HAS_FRAGS, clear); - } else if (strcmp(val, "BPF_F_TEST_SANITY_STRICT") == 0) { - update_flags(&spec->prog_flags, BPF_F_TEST_SANITY_STRICT, clear); + } else if (strcmp(val, "BPF_F_TEST_REG_INVARIANTS") == 0) { + update_flags(&spec->prog_flags, BPF_F_TEST_REG_INVARIANTS, clear); } else /* assume numeric value */ { err = parse_int(val, &flags, "test prog flags"); if (err) diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 878c077e0fa7..b0068a9d2cfe 100644 --- 
a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -679,8 +679,7 @@ static int load_path(const struct sock_addr_test *test, const char *path) bpf_program__set_type(prog, BPF_PROG_TYPE_CGROUP_SOCK_ADDR); bpf_program__set_expected_attach_type(prog, test->expected_attach_type); - bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32); - bpf_program__set_flags(prog, BPF_F_TEST_SANITY_STRICT); + bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS); err = bpf_object__load(obj); if (err) { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 4992022f3137..f36e41435be7 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1588,7 +1588,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, if (fixup_skips != skips) return; - pflags = BPF_F_TEST_RND_HI32 | BPF_F_TEST_SANITY_STRICT; + pflags = BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS; if (test->flags & F_LOAD_WITH_STRICT_ALIGNMENT) pflags |= BPF_F_STRICT_ALIGNMENT; if (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS) diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 9786a94a666c..d2458c1b1671 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -276,7 +276,7 @@ int bpf_prog_test_load(const char *file, enum bpf_prog_type type, if (type != BPF_PROG_TYPE_UNSPEC && bpf_program__type(prog) != type) bpf_program__set_type(prog, type); - flags = bpf_program__flags(prog) | BPF_F_TEST_RND_HI32 | BPF_F_TEST_SANITY_STRICT; + flags = bpf_program__flags(prog) | BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS; bpf_program__set_flags(prog, flags); err = bpf_object__load(obj); @@ -299,7 +299,7 @@ int bpf_test_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, { LIBBPF_OPTS(bpf_prog_load_opts, opts, .kern_version = kern_version, - .prog_flags = BPF_F_TEST_RND_HI32 | BPF_F_TEST_SANITY_STRICT, + .prog_flags = BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS, .log_level = extra_prog_load_log_flags, .log_buf = log_buf, .log_size = log_buf_sz, diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 609fd9753af0..1d418d66e375 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -145,7 +145,7 @@ static struct env { bool debug; bool quiet; bool force_checkpoints; - bool strict_range_sanity; + bool force_reg_invariants; enum resfmt out_fmt; bool show_version; bool comparison_mode; @@ -225,8 +225,8 @@ static const struct argp_option opts[] = { { "filter", 'f', "FILTER", 0, "Filter expressions (or @filename for file with expressions)." 
}, { "test-states", 't', NULL, 0, "Force frequent BPF verifier state checkpointing (set BPF_F_TEST_STATE_FREQ program flag)" }, - { "test-sanity", 'r', NULL, 0, - "Force strict BPF verifier register sanity behavior (BPF_F_TEST_SANITY_STRICT program flag)" }, + { "test-reg-invariants", 'r', NULL, 0, + "Force BPF verifier failure on register invariant violation (BPF_F_TEST_REG_INVARIANTS program flag)" }, {}, }; @@ -299,7 +299,7 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) env.force_checkpoints = true; break; case 'r': - env.strict_range_sanity = true; + env.force_reg_invariants = true; break; case 'n': errno = 0; @@ -1028,8 +1028,8 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf if (env.force_checkpoints) bpf_program__set_flags(prog, bpf_program__flags(prog) | BPF_F_TEST_STATE_FREQ); - if (env.strict_range_sanity) - bpf_program__set_flags(prog, bpf_program__flags(prog) | BPF_F_TEST_SANITY_STRICT); + if (env.force_reg_invariants) + bpf_program__set_flags(prog, bpf_program__flags(prog) | BPF_F_TEST_REG_INVARIANTS); err = bpf_object__load(obj); env.progs_processed++; -- cgit From f73f6181eb057671e358ebac8ed7f0014f12efb8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 30 Aug 2023 09:32:15 -0700 Subject: userns: eliminate many kernel-doc warnings Drop the kernel-doc "/**" notation from 8 structs or functions to prevent 22 kernel-doc warnings (samples below). user_namespace.c:239: warning: Function parameter or member 'map_up' not described in 'idmap_key' user_namespace.c:246: warning: Function parameter or member 'k' not described in 'cmp_map_id' user_namespace.c:277: warning: Function parameter or member 'extents' not described in 'map_id_range_down_max' user_namespace.c:295: warning: Function parameter or member 'extents' not described in 'map_id_range_down_base' user_namespace.c:344: warning: Function parameter or member 'extents' not described in 'map_id_up_base' user_namespace.c:364: warning: Function parameter or member 'extents' not described in 'map_id_up_max' user_namespace.c:776: warning: Function parameter or member 'map' not described in 'insert_extent' user_namespace.c:844: warning: Function parameter or member 'map' not described in 'sort_idmaps' Fixes: 6397fac4915a ("userns: bump idmap limits to 340") Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20230830163215.13193-1-rdunlap@infradead.org Cc: Eric Biederman Cc: Christian Brauner Signed-off-by: Christian Brauner --- kernel/user_namespace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index eabe8bcc7042..625101249e4d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -231,7 +231,7 @@ void __put_user_ns(struct user_namespace *ns) } EXPORT_SYMBOL(__put_user_ns); -/** +/* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ @@ -241,7 +241,7 @@ struct idmap_key { u32 count; /* == 0 unless used with map_id_range_down() */ }; -/** +/* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ @@ -271,7 +271,7 @@ static int cmp_map_id(const void *k, const void *e) return 1; } -/** +/* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. 
*/ @@ -288,7 +288,7 @@ map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 cou sizeof(struct uid_gid_extent), cmp_map_id); } -/** +/* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. @@ -337,7 +337,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) return map_id_range_down(map, id, 1); } -/** +/* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. @@ -358,7 +358,7 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) return NULL; } -/** +/* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ @@ -770,7 +770,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } -/** +/* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. @@ -839,7 +839,7 @@ static int cmp_extents_reverse(const void *a, const void *b) return 0; } -/** +/* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ -- cgit
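For context, kernel-doc treats any comment opening with "/**" as structured documentation and warns when the described items don't match the actual parameters; a plain "/*" opener keeps the text as an ordinary comment that scripts/kernel-doc ignores, which is what the patch above switches to. Schematically (hypothetical function, not from the patch):

/**
 * scale_documented - multiply a value by a factor
 * @val: value to scale
 *
 * The 'factor' parameter is never described, so scripts/kernel-doc
 * warns: "Function parameter or member 'factor' not described".
 */
static int scale_documented(int val, int factor)
{
	return val * factor;
}

/*
 * scale_plain - multiply a value by a factor
 * Ordinary comment: kernel-doc skips it, so no warnings are emitted.
 */
static int scale_plain(int val, int factor)
{
	return val * factor;
}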
From db840d389bad60ce6f3aadc1079da13e7e993a16 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:16 -0800 Subject: bpf: move verbose_linfo() into kernel/bpf/log.c verifier.c is huge. Let's try to move out parts that are logging-related into log.c, as we previously did with bpf_log() and other related stuff. This patch moves the line info verbose output routines: they're pretty self-contained and isolated code, so there is no problem with this. Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 +++ kernel/bpf/log.c | 59 ++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 57 ------------------------------------------ 3 files changed, 63 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 52a4012b8255..d896f3db6a22 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -680,6 +680,10 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual); +__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, + u32 insn_off, + const char *prefix_fmt, ...); + static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 850494423530..f20e1449c882 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -10,6 +10,8 @@ #include #include +#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) + static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) { /* ubuf and len_total should both be specified (or not) together */ @@ -325,3 +327,60 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, va_end(args); } EXPORT_SYMBOL_GPL(bpf_log); + +static const struct bpf_line_info * +find_linfo(const struct bpf_verifier_env *env, u32 insn_off) +{ + const struct bpf_line_info *linfo; + const struct bpf_prog *prog; + u32 i, nr_linfo; + + prog = env->prog; + nr_linfo = prog->aux->nr_linfo; + + if (!nr_linfo || insn_off >= prog->len) + return NULL; + + linfo = prog->aux->linfo; + for (i = 1; i < nr_linfo; i++) + if (insn_off < linfo[i].insn_off) + break; + + return &linfo[i - 1]; +} + +static const char *ltrim(const char *s) +{ + while (isspace(*s)) + s++; + + return s; +} + +__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, + u32 insn_off, + const char *prefix_fmt, ...) +{ + const struct bpf_line_info *linfo; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + linfo = find_linfo(env, insn_off); + if (!linfo || linfo == env->prev_linfo) + return; + + if (prefix_fmt) { + va_list args; + + va_start(args, prefix_fmt); + bpf_verifier_vlog(&env->log, prefix_fmt, args); + va_end(args); + } + + verbose(env, "%s\n", + ltrim(btf_name_by_offset(env->prog->aux->btf, + linfo->line_off))); + + env->prev_linfo = linfo; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7c3461b89513..683fdda25c13 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -337,27 +337,6 @@ struct btf *btf_vmlinux; static DEFINE_MUTEX(bpf_verifier_lock); -static const struct bpf_line_info * -find_linfo(const struct bpf_verifier_env *env, u32 insn_off) -{ - const struct bpf_line_info *linfo; - const struct bpf_prog *prog; - u32 i, nr_linfo; - - prog = env->prog; - nr_linfo = prog->aux->nr_linfo; - - if (!nr_linfo || insn_off >= prog->len) - return NULL; - - linfo = prog->aux->linfo; - for (i = 1; i < nr_linfo; i++) - if (insn_off < linfo[i].insn_off) - break; - - return &linfo[i - 1]; -} - __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) { struct bpf_verifier_env *env = private_data; @@ -371,42 +350,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) va_end(args); } -static const char *ltrim(const char *s) -{ - while (isspace(*s)) - s++; - - return s; -} - -__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env, - u32 insn_off, - const char *prefix_fmt, ...) -{ - const struct bpf_line_info *linfo; - - if (!bpf_verifier_log_needed(&env->log)) - return; - - linfo = find_linfo(env, insn_off); - if (!linfo || linfo == env->prev_linfo) - return; - - if (prefix_fmt) { - va_list args; - - va_start(args, prefix_fmt); - bpf_verifier_vlog(&env->log, prefix_fmt, args); - va_end(args); - } - - verbose(env, "%s\n", - ltrim(btf_name_by_offset(env->prog->aux->btf, - linfo->line_off))); - - env->prev_linfo = linfo; -} - static void verbose_invalid_scalar(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct tnum *range, const char *ctx, -- cgit From 42feb6620accded89cad5f455665e21281813d79 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:17 -0800 Subject: bpf: move verifier state printing code to kernel/bpf/log.c Move a good chunk of code from verifier.c to log.c: the verifier state verbose printing logic. This is important and very much logging/debugging-oriented code.
It fits log.c's overall focus on verifier logging, and moving it allows us to keep growing it without unnecessarily adding to verifier.c, which otherwise contains the core verification logic. There are not many shared dependencies between this code and the rest of verifier.c, except a few single-line helpers for various register type checks and a few state "scratching" helpers. We move all such trivial helpers into include/linux/bpf_verifier.h as static inlines. No functional changes in this patch. Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 72 ++++++++ kernel/bpf/log.c | 342 ++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 403 ------------------------------------------- 3 files changed, 414 insertions(+), 403 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d896f3db6a22..39edc76f436e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -783,4 +783,76 @@ static inline bool bpf_type_has_unsafe_modifiers(u32 type) return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; } +static inline bool type_is_ptr_alloc_obj(u32 type) +{ + return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; +} + +static inline bool type_is_non_owning_ref(u32 type) +{ + return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; +} + +static inline bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + type = base_type(type); + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; +} + +static inline bool type_is_sk_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_XDP_SOCK; +} + +static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) +{ + env->scratched_regs |= 1U << regno; +} + +static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) +{ + env->scratched_stack_slots |= 1ULL << spi; +} + +static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) +{ + return (env->scratched_regs >> regno) & 1; +} + +static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) +{ + return (env->scratched_stack_slots >> regno) & 1; +} + +static inline bool verifier_state_scratched(const struct bpf_verifier_env *env) +{ + return env->scratched_regs || env->scratched_stack_slots; +} + +static inline void mark_verifier_state_clean(struct bpf_verifier_env *env) +{ + env->scratched_regs = 0U; + env->scratched_stack_slots = 0ULL; +} + +/* Used for printing the entire verifier state.
*/ +static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env) +{ + env->scratched_regs = ~0U; + env->scratched_stack_slots = ~0ULL; +} + +const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); +const char *dynptr_type_str(enum bpf_dynptr_type type); +const char *iter_type_str(const struct btf *btf, u32 btf_id); +const char *iter_state_str(enum bpf_iter_state state); + +void print_verifier_state(struct bpf_verifier_env *env, + const struct bpf_func_state *state, bool print_all); +void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index f20e1449c882..c1b257eac21b 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -384,3 +384,345 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, env->prev_linfo = linfo; } + +static const char *btf_type_name(const struct btf *btf, u32 id) +{ + return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); +} + +/* string representation of 'enum bpf_reg_type' + * + * Note that reg_type_str() can not appear more than once in a single verbose() + * statement. + */ +const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) +{ + char postfix[16] = {0}, prefix[64] = {0}; + static const char * const str[] = { + [NOT_INIT] = "?", + [SCALAR_VALUE] = "scalar", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_STACK] = "fp", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", + [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", + [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_MEM] = "mem", + [PTR_TO_BUF] = "buf", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", + [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", + }; + + if (type & PTR_MAYBE_NULL) { + if (base_type(type) == PTR_TO_BTF_ID) + strncpy(postfix, "or_null_", 16); + else + strncpy(postfix, "_or_null", 16); + } + + snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", + type & MEM_RDONLY ? "rdonly_" : "", + type & MEM_RINGBUF ? "ringbuf_" : "", + type & MEM_USER ? "user_" : "", + type & MEM_PERCPU ? "percpu_" : "", + type & MEM_RCU ? "rcu_" : "", + type & PTR_UNTRUSTED ? "untrusted_" : "", + type & PTR_TRUSTED ? 
"trusted_" : "" + ); + + snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s", + prefix, str[base_type(type)], postfix); + return env->tmp_str_buf; +} + +const char *dynptr_type_str(enum bpf_dynptr_type type) +{ + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + return "local"; + case BPF_DYNPTR_TYPE_RINGBUF: + return "ringbuf"; + case BPF_DYNPTR_TYPE_SKB: + return "skb"; + case BPF_DYNPTR_TYPE_XDP: + return "xdp"; + case BPF_DYNPTR_TYPE_INVALID: + return ""; + default: + WARN_ONCE(1, "unknown dynptr type %d\n", type); + return ""; + } +} + +const char *iter_type_str(const struct btf *btf, u32 btf_id) +{ + if (!btf || btf_id == 0) + return ""; + + /* we already validated that type is valid and has conforming name */ + return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; +} + +const char *iter_state_str(enum bpf_iter_state state) +{ + switch (state) { + case BPF_ITER_STATE_ACTIVE: + return "active"; + case BPF_ITER_STATE_DRAINED: + return "drained"; + case BPF_ITER_STATE_INVALID: + return ""; + default: + WARN_ONCE(1, "unknown iter state %d\n", state); + return ""; + } +} + +static char slot_type_char[] = { + [STACK_INVALID] = '?', + [STACK_SPILL] = 'r', + [STACK_MISC] = 'm', + [STACK_ZERO] = '0', + [STACK_DYNPTR] = 'd', + [STACK_ITER] = 'i', +}; + +static void print_liveness(struct bpf_verifier_env *env, + enum bpf_reg_liveness live) +{ + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) + verbose(env, "_"); + if (live & REG_LIVE_READ) + verbose(env, "r"); + if (live & REG_LIVE_WRITTEN) + verbose(env, "w"); + if (live & REG_LIVE_DONE) + verbose(env, "D"); +} + +static void print_scalar_ranges(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + const char **sep) +{ + struct { + const char *name; + u64 val; + bool omit; + } minmaxs[] = { + {"smin", reg->smin_value, reg->smin_value == S64_MIN}, + {"smax", reg->smax_value, reg->smax_value == S64_MAX}, + {"umin", reg->umin_value, reg->umin_value == 0}, + {"umax", reg->umax_value, reg->umax_value == U64_MAX}, + {"smin32", (s64)reg->s32_min_value, reg->s32_min_value == S32_MIN}, + {"smax32", (s64)reg->s32_max_value, reg->s32_max_value == S32_MAX}, + {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, + {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, + }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; + bool neg1, neg2; + + for (m1 = &minmaxs[0]; m1 < mend; m1++) { + if (m1->omit) + continue; + + neg1 = m1->name[0] == 's' && (s64)m1->val < 0; + + verbose(env, "%s%s=", *sep, m1->name); + *sep = ","; + + for (m2 = m1 + 2; m2 < mend; m2 += 2) { + if (m2->omit || m2->val != m1->val) + continue; + /* don't mix negatives with positives */ + neg2 = m2->name[0] == 's' && (s64)m2->val < 0; + if (neg2 != neg1) + continue; + m2->omit = true; + verbose(env, "%s=", m2->name); + } + + verbose(env, m1->name[0] == 's' ? 
"%lld" : "%llu", m1->val); + } +} + +void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, + bool print_all) +{ + const struct bpf_reg_state *reg; + enum bpf_reg_type t; + int i; + + if (state->frameno) + verbose(env, " frame%d:", state->frameno); + for (i = 0; i < MAX_BPF_REG; i++) { + reg = &state->regs[i]; + t = reg->type; + if (t == NOT_INIT) + continue; + if (!print_all && !reg_scratched(env, i)) + continue; + verbose(env, " R%d", i); + print_liveness(env, reg->live); + verbose(env, "="); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); + if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && + tnum_is_const(reg->var_off)) { + /* reg->off should be 0 for SCALAR_VALUE */ + verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); + verbose(env, "%lld", reg->var_off.value + reg->off); + } else { + const char *sep = ""; + + verbose(env, "%s", reg_type_str(env, t)); + if (base_type(t) == PTR_TO_BTF_ID) + verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); + verbose(env, "("); +/* + * _a stands for append, was shortened to avoid multiline statements below. + * This macro is used to output a comma separated list of attributes. + */ +#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; }) + + if (reg->id) + verbose_a("id=%d", reg->id); + if (reg->ref_obj_id) + verbose_a("ref_obj_id=%d", reg->ref_obj_id); + if (type_is_non_owning_ref(reg->type)) + verbose_a("%s", "non_own_ref"); + if (t != SCALAR_VALUE) + verbose_a("off=%d", reg->off); + if (type_is_pkt_pointer(t)) + verbose_a("r=%d", reg->range); + else if (base_type(t) == CONST_PTR_TO_MAP || + base_type(t) == PTR_TO_MAP_KEY || + base_type(t) == PTR_TO_MAP_VALUE) + verbose_a("ks=%d,vs=%d", + reg->map_ptr->key_size, + reg->map_ptr->value_size); + if (tnum_is_const(reg->var_off)) { + /* Typically an immediate SCALAR_VALUE, but + * could be a pointer whose offset is too big + * for reg->off + */ + verbose_a("imm=%llx", reg->var_off.value); + } else { + print_scalar_ranges(env, reg, &sep); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose_a("var_off=%s", tn_buf); + } + } +#undef verbose_a + + verbose(env, ")"); + } + } + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + char types_buf[BPF_REG_SIZE + 1]; + bool valid = false; + int j; + + for (j = 0; j < BPF_REG_SIZE; j++) { + if (state->stack[i].slot_type[j] != STACK_INVALID) + valid = true; + types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; + } + types_buf[BPF_REG_SIZE] = 0; + if (!valid) + continue; + if (!print_all && !stack_slot_scratched(env, i)) + continue; + switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) { + case STACK_SPILL: + reg = &state->stack[i].spilled_ptr; + t = reg->type; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=%s", t == SCALAR_VALUE ? 
"" : reg_type_str(env, t)); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) + verbose(env, "%lld", reg->var_off.value + reg->off); + break; + case STACK_DYNPTR: + i += BPF_DYNPTR_NR_SLOTS - 1; + reg = &state->stack[i].spilled_ptr; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type)); + if (reg->ref_obj_id) + verbose(env, "(ref_id=%d)", reg->ref_obj_id); + break; + case STACK_ITER: + /* only main slot has ref_obj_id set; skip others */ + reg = &state->stack[i].spilled_ptr; + if (!reg->ref_obj_id) + continue; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", + iter_type_str(reg->iter.btf, reg->iter.btf_id), + reg->ref_obj_id, iter_state_str(reg->iter.state), + reg->iter.depth); + break; + case STACK_MISC: + case STACK_ZERO: + default: + reg = &state->stack[i].spilled_ptr; + + for (j = 0; j < BPF_REG_SIZE; j++) + types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; + types_buf[BPF_REG_SIZE] = 0; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=%s", types_buf); + break; + } + } + if (state->acquired_refs && state->refs[0].id) { + verbose(env, " refs=%d", state->refs[0].id); + for (i = 1; i < state->acquired_refs; i++) + if (state->refs[i].id) + verbose(env, ",%d", state->refs[i].id); + } + if (state->in_callback_fn) + verbose(env, " cb"); + if (state->in_async_callback_fn) + verbose(env, " async_cb"); + verbose(env, "\n"); + if (!print_all) + mark_verifier_state_clean(env); +} + +static inline u32 vlog_alignment(u32 pos) +{ + return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT), + BPF_LOG_MIN_ALIGNMENT) - pos - 1; +} + +void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state) +{ + if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { + /* remove new line character */ + bpf_vlog_reset(&env->log, env->prev_log_pos - 1); + verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' '); + } else { + verbose(env, "%d:", env->insn_idx); + } + print_verifier_state(env, state, false); +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 683fdda25c13..8c2d31aa3d31 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -368,21 +368,6 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, verbose(env, " should have been in %s\n", tn_buf); } -static bool type_is_pkt_pointer(enum bpf_reg_type type) -{ - type = base_type(type); - return type == PTR_TO_PACKET || - type == PTR_TO_PACKET_META; -} - -static bool type_is_sk_pointer(enum bpf_reg_type type) -{ - return type == PTR_TO_SOCKET || - type == PTR_TO_SOCK_COMMON || - type == PTR_TO_TCP_SOCK || - type == PTR_TO_XDP_SOCK; -} - static bool type_may_be_null(u32 type) { return type & PTR_MAYBE_NULL; @@ -406,16 +391,6 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MEM; } -static bool type_is_ptr_alloc_obj(u32 type) -{ - return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; -} - -static bool type_is_non_owning_ref(u32 type) -{ - return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; -} - static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) { struct btf_record *rec = NULL; @@ -532,83 +507,6 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) insn->imm == 
BPF_CMPXCHG; } -/* string representation of 'enum bpf_reg_type' - * - * Note that reg_type_str() can not appear more than once in a single verbose() - * statement. - */ -static const char *reg_type_str(struct bpf_verifier_env *env, - enum bpf_reg_type type) -{ - char postfix[16] = {0}, prefix[64] = {0}; - static const char * const str[] = { - [NOT_INIT] = "?", - [SCALAR_VALUE] = "scalar", - [PTR_TO_CTX] = "ctx", - [CONST_PTR_TO_MAP] = "map_ptr", - [PTR_TO_MAP_VALUE] = "map_value", - [PTR_TO_STACK] = "fp", - [PTR_TO_PACKET] = "pkt", - [PTR_TO_PACKET_META] = "pkt_meta", - [PTR_TO_PACKET_END] = "pkt_end", - [PTR_TO_FLOW_KEYS] = "flow_keys", - [PTR_TO_SOCKET] = "sock", - [PTR_TO_SOCK_COMMON] = "sock_common", - [PTR_TO_TCP_SOCK] = "tcp_sock", - [PTR_TO_TP_BUFFER] = "tp_buffer", - [PTR_TO_XDP_SOCK] = "xdp_sock", - [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_MEM] = "mem", - [PTR_TO_BUF] = "buf", - [PTR_TO_FUNC] = "func", - [PTR_TO_MAP_KEY] = "map_key", - [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", - }; - - if (type & PTR_MAYBE_NULL) { - if (base_type(type) == PTR_TO_BTF_ID) - strncpy(postfix, "or_null_", 16); - else - strncpy(postfix, "_or_null", 16); - } - - snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", - type & MEM_RDONLY ? "rdonly_" : "", - type & MEM_RINGBUF ? "ringbuf_" : "", - type & MEM_USER ? "user_" : "", - type & MEM_PERCPU ? "percpu_" : "", - type & MEM_RCU ? "rcu_" : "", - type & PTR_UNTRUSTED ? "untrusted_" : "", - type & PTR_TRUSTED ? "trusted_" : "" - ); - - snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s", - prefix, str[base_type(type)], postfix); - return env->tmp_str_buf; -} - -static char slot_type_char[] = { - [STACK_INVALID] = '?', - [STACK_SPILL] = 'r', - [STACK_MISC] = 'm', - [STACK_ZERO] = '0', - [STACK_DYNPTR] = 'd', - [STACK_ITER] = 'i', -}; - -static void print_liveness(struct bpf_verifier_env *env, - enum bpf_reg_liveness live) -{ - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) - verbose(env, "_"); - if (live & REG_LIVE_READ) - verbose(env, "r"); - if (live & REG_LIVE_WRITTEN) - verbose(env, "w"); - if (live & REG_LIVE_DONE) - verbose(env, "D"); -} - static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; @@ -678,87 +576,6 @@ static const char *btf_type_name(const struct btf *btf, u32 id) return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } -static const char *dynptr_type_str(enum bpf_dynptr_type type) -{ - switch (type) { - case BPF_DYNPTR_TYPE_LOCAL: - return "local"; - case BPF_DYNPTR_TYPE_RINGBUF: - return "ringbuf"; - case BPF_DYNPTR_TYPE_SKB: - return "skb"; - case BPF_DYNPTR_TYPE_XDP: - return "xdp"; - case BPF_DYNPTR_TYPE_INVALID: - return ""; - default: - WARN_ONCE(1, "unknown dynptr type %d\n", type); - return ""; - } -} - -static const char *iter_type_str(const struct btf *btf, u32 btf_id) -{ - if (!btf || btf_id == 0) - return ""; - - /* we already validated that type is valid and has conforming name */ - return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; -} - -static const char *iter_state_str(enum bpf_iter_state state) -{ - switch (state) { - case BPF_ITER_STATE_ACTIVE: - return "active"; - case BPF_ITER_STATE_DRAINED: - return "drained"; - case BPF_ITER_STATE_INVALID: - return ""; - default: - WARN_ONCE(1, "unknown iter state %d\n", state); - return ""; - } -} - -static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) -{ - env->scratched_regs |= 1U << regno; -} - -static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) -{ - env->scratched_stack_slots |= 1ULL << spi; -} - 
-static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) -{ - return (env->scratched_regs >> regno) & 1; -} - -static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) -{ - return (env->scratched_stack_slots >> regno) & 1; -} - -static bool verifier_state_scratched(const struct bpf_verifier_env *env) -{ - return env->scratched_regs || env->scratched_stack_slots; -} - -static void mark_verifier_state_clean(struct bpf_verifier_env *env) -{ - env->scratched_regs = 0U; - env->scratched_stack_slots = 0ULL; -} - -/* Used for printing the entire verifier state. */ -static void mark_verifier_state_scratched(struct bpf_verifier_env *env) -{ - env->scratched_regs = ~0U; - env->scratched_stack_slots = ~0ULL; -} - static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) { switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { @@ -1298,226 +1115,6 @@ static void scrub_spilled_slot(u8 *stype) *stype = STACK_MISC; } -static void print_scalar_ranges(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - const char **sep) -{ - struct { - const char *name; - u64 val; - bool omit; - } minmaxs[] = { - {"smin", reg->smin_value, reg->smin_value == S64_MIN}, - {"smax", reg->smax_value, reg->smax_value == S64_MAX}, - {"umin", reg->umin_value, reg->umin_value == 0}, - {"umax", reg->umax_value, reg->umax_value == U64_MAX}, - {"smin32", (s64)reg->s32_min_value, reg->s32_min_value == S32_MIN}, - {"smax32", (s64)reg->s32_max_value, reg->s32_max_value == S32_MAX}, - {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, - {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, - }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; - bool neg1, neg2; - - for (m1 = &minmaxs[0]; m1 < mend; m1++) { - if (m1->omit) - continue; - - neg1 = m1->name[0] == 's' && (s64)m1->val < 0; - - verbose(env, "%s%s=", *sep, m1->name); - *sep = ","; - - for (m2 = m1 + 2; m2 < mend; m2 += 2) { - if (m2->omit || m2->val != m1->val) - continue; - /* don't mix negatives with positives */ - neg2 = m2->name[0] == 's' && (s64)m2->val < 0; - if (neg2 != neg1) - continue; - m2->omit = true; - verbose(env, "%s=", m2->name); - } - - verbose(env, m1->name[0] == 's' ? "%lld" : "%llu", m1->val); - } -} - -static void print_verifier_state(struct bpf_verifier_env *env, - const struct bpf_func_state *state, - bool print_all) -{ - const struct bpf_reg_state *reg; - enum bpf_reg_type t; - int i; - - if (state->frameno) - verbose(env, " frame%d:", state->frameno); - for (i = 0; i < MAX_BPF_REG; i++) { - reg = &state->regs[i]; - t = reg->type; - if (t == NOT_INIT) - continue; - if (!print_all && !reg_scratched(env, i)) - continue; - verbose(env, " R%d", i); - print_liveness(env, reg->live); - verbose(env, "="); - if (t == SCALAR_VALUE && reg->precise) - verbose(env, "P"); - if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && - tnum_is_const(reg->var_off)) { - /* reg->off should be 0 for SCALAR_VALUE */ - verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); - verbose(env, "%lld", reg->var_off.value + reg->off); - } else { - const char *sep = ""; - - verbose(env, "%s", reg_type_str(env, t)); - if (base_type(t) == PTR_TO_BTF_ID) - verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); - verbose(env, "("); -/* - * _a stands for append, was shortened to avoid multiline statements below. - * This macro is used to output a comma separated list of attributes. - */ -#define verbose_a(fmt, ...) 
({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; }) - - if (reg->id) - verbose_a("id=%d", reg->id); - if (reg->ref_obj_id) - verbose_a("ref_obj_id=%d", reg->ref_obj_id); - if (type_is_non_owning_ref(reg->type)) - verbose_a("%s", "non_own_ref"); - if (t != SCALAR_VALUE) - verbose_a("off=%d", reg->off); - if (type_is_pkt_pointer(t)) - verbose_a("r=%d", reg->range); - else if (base_type(t) == CONST_PTR_TO_MAP || - base_type(t) == PTR_TO_MAP_KEY || - base_type(t) == PTR_TO_MAP_VALUE) - verbose_a("ks=%d,vs=%d", - reg->map_ptr->key_size, - reg->map_ptr->value_size); - if (tnum_is_const(reg->var_off)) { - /* Typically an immediate SCALAR_VALUE, but - * could be a pointer whose offset is too big - * for reg->off - */ - verbose_a("imm=%llx", reg->var_off.value); - } else { - print_scalar_ranges(env, reg, &sep); - if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose_a("var_off=%s", tn_buf); - } - } -#undef verbose_a - - verbose(env, ")"); - } - } - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - char types_buf[BPF_REG_SIZE + 1]; - bool valid = false; - int j; - - for (j = 0; j < BPF_REG_SIZE; j++) { - if (state->stack[i].slot_type[j] != STACK_INVALID) - valid = true; - types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; - } - types_buf[BPF_REG_SIZE] = 0; - if (!valid) - continue; - if (!print_all && !stack_slot_scratched(env, i)) - continue; - switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) { - case STACK_SPILL: - reg = &state->stack[i].spilled_ptr; - t = reg->type; - - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); - if (t == SCALAR_VALUE && reg->precise) - verbose(env, "P"); - if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) - verbose(env, "%lld", reg->var_off.value + reg->off); - break; - case STACK_DYNPTR: - i += BPF_DYNPTR_NR_SLOTS - 1; - reg = &state->stack[i].spilled_ptr; - - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type)); - if (reg->ref_obj_id) - verbose(env, "(ref_id=%d)", reg->ref_obj_id); - break; - case STACK_ITER: - /* only main slot has ref_obj_id set; skip others */ - reg = &state->stack[i].spilled_ptr; - if (!reg->ref_obj_id) - continue; - - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", - iter_type_str(reg->iter.btf, reg->iter.btf_id), - reg->ref_obj_id, iter_state_str(reg->iter.state), - reg->iter.depth); - break; - case STACK_MISC: - case STACK_ZERO: - default: - reg = &state->stack[i].spilled_ptr; - - for (j = 0; j < BPF_REG_SIZE; j++) - types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; - types_buf[BPF_REG_SIZE] = 0; - - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); - break; - } - } - if (state->acquired_refs && state->refs[0].id) { - verbose(env, " refs=%d", state->refs[0].id); - for (i = 1; i < state->acquired_refs; i++) - if (state->refs[i].id) - verbose(env, ",%d", state->refs[i].id); - } - if (state->in_callback_fn) - verbose(env, " cb"); - if (state->in_async_callback_fn) - verbose(env, " async_cb"); - verbose(env, "\n"); - if (!print_all) - mark_verifier_state_clean(env); -} - -static inline u32 vlog_alignment(u32 pos) -{ - return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT), - 
BPF_LOG_MIN_ALIGNMENT) - pos - 1; -} - -static void print_insn_state(struct bpf_verifier_env *env, - const struct bpf_func_state *state) -{ - if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { - /* remove new line character */ - bpf_vlog_reset(&env->log, env->prev_log_pos - 1); - verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' '); - } else { - verbose(env, "%d:", env->insn_idx); - } - print_verifier_state(env, state, false); -} - /* copy array src of length n * size bytes to dst. dst is reallocated if it's too * small to hold src. This is different from krealloc since we don't want to preserve * the contents of dst. -- cgit From 009f5465be3636e9ce795cfbd5d3109d8978774d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:18 -0800 Subject: bpf: extract register state printing Extract printing register state representation logic into a separate helper, as we are going to reuse it for spilled register state printing in the next patch. This also nicely reduces code nestedness. No functional changes. Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 120 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index c1b257eac21b..05d737e2fab3 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -553,6 +553,67 @@ static void print_scalar_ranges(struct bpf_verifier_env *env, } } +static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) +{ + enum bpf_reg_type t; + const char *sep = ""; + + t = reg->type; + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); + if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && + tnum_is_const(reg->var_off)) { + /* reg->off should be 0 for SCALAR_VALUE */ + verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); + verbose(env, "%lld", reg->var_off.value + reg->off); + return; + } +/* + * _a stands for append, was shortened to avoid multiline statements below. + * This macro is used to output a comma separated list of attributes. + */ +#define verbose_a(fmt, ...) 
({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; }) + + verbose(env, "%s", reg_type_str(env, t)); + if (base_type(t) == PTR_TO_BTF_ID) + verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); + verbose(env, "("); + if (reg->id) + verbose_a("id=%d", reg->id); + if (reg->ref_obj_id) + verbose_a("ref_obj_id=%d", reg->ref_obj_id); + if (type_is_non_owning_ref(reg->type)) + verbose_a("%s", "non_own_ref"); + if (t != SCALAR_VALUE) + verbose_a("off=%d", reg->off); + if (type_is_pkt_pointer(t)) + verbose_a("r=%d", reg->range); + else if (base_type(t) == CONST_PTR_TO_MAP || + base_type(t) == PTR_TO_MAP_KEY || + base_type(t) == PTR_TO_MAP_VALUE) + verbose_a("ks=%d,vs=%d", + reg->map_ptr->key_size, + reg->map_ptr->value_size); + if (tnum_is_const(reg->var_off)) { + /* Typically an immediate SCALAR_VALUE, but + * could be a pointer whose offset is too big + * for reg->off + */ + verbose_a("imm=%llx", reg->var_off.value); + } else { + print_scalar_ranges(env, reg, &sep); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose_a("var_off=%s", tn_buf); + } + } + verbose(env, ")"); + +#undef verbose_a +} + void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, bool print_all) { @@ -564,69 +625,14 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st verbose(env, " frame%d:", state->frameno); for (i = 0; i < MAX_BPF_REG; i++) { reg = &state->regs[i]; - t = reg->type; - if (t == NOT_INIT) + if (reg->type == NOT_INIT) continue; if (!print_all && !reg_scratched(env, i)) continue; verbose(env, " R%d", i); print_liveness(env, reg->live); verbose(env, "="); - if (t == SCALAR_VALUE && reg->precise) - verbose(env, "P"); - if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && - tnum_is_const(reg->var_off)) { - /* reg->off should be 0 for SCALAR_VALUE */ - verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); - verbose(env, "%lld", reg->var_off.value + reg->off); - } else { - const char *sep = ""; - - verbose(env, "%s", reg_type_str(env, t)); - if (base_type(t) == PTR_TO_BTF_ID) - verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); - verbose(env, "("); -/* - * _a stands for append, was shortened to avoid multiline statements below. - * This macro is used to output a comma separated list of attributes. - */ -#define verbose_a(fmt, ...) 
({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; }) - - if (reg->id) - verbose_a("id=%d", reg->id); - if (reg->ref_obj_id) - verbose_a("ref_obj_id=%d", reg->ref_obj_id); - if (type_is_non_owning_ref(reg->type)) - verbose_a("%s", "non_own_ref"); - if (t != SCALAR_VALUE) - verbose_a("off=%d", reg->off); - if (type_is_pkt_pointer(t)) - verbose_a("r=%d", reg->range); - else if (base_type(t) == CONST_PTR_TO_MAP || - base_type(t) == PTR_TO_MAP_KEY || - base_type(t) == PTR_TO_MAP_VALUE) - verbose_a("ks=%d,vs=%d", - reg->map_ptr->key_size, - reg->map_ptr->value_size); - if (tnum_is_const(reg->var_off)) { - /* Typically an immediate SCALAR_VALUE, but - * could be a pointer whose offset is too big - * for reg->off - */ - verbose_a("imm=%llx", reg->var_off.value); - } else { - print_scalar_ranges(env, reg, &sep); - if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose_a("var_off=%s", tn_buf); - } - } -#undef verbose_a - - verbose(env, ")"); - } + print_reg_state(env, reg); } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { char types_buf[BPF_REG_SIZE + 1]; -- cgit From 67d43dfbb42d6575304daea67733c88fbf536a1c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:19 -0800 Subject: bpf: print spilled register state in stack slot Print the same register state representation when printing stack state, as we do for normal registers. Note that if a stack slot contains a subregister spill (1, 2, or 4 bytes long), we'll still emit the "m0?" mask for those bytes that are not part of the spilled register. This means we can get something like fp-8=0000scalar() for a 4-byte spill with the other 4 bytes still being STACK_ZERO. Some examples, before and after, taken from the log of pyperf_subprogs.bpf.o: 49: (7b) *(u64 *)(r10 -256) = r1 ; frame1: R1_w=ctx(off=0,imm=0) R10=fp0 fp-256_w=ctx 49: (7b) *(u64 *)(r10 -256) = r1 ; frame1: R1_w=ctx(off=0,imm=0) R10=fp0 fp-256_w=ctx(off=0,imm=0) 150: (7b) *(u64 *)(r10 -264) = r0 ; frame1: R0_w=map_value_or_null(id=6,off=0,ks=192,vs=4,imm=0) R10=fp0 fp-264_w=map_value_or_null 150: (7b) *(u64 *)(r10 -264) = r0 ; frame1: R0_w=map_value_or_null(id=6,off=0,ks=192,vs=4,imm=0) R10=fp0 fp-264_w=map_value_or_null(id=6,off=0,ks=192,vs=4,imm=0) 5192: (61) r1 = *(u32 *)(r10 -272) ; frame1: R1_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=15,var_off=(0x0; 0xf)) R10=fp0 fp-272= 5192: (61) r1 = *(u32 *)(r10 -272) ; frame1: R1_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=15,var_off=(0x0; 0xf)) R10=fp0 fp-272=????scalar(smin=smin32=0,smax=umax=smax32=umax32=15,var_off=(0x0; 0xf)) While at it, do a few other simple cleanups: - skip a slot if it's not scratched, before detecting whether it's valid; - move taking the spilled_reg pointer outside of the switch (only DYNPTR has to adjust that to get to the "main" slot); - don't recalculate types_buf a second time for the MISC/ZERO/default case.
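To make the truncation rule concrete, here is a minimal user-space sketch; BPF_REG_SIZE and the slot-type characters mirror the verifier's definitions, but the enum layout and the harness around them are illustrative assumptions, not the in-tree code:

/* Hedged sketch: how a 4-byte subreg spill renders as "fp-8=0000scalar()". */
#include <stdio.h>

#define BPF_REG_SIZE 8

enum { STACK_INVALID, STACK_SPILL, STACK_MISC, STACK_ZERO };
static const char slot_type_char[] = { '?', 'r', 'm', '0' };

int main(void)
{
	/* low 4 bytes are STACK_ZERO, high 4 bytes hold a spilled subregister */
	unsigned char slot[BPF_REG_SIZE] = {
		STACK_ZERO, STACK_ZERO, STACK_ZERO, STACK_ZERO,
		STACK_SPILL, STACK_SPILL, STACK_SPILL, STACK_SPILL,
	};
	char types_buf[BPF_REG_SIZE + 1];
	int j;

	for (j = 0; j < BPF_REG_SIZE; j++)
		types_buf[j] = slot_type_char[slot[j]];
	types_buf[BPF_REG_SIZE] = '\0';

	/* keep only the MISC/ZERO/INVALID bytes that precede the spill */
	for (j = 0; j < BPF_REG_SIZE; j++)
		if (slot[j] == STACK_SPILL)
			break;
	types_buf[j] = '\0';

	printf("fp-8=%sscalar()\n", types_buf); /* -> fp-8=0000scalar() */
	return 0;
}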
Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 05d737e2fab3..97a1641e848e 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -618,7 +618,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st bool print_all) { const struct bpf_reg_state *reg; - enum bpf_reg_type t; int i; if (state->frameno) @@ -637,32 +636,38 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { char types_buf[BPF_REG_SIZE + 1]; bool valid = false; + u8 slot_type; int j; + if (!print_all && !stack_slot_scratched(env, i)) + continue; + for (j = 0; j < BPF_REG_SIZE; j++) { - if (state->stack[i].slot_type[j] != STACK_INVALID) + slot_type = state->stack[i].slot_type[j]; + if (slot_type != STACK_INVALID) valid = true; - types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; + types_buf[j] = slot_type_char[slot_type]; } types_buf[BPF_REG_SIZE] = 0; if (!valid) continue; - if (!print_all && !stack_slot_scratched(env, i)) - continue; + + reg = &state->stack[i].spilled_ptr; switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) { case STACK_SPILL: - reg = &state->stack[i].spilled_ptr; - t = reg->type; + /* print MISC/ZERO/INVALID slots above subreg spill */ + for (j = 0; j < BPF_REG_SIZE; j++) + if (state->stack[i].slot_type[j] == STACK_SPILL) + break; + types_buf[j] = '\0'; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, reg->live); - verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); - if (t == SCALAR_VALUE && reg->precise) - verbose(env, "P"); - if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) - verbose(env, "%lld", reg->var_off.value + reg->off); + verbose(env, "=%s", types_buf); + print_reg_state(env, reg); break; case STACK_DYNPTR: + /* skip to main dynptr slot */ i += BPF_DYNPTR_NR_SLOTS - 1; reg = &state->stack[i].spilled_ptr; @@ -674,7 +679,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st break; case STACK_ITER: /* only main slot has ref_obj_id set; skip others */ - reg = &state->stack[i].spilled_ptr; if (!reg->ref_obj_id) continue; @@ -688,12 +692,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st case STACK_MISC: case STACK_ZERO: default: - reg = &state->stack[i].spilled_ptr; - - for (j = 0; j < BPF_REG_SIZE; j++) - types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; - types_buf[BPF_REG_SIZE] = 0; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, reg->live); verbose(env, "=%s", types_buf); -- cgit From 0c95c9fdb696f35c7864785ba84cb9a50152daff Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:20 -0800 Subject: bpf: emit map name in register state if applicable and available In complicated real-world applications, whenever debugging some verification error through the verifier log, it is often very useful to see the map name for a PTR_TO_MAP_VALUE register. Usually this has to be inferred from key/value sizes and by trying to guess the C code location, but it's not always clear.
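As a purely hypothetical illustration of that ambiguity, the two maps below have identical key/value sizes, so a log line such as R1_w=map_value(off=0,ks=4,vs=8,imm=0) cannot tell them apart; the map names and the snippet itself are assumptions for the example, not taken from the patch:

/* Two maps with the same shape -- indistinguishable by ks=4,vs=8 alone. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 16);
	__type(key, __u32);
	__type(value, __u64);
} stats_map SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16);
	__type(key, __u32);
	__type(value, __u64);
} conf_map SEC(".maps");

/* With the map name emitted, the verifier log disambiguates:
 *   R1_w=map_value(map=stats_map,ks=4,vs=8,off=0,imm=0)
 */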
Given verifier has the name, and it's never too long, let's just emit it for ptr_to_map_key, ptr_to_map_value, and const_ptr_to_map registers. We reshuffle the order a bit, so that map name, key size, and value size appear before offset and immediate values, which seems like a more logical order. Current output: R1_w=map_ptr(map=array_map,ks=4,vs=8,off=0,imm=0) But we'll get rid of useless off=0 and imm=0 parts in the next patch. Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 24 ++++++++++++++++------ tools/testing/selftests/bpf/prog_tests/spin_lock.c | 10 ++++----- 2 files changed, 23 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 97a1641e848e..c209ab1ec2b5 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -553,6 +553,17 @@ static void print_scalar_ranges(struct bpf_verifier_env *env, } } +static bool type_is_map_ptr(enum bpf_reg_type t) { + switch (base_type(t)) { + case CONST_PTR_TO_MAP: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + return true; + default: + return false; + } +} + static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { enum bpf_reg_type t; @@ -584,16 +595,17 @@ static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_s verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (type_is_non_owning_ref(reg->type)) verbose_a("%s", "non_own_ref"); + if (type_is_map_ptr(t)) { + if (reg->map_ptr->name[0]) + verbose_a("map=%s", reg->map_ptr->name); + verbose_a("ks=%d,vs=%d", + reg->map_ptr->key_size, + reg->map_ptr->value_size); + } if (t != SCALAR_VALUE) verbose_a("off=%d", reg->off); if (type_is_pkt_pointer(t)) verbose_a("r=%d", reg->range); - else if (base_type(t) == CONST_PTR_TO_MAP || - base_type(t) == PTR_TO_MAP_KEY || - base_type(t) == PTR_TO_MAP_VALUE) - verbose_a("ks=%d,vs=%d", - reg->map_ptr->key_size, - reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { /* Typically an immediate SCALAR_VALUE, but * could be a pointer whose offset is too big diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index f29c08d93beb..ace65224286f 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -17,18 +17,18 @@ static struct { "R1_w=ptr_foo(id=2,ref_obj_id=2,off=0,imm=0) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" "R1 type=ptr_ expected=percpu_ptr_" }, { "lock_id_global_zero", - "; R1_w=map_value(off=0,ks=4,vs=4,imm=0)\n2: (85) call bpf_this_cpu_ptr#154\n" + "; R1_w=map_value(map=.data.A,ks=4,vs=4,off=0,imm=0)\n2: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_mapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" - " R0_w=map_value(id=1,off=0,ks=4,vs=8,imm=0)" - " R1_w=map_value(id=1,off=0,ks=4,vs=8,imm=0)\n" + " R0_w=map_value(id=1,map=array_map,ks=4,vs=8,off=0,imm=0)" + " R1_w=map_value(id=1,map=array_map,ks=4,vs=8,off=0,imm=0)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_innermapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" - " R0=map_value(id=2,off=0,ks=4,vs=8,imm=0)" - " R1_w=map_value(id=2,off=0,ks=4,vs=8,imm=0)\n" + " R0=map_value(id=2,ks=4,vs=8,off=0,imm=0)" + " R1_w=map_value(id=2,ks=4,vs=8,off=0,imm=0)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 
type=map_value expected=percpu_ptr_" }, { "lock_id_mismatch_kptr_kptr", "bpf_spin_unlock of different lock" }, -- cgit From 1db747d75b1dbe17bf4283ed87bd3b7a92010f34 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:21 -0800 Subject: bpf: omit default off=0 and imm=0 in register state log Simplify BPF verifier log further by omitting default (and frequently irrelevant) off=0 and imm=0 parts for non-SCALAR_VALUE registers. As can be seen from the fixed tests, this is often just visual noise for the PTR_TO_CTX register and even for PTR_TO_PACKET registers. Omitting default values follows the rest of the register state logic: we omit default values to keep the verifier log succinct and to highlight interesting state that deviates from the default one. E.g., we do the same for var_off, when it's unknown, which gives no additional information. Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 10 +++--- tools/testing/selftests/bpf/prog_tests/align.c | 42 +++++++++++----------- tools/testing/selftests/bpf/prog_tests/log_buf.c | 4 +-- tools/testing/selftests/bpf/prog_tests/spin_lock.c | 14 ++++---- .../selftests/bpf/progs/exceptions_assert.c | 10 +++--- 5 files changed, 39 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index c209ab1ec2b5..20b4f81087da 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -602,16 +602,14 @@ static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_s reg->map_ptr->key_size, reg->map_ptr->value_size); } - if (t != SCALAR_VALUE) + if (t != SCALAR_VALUE && reg->off) verbose_a("off=%d", reg->off); if (type_is_pkt_pointer(t)) verbose_a("r=%d", reg->range); if (tnum_is_const(reg->var_off)) { - /* Typically an immediate SCALAR_VALUE, but - * could be a pointer whose offset is too big - * for reg->off - */ - verbose_a("imm=%llx", reg->var_off.value); + /* a pointer register with fixed offset */ + if (reg->var_off.value) + verbose_a("imm=%llx", reg->var_off.value); } else { print_scalar_ranges(env, reg, &sep); if (!tnum_is_unknown(reg->var_off)) { diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 465c1c3a3d3c..4ebd0da898f5 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -40,7 +40,7 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {0, "R1", "ctx(off=0,imm=0)"}, + {0, "R1", "ctx()"}, {0, "R10", "fp0"}, {0, "R3_w", "2"}, {1, "R3_w", "4"}, @@ -68,7 +68,7 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {0, "R1", "ctx(off=0,imm=0)"}, + {0, "R1", "ctx()"}, {0, "R10", "fp0"}, {0, "R3_w", "1"}, {1, "R3_w", "2"}, @@ -97,7 +97,7 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {0, "R1", "ctx(off=0,imm=0)"}, + {0, "R1", "ctx()"}, {0, "R10", "fp0"}, {0, "R3_w", "4"}, {1, "R3_w", "8"}, @@ -119,7 +119,7 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {0, "R1", "ctx(off=0,imm=0)"}, + {0, "R1", "ctx()"}, {0, "R10", "fp0"}, {0, "R3_w", "7"}, {1, "R3_w", "7"}, @@ -162,13 +162,13 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {6, "R0_w", "pkt(off=8,r=8,imm=0)"}, +
{6, "R0_w", "pkt(off=8,r=8)"}, {6, "R3_w", "var_off=(0x0; 0xff)"}, {7, "R3_w", "var_off=(0x0; 0x1fe)"}, {8, "R3_w", "var_off=(0x0; 0x3fc)"}, {9, "R3_w", "var_off=(0x0; 0x7f8)"}, {10, "R3_w", "var_off=(0x0; 0xff0)"}, - {12, "R3_w", "pkt_end(off=0,imm=0)"}, + {12, "R3_w", "pkt_end()"}, {17, "R4_w", "var_off=(0x0; 0xff)"}, {18, "R4_w", "var_off=(0x0; 0x1fe0)"}, {19, "R4_w", "var_off=(0x0; 0xff0)"}, @@ -235,11 +235,11 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {2, "R5_w", "pkt(off=0,r=0,imm=0)"}, - {4, "R5_w", "pkt(off=14,r=0,imm=0)"}, - {5, "R4_w", "pkt(off=14,r=0,imm=0)"}, - {9, "R2", "pkt(off=0,r=18,imm=0)"}, - {10, "R5", "pkt(off=14,r=18,imm=0)"}, + {2, "R5_w", "pkt(r=0)"}, + {4, "R5_w", "pkt(off=14,r=0)"}, + {5, "R4_w", "pkt(off=14,r=0)"}, + {9, "R2", "pkt(r=18)"}, + {10, "R5", "pkt(off=14,r=18)"}, {10, "R4_w", "var_off=(0x0; 0xff)"}, {13, "R4_w", "var_off=(0x0; 0xffff)"}, {14, "R4_w", "var_off=(0x0; 0xffff)"}, @@ -299,7 +299,7 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(off=0,r=8,imm=0)"}, + {6, "R2_w", "pkt(r=8)"}, {7, "R6_w", "var_off=(0x0; 0x3fc)"}, /* Offset is added to packet pointer R5, resulting in * known fixed offset, and variable offset from R6. @@ -337,7 +337,7 @@ static struct bpf_align_test tests[] = { /* Constant offset is added to R5 packet pointer, * resulting in reg->off value of 14. */ - {26, "R5_w", "pkt(off=14,r=8,"}, + {26, "R5_w", "pkt(off=14,r=8)"}, /* Variable offset is added to R5, resulting in a * variable offset of (4n). See comment for insn #18 * for R4 = R5 trick. @@ -397,7 +397,7 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(off=0,r=8,imm=0)"}, + {6, "R2_w", "pkt(r=8)"}, {7, "R6_w", "var_off=(0x0; 0x3fc)"}, /* Adding 14 makes R6 be (4n+2) */ {8, "R6_w", "var_off=(0x2; 0x7fc)"}, @@ -459,7 +459,7 @@ static struct bpf_align_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .matches = { - {3, "R5_w", "pkt_end(off=0,imm=0)"}, + {3, "R5_w", "pkt_end()"}, /* (ptr - ptr) << 2 == unknown, (4n) */ {5, "R5_w", "var_off=(0x0; 0xfffffffffffffffc)"}, /* (4n) + 14 == (4n+2). We blow our bounds, because @@ -513,7 +513,7 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(off=0,r=8,imm=0)"}, + {6, "R2_w", "pkt(r=8)"}, {8, "R6_w", "var_off=(0x0; 0x3fc)"}, /* Adding 14 makes R6 be (4n+2) */ {9, "R6_w", "var_off=(0x2; 0x7fc)"}, @@ -566,7 +566,7 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(off=0,r=8,imm=0)"}, + {6, "R2_w", "pkt(r=8)"}, {9, "R6_w", "var_off=(0x0; 0x3c)"}, /* Adding 14 makes R6 be (4n+2) */ {10, "R6_w", "var_off=(0x2; 0x7c)"}, @@ -659,14 +659,14 @@ static int do_test_single(struct bpf_align_test *test) /* Check the next line as well in case the previous line * did not have a corresponding bpf insn. Example: * func#0 @0 - * 0: R1=ctx(off=0,imm=0) R10=fp0 + * 0: R1=ctx() R10=fp0 * 0: (b7) r3 = 2 ; R3_w=2 * * Sometimes it's actually two lines below, e.g. 
when * searching for "6: R3_w=scalar(umax=255,var_off=(0x0; 0xff))": - * from 4 to 6: R0_w=pkt(off=8,r=8,imm=0) R1=ctx(off=0,imm=0) R2_w=pkt(off=0,r=8,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0 - * 6: R0_w=pkt(off=8,r=8,imm=0) R1=ctx(off=0,imm=0) R2_w=pkt(off=0,r=8,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0 - * 6: (71) r3 = *(u8 *)(r2 +0) ; R2_w=pkt(off=0,r=8,imm=0) R3_w=scalar(umax=255,var_off=(0x0; 0xff)) + * from 4 to 6: R0_w=pkt(off=8,r=8) R1=ctx() R2_w=pkt(r=8) R3_w=pkt_end() R10=fp0 + * 6: R0_w=pkt(off=8,r=8) R1=ctx() R2_w=pkt(r=8) R3_w=pkt_end() R10=fp0 + * 6: (71) r3 = *(u8 *)(r2 +0) ; R2_w=pkt(r=8) R3_w=scalar(umax=255,var_off=(0x0; 0xff)) */ while (!(p = strstr(line_ptr, m.reg)) || !strstr(p, m.match)) { cur_line = -1; diff --git a/tools/testing/selftests/bpf/prog_tests/log_buf.c b/tools/testing/selftests/bpf/prog_tests/log_buf.c index fe9a23e65ef4..0f7ea4d7d9f6 100644 --- a/tools/testing/selftests/bpf/prog_tests/log_buf.c +++ b/tools/testing/selftests/bpf/prog_tests/log_buf.c @@ -78,7 +78,7 @@ static void obj_load_log_buf(void) ASSERT_OK_PTR(strstr(libbpf_log_buf, "prog 'bad_prog': BPF program load failed"), "libbpf_log_not_empty"); ASSERT_OK_PTR(strstr(obj_log_buf, "DATASEC license"), "obj_log_not_empty"); - ASSERT_OK_PTR(strstr(good_log_buf, "0: R1=ctx(off=0,imm=0) R10=fp0"), + ASSERT_OK_PTR(strstr(good_log_buf, "0: R1=ctx() R10=fp0"), "good_log_verbose"); ASSERT_OK_PTR(strstr(bad_log_buf, "invalid access to map value, value_size=16 off=16000 size=4"), "bad_log_not_empty"); @@ -175,7 +175,7 @@ static void bpf_prog_load_log_buf(void) opts.log_level = 2; fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "good_prog", "GPL", good_prog_insns, good_prog_insn_cnt, &opts); - ASSERT_OK_PTR(strstr(log_buf, "0: R1=ctx(off=0,imm=0) R10=fp0"), "good_log_2"); + ASSERT_OK_PTR(strstr(log_buf, "0: R1=ctx() R10=fp0"), "good_log_2"); ASSERT_GE(fd, 0, "good_fd2"); if (fd >= 0) close(fd); diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index ace65224286f..18d451be57c8 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -13,22 +13,22 @@ static struct { const char *err_msg; } spin_lock_fail_tests[] = { { "lock_id_kptr_preserve", - "5: (bf) r1 = r0 ; R0_w=ptr_foo(id=2,ref_obj_id=2,off=0,imm=0) " - "R1_w=ptr_foo(id=2,ref_obj_id=2,off=0,imm=0) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" + "5: (bf) r1 = r0 ; R0_w=ptr_foo(id=2,ref_obj_id=2) " + "R1_w=ptr_foo(id=2,ref_obj_id=2) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" "R1 type=ptr_ expected=percpu_ptr_" }, { "lock_id_global_zero", - "; R1_w=map_value(map=.data.A,ks=4,vs=4,off=0,imm=0)\n2: (85) call bpf_this_cpu_ptr#154\n" + "; R1_w=map_value(map=.data.A,ks=4,vs=4)\n2: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_mapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" - " R0_w=map_value(id=1,map=array_map,ks=4,vs=8,off=0,imm=0)" - " R1_w=map_value(id=1,map=array_map,ks=4,vs=8,off=0,imm=0)\n" + " R0_w=map_value(id=1,map=array_map,ks=4,vs=8)" + " R1_w=map_value(id=1,map=array_map,ks=4,vs=8)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_innermapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" - " R0=map_value(id=2,ks=4,vs=8,off=0,imm=0)" - " R1_w=map_value(id=2,ks=4,vs=8,off=0,imm=0)\n" + " R0=map_value(id=2,ks=4,vs=8)" + " R1_w=map_value(id=2,ks=4,vs=8)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value 
expected=percpu_ptr_" }, { "lock_id_mismatch_kptr_kptr", "bpf_spin_unlock of different lock" }, diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c index e1e5c54a6a11..26f7d67432cc 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_assert.c +++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c @@ -59,7 +59,7 @@ check_assert(s64, ge, neg, INT_MIN); SEC("?tc") __log_level(2) __failure -__msg(": R0=0 R1=ctx(off=0,imm=0) R2=scalar(smin=smin32=-2147483646,smax=smax32=2147483645) R10=fp0") +__msg(": R0=0 R1=ctx() R2=scalar(smin=smin32=-2147483646,smax=smax32=2147483645) R10=fp0") int check_assert_range_s64(struct __sk_buff *ctx) { struct bpf_sock *sk = ctx->sk; @@ -75,7 +75,7 @@ int check_assert_range_s64(struct __sk_buff *ctx) SEC("?tc") __log_level(2) __failure -__msg(": R1=ctx(off=0,imm=0) R2=scalar(smin=umin=smin32=umin32=4096,smax=umax=smax32=umax32=8192,var_off=(0x0; 0x3fff))") +__msg(": R1=ctx() R2=scalar(smin=umin=smin32=umin32=4096,smax=umax=smax32=umax32=8192,var_off=(0x0; 0x3fff))") int check_assert_range_u64(struct __sk_buff *ctx) { u64 num = ctx->len; @@ -86,7 +86,7 @@ int check_assert_range_u64(struct __sk_buff *ctx) SEC("?tc") __log_level(2) __failure -__msg(": R0=0 R1=ctx(off=0,imm=0) R2=4096 R10=fp0") +__msg(": R0=0 R1=ctx() R2=4096 R10=fp0") int check_assert_single_range_s64(struct __sk_buff *ctx) { struct bpf_sock *sk = ctx->sk; @@ -103,7 +103,7 @@ int check_assert_single_range_s64(struct __sk_buff *ctx) SEC("?tc") __log_level(2) __failure -__msg(": R1=ctx(off=0,imm=0) R2=4096 R10=fp0") +__msg(": R1=ctx() R2=4096 R10=fp0") int check_assert_single_range_u64(struct __sk_buff *ctx) { u64 num = ctx->len; @@ -114,7 +114,7 @@ int check_assert_single_range_u64(struct __sk_buff *ctx) SEC("?tc") __log_level(2) __failure -__msg(": R1=pkt(off=64,r=64,imm=0) R2=pkt_end(off=0,imm=0) R6=pkt(off=0,r=64,imm=0) R10=fp0") +__msg(": R1=pkt(off=64,r=64) R2=pkt_end() R6=pkt(r=64) R10=fp0") int check_assert_generic(struct __sk_buff *ctx) { u8 *data_end = (void *)(long)ctx->data_end; -- cgit From 0f8dbdbc641b45a5fa31d497f9fc83ffe1174fa3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:22 -0800 Subject: bpf: smarter verifier log number printing logic Instead of always printing numbers as either decimals (and in some cases, like for "imm=%llx", in hexadecimals), decide the form based on actual values. For numbers in a reasonably small range (currently, [0, U16_MAX] for unsigned values, and [S16_MIN, S16_MAX] for signed ones), emit them as decimals. In all other cases, even for signed values, emit them in hexadecimals. For large values, the hex form is oftentimes much more useful: it's easier to see an exact difference between 0xffffffff80000000 and 0xffffffff7fffffff than between 18446744071562067966 and 18446744071562067967, to pick one particular example. Small values representing small pointer offsets or application constants, on the other hand, are more usefully represented in decimal notation. Adjust the reg_bounds register state parsing logic to take this change into account.
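A user-space analogue of the threshold rule may help; the [S16_MIN, S16_MAX] bounds match the patch, but print_snum() below is only a sketch of verbose_snum(), not the kernel function itself:

#include <stdio.h>
#include <stdint.h>

/* Sketch of the decimal-vs-hex rule: small signed values stay decimal,
 * everything else is emitted in hex, as verbose_snum() does.
 */
static void print_snum(int64_t v)
{
	if (v >= INT16_MIN && v <= INT16_MAX)
		printf("%lld\n", (long long)v);
	else
		printf("%#llx\n", (unsigned long long)v);
}

int main(void)
{
	print_snum(-2);            /* -> -2 */
	print_snum(INT64_MIN + 2); /* -> 0x8000000000000002, easier to read
	                            * than -9223372036854775806 */
	return 0;
}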
Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 79 +++++++++++++++++++--- .../testing/selftests/bpf/prog_tests/reg_bounds.c | 53 +++++++++------ .../selftests/bpf/progs/exceptions_assert.c | 32 ++++----- 3 files changed, 118 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 20b4f81087da..87105aa482ed 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -509,10 +509,52 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "D"); } +#define UNUM_MAX_DECIMAL U16_MAX +#define SNUM_MAX_DECIMAL S16_MAX +#define SNUM_MIN_DECIMAL S16_MIN + +static bool is_unum_decimal(u64 num) +{ + return num <= UNUM_MAX_DECIMAL; +} + +static bool is_snum_decimal(s64 num) +{ + return num >= SNUM_MIN_DECIMAL && num <= SNUM_MAX_DECIMAL; +} + +static void verbose_unum(struct bpf_verifier_env *env, u64 num) +{ + if (is_unum_decimal(num)) + verbose(env, "%llu", num); + else + verbose(env, "%#llx", num); +} + +static void verbose_snum(struct bpf_verifier_env *env, s64 num) +{ + if (is_snum_decimal(num)) + verbose(env, "%lld", num); + else + verbose(env, "%#llx", num); +} + static void print_scalar_ranges(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, const char **sep) { + /* For signed ranges, we want to unify 64-bit and 32-bit values in the + * output as much as possible, but there is a bit of a complication. + * If we choose to print values as decimals, this is natural to do, + * because negative 64-bit and 32-bit values >= -S32_MIN have the same + * representation due to sign extension. But if we choose to print + * them in hex format (see is_snum_decimal()), then sign extension is + * misleading. + * E.g., smin=-2 and smin32=-2 are exactly the same in decimal, but in + * hex they will be smin=0xfffffffffffffffe and smin32=0xfffffffe, two + * very different numbers. + * So we avoid sign extension if we choose to print values in hex. + */ struct { const char *name; u64 val; @@ -522,8 +564,14 @@ static void print_scalar_ranges(struct bpf_verifier_env *env, {"smax", reg->smax_value, reg->smax_value == S64_MAX}, {"umin", reg->umin_value, reg->umin_value == 0}, {"umax", reg->umax_value, reg->umax_value == U64_MAX}, - {"smin32", (s64)reg->s32_min_value, reg->s32_min_value == S32_MIN}, - {"smax32", (s64)reg->s32_max_value, reg->s32_max_value == S32_MAX}, + {"smin32", + is_snum_decimal((s64)reg->s32_min_value) + ? (s64)reg->s32_min_value + : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN}, + {"smax32", + is_snum_decimal((s64)reg->s32_max_value) + ? (s64)reg->s32_max_value + : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX}, {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; @@ -549,7 +597,10 @@ static void print_scalar_ranges(struct bpf_verifier_env *env, verbose(env, "%s=", m2->name); } - verbose(env, m1->name[0] == 's' ? "%lld" : "%llu", m1->val); + if (m1->name[0] == 's') + verbose_snum(env, m1->val); + else + verbose_unum(env, m1->val); } } @@ -576,14 +627,14 @@ static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_s tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%s", t == SCALAR_VALUE ? 
"" : reg_type_str(env, t)); - verbose(env, "%lld", reg->var_off.value + reg->off); + verbose_snum(env, reg->var_off.value + reg->off); return; } /* * _a stands for append, was shortened to avoid multiline statements below. * This macro is used to output a comma separated list of attributes. */ -#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; }) +#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; }) verbose(env, "%s", reg_type_str(env, t)); if (base_type(t) == PTR_TO_BTF_ID) @@ -602,14 +653,20 @@ static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_s reg->map_ptr->key_size, reg->map_ptr->value_size); } - if (t != SCALAR_VALUE && reg->off) - verbose_a("off=%d", reg->off); - if (type_is_pkt_pointer(t)) - verbose_a("r=%d", reg->range); + if (t != SCALAR_VALUE && reg->off) { + verbose_a("off="); + verbose_snum(env, reg->off); + } + if (type_is_pkt_pointer(t)) { + verbose_a("r="); + verbose_unum(env, reg->range); + } if (tnum_is_const(reg->var_off)) { /* a pointer register with fixed offset */ - if (reg->var_off.value) - verbose_a("imm=%llx", reg->var_off.value); + if (reg->var_off.value) { + verbose_a("imm="); + verbose_snum(env, reg->var_off.value); + } } else { print_scalar_ranges(env, reg, &sep); if (!tnum_is_unknown(reg->var_off)) { diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index 7a8b0bf0a7f8..fd4ab23e6f54 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -13,10 +13,13 @@ */ #define U64_MAX ((u64)UINT64_MAX) #define U32_MAX ((u32)UINT_MAX) +#define U16_MAX ((u32)UINT_MAX) #define S64_MIN ((s64)INT64_MIN) #define S64_MAX ((s64)INT64_MAX) #define S32_MIN ((s32)INT_MIN) #define S32_MAX ((s32)INT_MAX) +#define S16_MIN ((s16)0x80000000) +#define S16_MAX ((s16)0x7fffffff) typedef unsigned long long ___u64; typedef unsigned int ___u32; @@ -138,13 +141,17 @@ static enum num_t t_unsigned(enum num_t t) } } +#define UNUM_MAX_DECIMAL U16_MAX +#define SNUM_MAX_DECIMAL S16_MAX +#define SNUM_MIN_DECIMAL S16_MIN + static bool num_is_small(enum num_t t, u64 x) { switch (t) { - case U64: return (u64)x <= 256; - case U32: return (u32)x <= 256; - case S64: return (s64)x >= -256 && (s64)x <= 256; - case S32: return (s32)x >= -256 && (s32)x <= 256; + case U64: return (u64)x <= UNUM_MAX_DECIMAL; + case U32: return (u32)x <= UNUM_MAX_DECIMAL; + case S64: return (s64)x >= SNUM_MIN_DECIMAL && (s64)x <= SNUM_MAX_DECIMAL; + case S32: return (s32)x >= SNUM_MIN_DECIMAL && (s32)x <= SNUM_MAX_DECIMAL; default: printf("num_is_small!\n"); exit(1); } } @@ -1023,20 +1030,19 @@ static int parse_reg_state(const char *s, struct reg_state *reg) */ struct { const char *pfx; - const char *fmt; u64 *dst, def; bool is_32, is_set; } *f, fields[8] = { - {"smin=", "%lld", ®->r[S64].a, S64_MIN}, - {"smax=", "%lld", ®->r[S64].b, S64_MAX}, - {"umin=", "%llu", ®->r[U64].a, 0}, - {"umax=", "%llu", ®->r[U64].b, U64_MAX}, - {"smin32=", "%lld", ®->r[S32].a, (u32)S32_MIN, true}, - {"smax32=", "%lld", ®->r[S32].b, (u32)S32_MAX, true}, - {"umin32=", "%llu", ®->r[U32].a, 0, true}, - {"umax32=", "%llu", ®->r[U32].b, U32_MAX, true}, + {"smin=", ®->r[S64].a, S64_MIN}, + {"smax=", ®->r[S64].b, S64_MAX}, + {"umin=", ®->r[U64].a, 0}, + {"umax=", ®->r[U64].b, U64_MAX}, + {"smin32=", ®->r[S32].a, (u32)S32_MIN, true}, + {"smax32=", ®->r[S32].b, (u32)S32_MAX, true}, + {"umin32=", ®->r[U32].a, 0, true}, + 
{"umax32=", ®->r[U32].b, U32_MAX, true}, }; - const char *p, *fmt; + const char *p; int i; p = strchr(s, '='); @@ -1050,8 +1056,13 @@ static int parse_reg_state(const char *s, struct reg_state *reg) long long sval; enum num_t t; - if (sscanf(p, "%lld", &sval) != 1) - return -EINVAL; + if (p[0] == '0' && p[1] == 'x') { + if (sscanf(p, "%llx", &sval) != 1) + return -EINVAL; + } else { + if (sscanf(p, "%lld", &sval) != 1) + return -EINVAL; + } reg->valid = true; for (t = first_t; t <= last_t; t++) { @@ -1075,9 +1086,13 @@ static int parse_reg_state(const char *s, struct reg_state *reg) if (mcnt) { /* populate all matched fields */ - fmt = fields[midxs[0]].fmt; - if (sscanf(p, fmt, &val) != 1) - return -EINVAL; + if (p[0] == '0' && p[1] == 'x') { + if (sscanf(p, "%llx", &val) != 1) + return -EINVAL; + } else { + if (sscanf(p, "%lld", &val) != 1) + return -EINVAL; + } for (i = 0; i < mcnt; i++) { f = &fields[midxs[i]]; diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c index 26f7d67432cc..49efaed143fc 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_assert.c +++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c @@ -18,48 +18,48 @@ return *(u64 *)num; \ } -__msg(": R0_w=-2147483648 R10=fp0") +__msg(": R0_w=0xffffffff80000000 R10=fp0") check_assert(s64, eq, int_min, INT_MIN); -__msg(": R0_w=2147483647 R10=fp0") +__msg(": R0_w=0x7fffffff R10=fp0") check_assert(s64, eq, int_max, INT_MAX); __msg(": R0_w=0 R10=fp0") check_assert(s64, eq, zero, 0); -__msg(": R0_w=-9223372036854775808 R1_w=-9223372036854775808 R10=fp0") +__msg(": R0_w=0x8000000000000000 R1_w=0x8000000000000000 R10=fp0") check_assert(s64, eq, llong_min, LLONG_MIN); -__msg(": R0_w=9223372036854775807 R1_w=9223372036854775807 R10=fp0") +__msg(": R0_w=0x7fffffffffffffff R1_w=0x7fffffffffffffff R10=fp0") check_assert(s64, eq, llong_max, LLONG_MAX); -__msg(": R0_w=scalar(smax=2147483646) R10=fp0") +__msg(": R0_w=scalar(smax=0x7ffffffe) R10=fp0") check_assert(s64, lt, pos, INT_MAX); -__msg(": R0_w=scalar(smax=-1,umin=9223372036854775808,var_off=(0x8000000000000000; 0x7fffffffffffffff))") +__msg(": R0_w=scalar(smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") check_assert(s64, lt, zero, 0); -__msg(": R0_w=scalar(smax=-2147483649,umin=9223372036854775808,umax=18446744071562067967,var_off=(0x8000000000000000; 0x7fffffffffffffff))") +__msg(": R0_w=scalar(smax=0xffffffff7fffffff,umin=0x8000000000000000,umax=0xffffffff7fffffff,var_off=(0x8000000000000000; 0x7fffffffffffffff))") check_assert(s64, lt, neg, INT_MIN); -__msg(": R0_w=scalar(smax=2147483647) R10=fp0") +__msg(": R0_w=scalar(smax=0x7fffffff) R10=fp0") check_assert(s64, le, pos, INT_MAX); __msg(": R0_w=scalar(smax=0) R10=fp0") check_assert(s64, le, zero, 0); -__msg(": R0_w=scalar(smax=-2147483648,umin=9223372036854775808,umax=18446744071562067968,var_off=(0x8000000000000000; 0x7fffffffffffffff))") +__msg(": R0_w=scalar(smax=0xffffffff80000000,umin=0x8000000000000000,umax=0xffffffff80000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") check_assert(s64, le, neg, INT_MIN); -__msg(": R0_w=scalar(smin=umin=2147483648,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0_w=scalar(smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, gt, pos, INT_MAX); -__msg(": R0_w=scalar(smin=umin=1,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": 
R0_w=scalar(smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, gt, zero, 0); -__msg(": R0_w=scalar(smin=-2147483647) R10=fp0") +__msg(": R0_w=scalar(smin=0xffffffff80000001) R10=fp0") check_assert(s64, gt, neg, INT_MIN); -__msg(": R0_w=scalar(smin=umin=2147483647,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0_w=scalar(smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, ge, pos, INT_MAX); -__msg(": R0_w=scalar(smin=0,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff)) R10=fp0") +__msg(": R0_w=scalar(smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff)) R10=fp0") check_assert(s64, ge, zero, 0); -__msg(": R0_w=scalar(smin=-2147483648) R10=fp0") +__msg(": R0_w=scalar(smin=0xffffffff80000000) R10=fp0") check_assert(s64, ge, neg, INT_MIN); SEC("?tc") __log_level(2) __failure -__msg(": R0=0 R1=ctx() R2=scalar(smin=smin32=-2147483646,smax=smax32=2147483645) R10=fp0") +__msg(": R0=0 R1=ctx() R2=scalar(smin=0xffffffff80000002,smax=smax32=0x7ffffffd,smin32=0x80000002) R10=fp0") int check_assert_range_s64(struct __sk_buff *ctx) { struct bpf_sock *sk = ctx->sk; -- cgit From 46862ee854b4f5a315d63b677ca3af14a89aefeb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:23 -0800 Subject: bpf: emit frameno for PTR_TO_STACK regs if it differs from current one It's possible to pass a pointer to a parent's stack to child subprogs. In such a case the verifier state output is ambiguous: it doesn't show whether the register contains a pointer to the "current" stack, belonging to the current subprog (frame), or a pointer to one of the parent frames. So emit this information when the register's frame number differs from the frame of the state being printed. E.g., if the current state is in frame 2 and it has a register pointing to the stack of a grandparent state (frame #0), we'll see something like 'R1=fp[0]-16', while a "local stack pointer" will be just 'R2=fp-16'. Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 87105aa482ed..3505f3e5ae96 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -615,7 +615,9 @@ static bool type_is_map_ptr(enum bpf_reg_type t) { } } -static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) +static void print_reg_state(struct bpf_verifier_env *env, + const struct bpf_func_state *state, + const struct bpf_reg_state *reg) { enum bpf_reg_type t; const char *sep = ""; @@ -623,10 +625,8 @@ static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_s t = reg->type; if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); - if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && - tnum_is_const(reg->var_off)) { + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ - verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); verbose_snum(env, reg->var_off.value + reg->off); return; } @@ -637,6 +637,14 @@ static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_s #define verbose_a(fmt, ...)
({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; }) verbose(env, "%s", reg_type_str(env, t)); + if (t == PTR_TO_STACK) { + if (state->frameno != reg->frameno) + verbose(env, "[%d]", reg->frameno); + if (tnum_is_const(reg->var_off)) { + verbose_snum(env, reg->var_off.value + reg->off); + return; + } + } if (base_type(t) == PTR_TO_BTF_ID) verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); verbose(env, "("); @@ -698,7 +706,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st verbose(env, " R%d", i); print_liveness(env, reg->live); verbose(env, "="); - print_reg_state(env, reg); + print_reg_state(env, state, reg); } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { char types_buf[BPF_REG_SIZE + 1]; @@ -731,7 +739,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, reg->live); verbose(env, "=%s", types_buf); - print_reg_state(env, reg); + print_reg_state(env, state, reg); break; case STACK_DYNPTR: /* skip to main dynptr slot */ -- cgit From 2d1618054f25e11c44d189dbff4a60342a4cfb4b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 14 Nov 2023 17:32:34 +0100 Subject: bpf: task_group_seq_get_next: use __next_thread() rather than next_thread() Lockless use of next_thread() should be avoided, kernel/bpf/task_iter.c is the last user and the usage is wrong. task_group_seq_get_next() can return the group leader twice if it races with mt-thread exec which changes the group->leader's pid. Change the main loop to use __next_thread(), kill "next_tid == common->pid" check. __next_thread() can't loop forever, we can also change this code to retry if next_tid == 0. Signed-off-by: Oleg Nesterov Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231114163234.GA890@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/task_iter.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 26082b97894d..51ae15e2b290 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -70,15 +70,13 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm return NULL; retry: - task = next_thread(task); + task = __next_thread(task); + if (!task) + return NULL; next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns); - if (!next_tid || next_tid == common->pid) { - /* Run out of tasks of a process. The tasks of a - * thread_group are linked as circular linked list. - */ - return NULL; - } + if (!next_tid) + goto retry; if (skip_if_dup_files && task->files == task->group_leader->files) goto retry; -- cgit From 5a34f9dabd9aa567e2d37e1aa27a67f80acfaa1c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 14 Nov 2023 17:32:37 +0100 Subject: bpf: bpf_iter_task_next: use __next_thread() rather than next_thread() Lockless use of next_thread() should be avoided, kernel/bpf/task_iter.c is the last user and the usage is wrong. bpf_iter_task_next() can loop forever, "kit->pos == kit->task" can never happen if kit->pos execs. Change this code to use __next_thread(). With or without this change the usage of kit->pos/task and next_task() doesn't look nice, see the next patch. 
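The difference between the two walks can be seen with a plain user-space list; this analogue only illustrates the termination shapes (circular with a sentinel vs. NULL-terminated), not the kernel's task-list handling:

#include <stdio.h>
#include <stddef.h>

struct node { struct node *next; int id; };

/* circular walk: termination depends on getting back to 'start';
 * if 'start' is replaced underneath us (the mt exec race above),
 * the sentinel is never reached and the loop spins forever
 */
static void walk_circular(struct node *start)
{
	for (struct node *n = start->next; n != start; n = n->next)
		printf("thread %d\n", n->id);
}

/* NULL-terminated walk: no sentinel node is needed at all */
static void walk_null_terminated(struct node *start)
{
	for (struct node *n = start->next; n; n = n->next)
		printf("thread %d\n", n->id);
}

int main(void)
{
	struct node c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };

	walk_null_terminated(&a);	/* prints threads 2 and 3 */

	c.next = &a;			/* close the ring */
	walk_circular(&a);		/* prints threads 2 and 3 */
	return 0;
}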
Signed-off-by: Oleg Nesterov Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231114163237.GA897@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/task_iter.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 51ae15e2b290..d42e08d0d0b7 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -1015,12 +1015,11 @@ __bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) if (flags == BPF_TASK_ITER_ALL_PROCS) goto get_next_task; - kit->pos = next_thread(kit->pos); - if (kit->pos == kit->task) { - if (flags == BPF_TASK_ITER_PROC_THREADS) { - kit->pos = NULL; + kit->pos = __next_thread(kit->pos); + if (!kit->pos) { + if (flags == BPF_TASK_ITER_PROC_THREADS) return pos; - } + kit->pos = kit->task; } else return pos; -- cgit From ac8148d957f50434411a0c15a2e4f352b5bb4ff2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 14 Nov 2023 17:32:39 +0100 Subject: bpf: bpf_iter_task_next: use next_task(kit->task) rather than next_task(kit->pos) This looks clearer and simplifies the code. While at it, remove the unnecessary initialization of pos/task at the start of bpf_iter_task_new(). Note that we can even kill kit->task, we can just use pos->group_leader, but I don't understand the BUILD_BUG_ON() checks in bpf_iter_task_new(). Signed-off-by: Oleg Nesterov Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231114163239.GA903@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/task_iter.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index d42e08d0d0b7..e5c3500443c6 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -978,7 +978,6 @@ __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) != __alignof__(struct bpf_iter_task)); - kit->task = kit->pos = NULL; switch (flags) { case BPF_TASK_ITER_ALL_THREADS: case BPF_TASK_ITER_ALL_PROCS: @@ -1016,18 +1015,15 @@ __bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) goto get_next_task; kit->pos = __next_thread(kit->pos); - if (!kit->pos) { - if (flags == BPF_TASK_ITER_PROC_THREADS) - return pos; - kit->pos = kit->task; - } else + if (kit->pos || flags == BPF_TASK_ITER_PROC_THREADS) return pos; get_next_task: - kit->pos = next_task(kit->pos); - kit->task = kit->pos; - if (kit->pos == &init_task) + kit->task = next_task(kit->task); + if (kit->task == &init_task) kit->pos = NULL; + else + kit->pos = kit->task; return pos; } -- cgit From 49277a5b76373e630075ff7d32fc0f9f51294f24 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 Nov 2023 21:18:40 -0500 Subject: workqueue: Move workqueue_set_unbound_cpumask() and its helpers inside CONFIG_SYSFS Commit fe28f631fa94 ("workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask") makes workqueue_set_unbound_cpumask() static as it is not used elsewhere in the kernel. However, this triggers a kernel test robot warning about 'workqueue_set_unbound_cpumask' defined but not used when CONFIG_SYSFS isn't defined. It happens that workqueue_set_unbound_cpumask() is only called when CONFIG_SYSFS is defined. Move workqueue_set_unbound_cpumask() and its helpers inside the CONFIG_SYSFS compilation block to avoid the warning. There is no functional change.
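The warning class is easy to reproduce in isolation; this sketch uses hypothetical names and only mirrors the pattern being fixed (a static function must live inside the same #ifdef as its only callers):

#include <stdio.h>

#ifdef CONFIG_SYSFS
/* definition and its sole caller share the conditional block */
static int set_mask(int m)
{
	return m & 0xff;
}

void sysfs_store(int m)
{
	printf("%d\n", set_mask(m));
}
#endif

/* Had set_mask() been defined outside the #ifdef, a build without
 * CONFIG_SYSFS would warn: 'set_mask' defined but not used
 * [-Wunused-function].
 */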
Fixes: fe28f631fa94 ("workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311130831.uh0AoCd1-lkp@intel.com/ Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/workqueue.c | 102 ++++++++++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bd9d34eacd78..2fc585d3d6ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4417,19 +4417,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) mutex_unlock(&ctx->wq->mutex); } -static void apply_wqattrs_lock(void) -{ - /* CPUs should stay stable across pwq creations and installations */ - cpus_read_lock(); - mutex_lock(&wq_pool_mutex); -} - -static void apply_wqattrs_unlock(void) -{ - mutex_unlock(&wq_pool_mutex); - cpus_read_unlock(); -} - static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { @@ -5833,44 +5820,6 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) return ret; } -/** - * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask - * @cpumask: the cpumask to set - * - * The low-level workqueues cpumask is a global cpumask that limits - * the affinity of all unbound workqueues. This function check the @cpumask - * and apply it to all unbound workqueues and updates all pwqs of them. - * - * Return: 0 - Success - * -EINVAL - Invalid @cpumask - * -ENOMEM - Failed to allocate memory for attrs or pwqs. - */ -static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) -{ - int ret = -EINVAL; - - /* - * Not excluding isolated cpus on purpose. - * If the user wishes to include them, we allow that. - */ - cpumask_and(cpumask, cpumask, cpu_possible_mask); - if (!cpumask_empty(cpumask)) { - apply_wqattrs_lock(); - cpumask_copy(wq_requested_unbound_cpumask, cpumask); - if (cpumask_equal(cpumask, wq_unbound_cpumask)) { - ret = 0; - goto out_unlock; - } - - ret = workqueue_apply_unbound_cpumask(cpumask); - -out_unlock: - apply_wqattrs_unlock(); - } - - return ret; -} - /** * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask @@ -6027,6 +5976,19 @@ static struct attribute *wq_sysfs_attrs[] = { }; ATTRIBUTE_GROUPS(wq_sysfs); +static void apply_wqattrs_lock(void) +{ + /* CPUs should stay stable across pwq creations and installations */ + cpus_read_lock(); + mutex_lock(&wq_pool_mutex); +} + +static void apply_wqattrs_unlock(void) +{ + mutex_unlock(&wq_pool_mutex); + cpus_read_unlock(); +} + static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -6203,6 +6165,44 @@ static struct bus_type wq_subsys = { .dev_groups = wq_sysfs_groups, }; +/** + * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask + * @cpumask: the cpumask to set + * + * The low-level workqueues cpumask is a global cpumask that limits + * the affinity of all unbound workqueues. This function check the @cpumask + * and apply it to all unbound workqueues and updates all pwqs of them. + * + * Return: 0 - Success + * -EINVAL - Invalid @cpumask + * -ENOMEM - Failed to allocate memory for attrs or pwqs. + */ +static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +{ + int ret = -EINVAL; + + /* + * Not excluding isolated cpus on purpose. 
+ * If the user wishes to include them, we allow that. + */ + cpumask_and(cpumask, cpumask, cpu_possible_mask); + if (!cpumask_empty(cpumask)) { + apply_wqattrs_lock(); + cpumask_copy(wq_requested_unbound_cpumask, cpumask); + if (cpumask_equal(cpumask, wq_unbound_cpumask)) { + ret = 0; + goto out_unlock; + } + + ret = workqueue_apply_unbound_cpumask(cpumask); + +out_unlock: + apply_wqattrs_unlock(); + } + + return ret; +} + static ssize_t __wq_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf, cpumask_var_t mask) { -- cgit From a89299c40911ee29c6ec4fb66f9c598cd947265b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Nov 2023 13:58:25 +0100 Subject: time: Make sysfs_get_uname() function visible in header This function is defined globally in clocksource.c and used conditionally in clockevent.c, with the declaration hidden when clockevent support is disabled. This causes a harmless warning in the definition: kernel/time/clocksource.c:1324:9: warning: no previous prototype for 'sysfs_get_uname' [-Wmissing-prototypes] 1324 | ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) Move the declaration out of the #ifdef so it is always visible. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231108125843.3806765-5-arnd@kernel.org --- kernel/time/tick-internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 649f2b48e8f0..481b7ab65e2c 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -56,7 +56,6 @@ extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force); extern void clockevents_handle_noop(struct clock_event_device *dev); extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); -extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); /* Broadcasting support */ # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST @@ -197,3 +196,5 @@ void hrtimers_resume_local(void); #else #define JIFFIES_SHIFT 8 #endif + +extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); -- cgit From 4a6c5607d4502ccd1b15b57d57f17d12b6f257a7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Nov 2023 11:39:36 -1000 Subject: workqueue: Make sure that wq_unbound_cpumask is never empty During boot, depending on how the housekeeping and workqueue.unbound_cpus masks are set, wq_unbound_cpumask can end up empty. Since 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues"), this may end up feeding -1 as a CPU number into the scheduler, leading to oopses. BUG: unable to handle page fault for address: ffffffff8305e9c0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page ... Call Trace: select_idle_sibling+0x79/0xaf0 select_task_rq_fair+0x1cb/0x7b0 try_to_wake_up+0x29c/0x5c0 wake_up_process+0x19/0x20 kick_pool+0x5e/0xb0 __queue_work+0x119/0x430 queue_work_on+0x29/0x30 ... An empty wq_unbound_cpumask is a clear misconfiguration and is already disallowed once the system is booted up. Let's warn on and ignore unbound_cpumask restrictions which lead to no unbound CPUs. While at it, also remove the now unnecessary empty check on wq_unbound_cpumask in wq_select_unbound_cpu().
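The fix amounts to "apply a restriction only if it leaves at least one CPU". A user-space bitmask analogue of that rule (illustrative only; kernel cpumasks are not plain u64 words):

#include <stdio.h>
#include <stdint.h>

/* Apply 'mask' to 'cpus' only when the intersection is non-empty,
 * mirroring restrict_unbound_cpumask() in spirit.
 */
static uint64_t restrict_cpus(uint64_t cpus, uint64_t mask, const char *name)
{
	if (!(cpus & mask)) {
		fprintf(stderr, "restricting with %s leaves no CPU, ignoring\n", name);
		return cpus;
	}
	return cpus & mask;
}

int main(void)
{
	uint64_t cpus = 0xf;				/* CPUs 0-3 possible */

	cpus = restrict_cpus(cpus, 0xc, "HK_TYPE_WQ");	/* -> CPUs 2-3 */
	cpus = restrict_cpus(cpus, 0x3, "cmdline");	/* empty: ignored */
	printf("%#llx\n", (unsigned long long)cpus);	/* -> 0xc */
	return 0;
}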
Signed-off-by: Tejun Heo Reported-and-Tested-by: Yong He Link: http://lkml.kernel.org/r/20231120121623.119780-1-alexyonghe@tencent.com Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues") Cc: stable@vger.kernel.org # v6.6+ Reviewed-by: Waiman Long --- kernel/workqueue.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6e578f576a6f..2989b57e154a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1684,9 +1684,6 @@ static int wq_select_unbound_cpu(int cpu) pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); } - if (cpumask_empty(wq_unbound_cpumask)) - return cpu; - new_cpu = __this_cpu_read(wq_rr_cpu_last); new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); if (unlikely(new_cpu >= nr_cpu_ids)) { @@ -6515,6 +6512,17 @@ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ +static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask) +{ + if (!cpumask_intersects(wq_unbound_cpumask, mask)) { + pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n", + cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask)); + return; + } + + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask); +} + /** * workqueue_init_early - early init for workqueue subsystem * @@ -6534,11 +6542,11 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); - cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); - + cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); + restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) - cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); + restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit From 69dcbbd80421a5d8230c178e01869f8e2edb2317 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Oct 2023 09:52:19 -0700 Subject: locktorture: Increase Hamming distance between call_rcu_chain and rcu_call_chains One letter difference is really not enough, so this commit changes call_rcu_chain to call_rcu_chain_list. Reported-by: Dan Carpenter Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/locking/locktorture.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 69d3cd2cfc3b..415d81e6ce70 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -124,7 +124,7 @@ struct call_rcu_chain { struct rcu_head crc_rh; bool crc_stop; }; -struct call_rcu_chain *call_rcu_chain; +struct call_rcu_chain *call_rcu_chain_list; /* Forward reference. 
*/ static void lock_torture_cleanup(void); @@ -1074,12 +1074,12 @@ static int call_rcu_chain_init(void) if (call_rcu_chains <= 0) return 0; - call_rcu_chain = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain), GFP_KERNEL); - if (!call_rcu_chain) + call_rcu_chain_list = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain_list), GFP_KERNEL); + if (!call_rcu_chain_list) return -ENOMEM; for (i = 0; i < call_rcu_chains; i++) { - call_rcu_chain[i].crc_stop = false; - call_rcu(&call_rcu_chain[i].crc_rh, call_rcu_chain_cb); + call_rcu_chain_list[i].crc_stop = false; + call_rcu(&call_rcu_chain_list[i].crc_rh, call_rcu_chain_cb); } return 0; } @@ -1089,13 +1089,13 @@ static void call_rcu_chain_cleanup(void) { int i; - if (!call_rcu_chain) + if (!call_rcu_chain_list) return; for (i = 0; i < call_rcu_chains; i++) - smp_store_release(&call_rcu_chain[i].crc_stop, true); + smp_store_release(&call_rcu_chain_list[i].crc_stop, true); rcu_barrier(); - kfree(call_rcu_chain); - call_rcu_chain = NULL; + kfree(call_rcu_chain_list); + call_rcu_chain_list = NULL; } static void lock_torture_cleanup(void) -- cgit From 90f1015dfee3d33f8ca7bfe03296d100d465e385 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 3 Nov 2023 15:26:39 +0800 Subject: rcutorture: Add fqs_holdoff check before fqs_task is created For rcutorture tests on RCU implementations that support force-quiescent-state operations and that set the fqs_duration module parameter greater than zero, the fqs_task kthread will be created. However, if the fqs_holdoff module parameter is not set, then its default value of zero will cause the fqs_task to enter a long-term busy loop until stopped by kthread_stop(). This commit therefore adds a fqs_holdoff check before the fqs_task is created, making sure that whenever the fqs_task is created, the fqs_holdoff will be greater than zero. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 30fc9d34e329..a0b2520bd32b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -3872,7 +3872,9 @@ rcu_torture_init(void) } if (fqs_duration < 0) fqs_duration = 0; - if (fqs_duration) { + if (fqs_holdoff < 0) + fqs_holdoff = 0; + if (fqs_duration && fqs_holdoff) { /* Create the fqs thread */ firsterr = torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); -- cgit From 50181c0cff31281b9f1071575ffba8a102375ece Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 22 Nov 2023 15:01:19 +0100 Subject: sched/pelt: Avoid underestimation of task utilization Lukasz Luba reported that a thread's util_est can significantly decrease as a result of sharing the CPU with other threads. The use case can be easily reproduced with a periodic task TA that runs 1ms and sleeps 100us. When the task is alone on the CPU, its max utilization and its util_est are around 888. If another similar task starts to run on the same CPU, TA will have to share the CPU runtime and its maximum utilization will decrease to around half of the CPU capacity (512), and TA's util_est will then follow this new maximum trend, which is only the result of sharing the CPU with other tasks. Such a situation can be detected with runnable_avg, which is close or equal to util_avg when TA is alone, but increases above util_avg when TA shares the CPU with other threads and waits on the runqueue.
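In condensed form, the detection added by the patch below is a single comparison in util_est_update() (sketch; ue is the task's util_est snapshot):

	/*
	 * The task was runnable for more time than it actually ran, so it
	 * did not get all the CPU time it wanted: do not trust the current
	 * (shared-CPU) utilization as a basis for lowering the EWMA.
	 */
	if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p))
		goto done;	/* skip the util_est EWMA update */

where task_runnable() simply reads p->se.avg.runnable_avg.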
[ We prefer a util_est that overestimates rather than underestimates, because in the first case we will not provide enough performance to the task, which will remain under-provisioned, whereas in the other case we will create some idle time which will enable contention to be reduced and, as a result, reduce the util_est; so the overestimate will be transient whereas the underestimate would remain. ] [ mingo: Refined the changelog, added comments from the LKML discussion. ] Reported-by: Lukasz Luba Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/lkml/CAKfTPtDd-HhF-YiNTtL9i5k0PfJbF819Yxu4YquzfXgwi7voyw@mail.gmail.com/#t Link: https://lore.kernel.org/r/20231122140119.472110-1-vincent.guittot@linaro.org Cc: Hongyan Xia --- kernel/sched/fair.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 07f555857698..53dea95ad8c9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4774,6 +4774,11 @@ static inline unsigned long task_util(struct task_struct *p) return READ_ONCE(p->se.avg.util_avg); } +static inline unsigned long task_runnable(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.runnable_avg); +} + static inline unsigned long _task_util_est(struct task_struct *p) { struct util_est ue = READ_ONCE(p->se.avg.util_est); @@ -4892,6 +4897,14 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) return; + /* + * To avoid underestimate of task utilization, skip updates of EWMA if + * we cannot grant that thread got all CPU time it wanted. + */ + if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p)) + goto done; + + /* * Update Task's estimated utilization * -- cgit From 9c0b4bb7f6303c9c4e2e34984c46f5a86478f84d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 22 Nov 2023 14:39:03 +0100 Subject: sched/cpufreq: Rework schedutil governor performance estimation The current method to take into account uclamp hints when estimating the target frequency can end in a situation where the selected target frequency is finally higher than uclamp hints, whereas there is no real need. Such cases mainly happen because we are currently mixing the traditional scheduler utilization signal with the uclamp performance hints. By adding these 2 metrics, we lose important information when it comes to selecting the target frequency, and we have to make some assumptions which can't fit all cases. Rework the interface between the scheduler and the schedutil governor in order to propagate all information down to the cpufreq governor. effective_cpu_util() interface changes and now returns the actual utilization of the CPU with 2 optional inputs: - The minimum performance for this CPU; typically the capacity to handle the deadline task and the interrupt pressure. But also the uclamp_min request when available. - The maximum targeted performance for this CPU which reflects the maximum level that we would like to not exceed. By default it will be the CPU capacity but can be reduced because of some performance hints set with uclamp. The value can be lower than actual utilization and/or min performance level. A new sugov_effective_cpu_perf() interface is also available to compute the final performance level that is targeted for the CPU, after applying some cpufreq headroom and taking into account all inputs. With these 2 functions, schedutil is now able to decide when it must go above uclamp hints, as sketched below.
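Schematically, the reworked interface boils down to (signatures taken from the patch; comments paraphrase the semantics described above):

	/* Actual CPU utilization, plus an optional floor and ceiling. */
	unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
					 unsigned long *min, unsigned long *max);

	/* Final target: max(min, min(actual + headroom, max)). */
	unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
					       unsigned long min, unsigned long max);

A caller such as sugov_get_util() first asks for util/min/max and then derives the target performance level from all three.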
It now also has a generic way to get the min performance level. The dependency between energy model and cpufreq governor and its headroom policy doesn't exist anymore. eenv_pd_max_util() asks schedutil for the targeted performance after applying the impact of the waking task. [ mingo: Refined the changelog & C comments. ] Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20231122133904.446032-2-vincent.guittot@linaro.org --- include/linux/energy_model.h | 1 - kernel/sched/core.c | 90 +++++++++++++++++----------------------- kernel/sched/cpufreq_schedutil.c | 35 +++++++++++----- kernel/sched/fair.c | 22 ++++++++-- kernel/sched/sched.h | 24 ++++------- 5 files changed, 89 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index b9caa01dfac4..adec808b371a 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -243,7 +243,6 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, scale_cpu = arch_scale_cpu_capacity(cpu); ps = &pd->table[pd->nr_perf_states - 1]; - max_util = map_util_perf(max_util); max_util = min(max_util, allowed_cpu_cap); freq = map_util_freq(max_util, ps->frequency, scale_cpu); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2de77a6d5ef8..db4be4921e7f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7467,18 +7467,13 @@ int sched_core_idle_cpu(int cpu) * required to meet deadlines. */ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - enum cpu_util_type type, - struct task_struct *p) + unsigned long *min, + unsigned long *max) { - unsigned long dl_util, util, irq, max; + unsigned long util, irq, scale; struct rq *rq = cpu_rq(cpu); - max = arch_scale_cpu_capacity(cpu); - - if (!uclamp_is_used() && - type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { - return max; - } + scale = arch_scale_cpu_capacity(cpu); /* * Early check to see if IRQ/steal time saturates the CPU, can be @@ -7486,45 +7481,49 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, * update_irq_load_avg(). */ irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) - return max; + if (unlikely(irq >= scale)) { + if (min) + *min = scale; + if (max) + *max = scale; + return scale; + } + + if (min) { + /* + * The minimum utilization returns the highest level between: + * - the computed DL bandwidth needed with the IRQ pressure which + * steals time to the deadline task. + * - The minimum performance requirement for CFS and/or RT. + */ + *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); + + /* + * When an RT task is runnable and uclamp is not used, we must + * ensure that the task will run at maximum compute capacity. + */ + if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) + *min = max(*min, scale); + } /* * Because the time spend on RT/DL tasks is visible as 'lost' time to * CFS tasks and we use the same metric to track the effective * utilization (PELT windows are synchronized) we can directly add them * to obtain the CPU's actual utilization. - * - * CFS and RT utilization can be boosted or capped, depending on - * utilization clamp constraints requested by currently RUNNABLE - * tasks. - * When there are no CFS RUNNABLE tasks, clamps are released and - * frequency will be gracefully reduced with the utilization decay. 
*/ util = util_cfs + cpu_util_rt(rq); - if (type == FREQUENCY_UTIL) - util = uclamp_rq_util_with(rq, util, p); - - dl_util = cpu_util_dl(rq); + util += cpu_util_dl(rq); /* - * For frequency selection we do not make cpu_util_dl() a permanent part - * of this sum because we want to use cpu_bw_dl() later on, but we need - * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such - * that we select f_max when there is no idle time. - * - * NOTE: numerical errors or stop class might cause us to not quite hit - * saturation when we should -- something for later. + * The maximum hint is a soft bandwidth requirement, which can be lower + * than the actual utilization because of uclamp_max requirements. */ - if (util + dl_util >= max) - return max; + if (max) + *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); - /* - * OTOH, for energy computation we need the estimated running time, so - * include util_dl and ignore dl_bw. - */ - if (type == ENERGY_UTIL) - util += dl_util; + if (util >= scale) + return scale; /* * There is still idle time; further improve the number by using the @@ -7535,28 +7534,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, * U' = irq + --------- * U * max */ - util = scale_irq_capacity(util, irq, max); + util = scale_irq_capacity(util, irq, scale); util += irq; - /* - * Bandwidth required by DEADLINE must always be granted while, for - * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism - * to gracefully reduce the frequency when no tasks show up for longer - * periods of time. - * - * Ideally we would like to set bw_dl as min/guaranteed freq and util + - * bw_dl as requested freq. However, cpufreq is not yet ready for such - * an interface. So, we only do the latter for now. - */ - if (type == FREQUENCY_UTIL) - util += cpu_bw_dl(rq); - - return min(max, util); + return min(scale, util); } unsigned long sched_cpu_util(int cpu) { - return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL); + return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); } #endif /* CONFIG_SMP */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5888176354e2..f3acf2cf26ed 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -47,7 +47,7 @@ struct sugov_cpu { u64 last_update; unsigned long util; - unsigned long bw_dl; + unsigned long bw_min; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -143,7 +143,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; - util = map_util_perf(util); freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) @@ -153,14 +152,30 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long max) +{ + /* Add dvfs headroom to actual utilization */ + actual = map_util_perf(actual); + /* Actually we don't need to target the max performance */ + if (actual < max) + max = actual; + + /* + * Ensure at least minimum performance while providing more compute + * capacity when possible. 
+ */ + return max(min, max); +} + static void sugov_get_util(struct sugov_cpu *sg_cpu) { - unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu); - struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); - sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util, - FREQUENCY_UTIL, NULL); + util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); + sg_cpu->bw_min = min; + sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max); } /** @@ -306,7 +321,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } */ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min) sg_cpu->sg_policy->limits_changed = true; } @@ -407,8 +422,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; - cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), - map_util_perf(sg_cpu->util), max_cap); + cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, + sg_cpu->util, max_cap); sg_cpu->sg_policy->last_freq_update_time = time; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 53dea95ad8c9..34fe6e9490c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7793,7 +7793,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv, for_each_cpu(cpu, pd_cpus) { unsigned long util = cpu_util(cpu, p, -1, 0); - busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); + busy_time += effective_cpu_util(cpu, util, NULL, NULL); } eenv->pd_busy_time = min(eenv->pd_cap, busy_time); @@ -7816,7 +7816,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, for_each_cpu(cpu, pd_cpus) { struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; unsigned long util = cpu_util(cpu, p, dst_cpu, 1); - unsigned long eff_util; + unsigned long eff_util, min, max; /* * Performance domain frequency: utilization clamping @@ -7825,7 +7825,23 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); + eff_util = effective_cpu_util(cpu, util, &min, &max); + + /* Task's uclamp can modify min and max value */ + if (tsk && uclamp_is_used()) { + min = max(min, uclamp_eff_value(p, UCLAMP_MIN)); + + /* + * If there is no active max uclamp constraint, + * directly use task's one, otherwise keep max. + */ + if (uclamp_rq_is_idle(cpu_rq(cpu))) + max = uclamp_eff_value(p, UCLAMP_MAX); + else + max = max(max, uclamp_eff_value(p, UCLAMP_MAX)); + } + + eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max); max_util = max(max_util, eff_util); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8a70d51ffa33..c1574cd388e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2994,24 +2994,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_SMP -/** - * enum cpu_util_type - CPU utilization type - * @FREQUENCY_UTIL: Utilization used to select frequency - * @ENERGY_UTIL: Utilization used during energy calculation - * - * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time - * need to be aggregated differently depending on the usage made of them. 
This - * enum is used within effective_cpu_util() to differentiate the types of - * utilization expected by the callers, and adjust the aggregation accordingly. - */ -enum cpu_util_type { - FREQUENCY_UTIL, - ENERGY_UTIL, -}; - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - enum cpu_util_type type, - struct task_struct *p); + unsigned long *min, + unsigned long *max); + +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long max); + /* * Verify the fitness of task @p to run on @cpu taking into account the -- cgit From f12560779f9d734446508f3df17f5632e9aaa2c8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 22 Nov 2023 14:39:04 +0100 Subject: sched/cpufreq: Rework iowait boost Use the max value that has already been computed inside sugov_get_util() to cap the iowait boost and remove dependency with uclamp_rq_util_with() which is not used anymore. Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20231122133904.446032-3-vincent.guittot@linaro.org --- kernel/sched/cpufreq_schedutil.c | 29 ++++++++++--------- kernel/sched/sched.h | 60 ---------------------------------------- 2 files changed, 14 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index f3acf2cf26ed..4ee8ad70be99 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -169,11 +169,12 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, return max(min, max); } -static void sugov_get_util(struct sugov_cpu *sg_cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) { unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); + util = max(util, boost); sg_cpu->bw_min = min; sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max); } @@ -266,18 +267,16 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, +static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, unsigned long max_cap) { - unsigned long boost; - /* No boost currently required */ if (!sg_cpu->iowait_boost) - return; + return 0; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return; + return 0; if (!sg_cpu->iowait_boost_pending) { /* @@ -286,7 +285,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, sg_cpu->iowait_boost >>= 1; if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; - return; + return 0; } } @@ -296,10 +295,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. 
*/ - boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; - boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); - if (sg_cpu->util < boost) - sg_cpu->util = boost; + return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; } #ifdef CONFIG_NO_HZ_COMMON @@ -329,6 +325,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, u64 time, unsigned long max_cap, unsigned int flags) { + unsigned long boost; + sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -337,8 +335,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, if (!sugov_should_update_freq(sg_cpu->sg_policy, time)) return false; - sugov_get_util(sg_cpu); - sugov_iowait_apply(sg_cpu, time, max_cap); + boost = sugov_iowait_apply(sg_cpu, time, max_cap); + sugov_get_util(sg_cpu, boost); return true; } @@ -439,9 +437,10 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); + unsigned long boost; - sugov_get_util(j_sg_cpu); - sugov_iowait_apply(j_sg_cpu, time, max_cap); + boost = sugov_iowait_apply(j_sg_cpu, time, max_cap); + sugov_get_util(j_sg_cpu, boost); util = max(j_sg_cpu->util, util); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c1574cd388e7..e58a54bda77d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3058,59 +3058,6 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) return rq->uclamp_flags & UCLAMP_FLAG_IDLE; } -/** - * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. - * @rq: The rq to clamp against. Must not be NULL. - * @util: The util value to clamp. - * @p: The task to clamp against. Can be NULL if you want to clamp - * against @rq only. - * - * Clamps the passed @util to the max(@rq, @p) effective uclamp values. - * - * If sched_uclamp_used static key is disabled, then just return the util - * without any clamping since uclamp aggregation at the rq level in the fast - * path is disabled, rendering this operation a NOP. - * - * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It - * will return the correct effective uclamp value of the task even if the - * static key is disabled. - */ -static __always_inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - unsigned long min_util = 0; - unsigned long max_util = 0; - - if (!static_branch_likely(&sched_uclamp_used)) - return util; - - if (p) { - min_util = uclamp_eff_value(p, UCLAMP_MIN); - max_util = uclamp_eff_value(p, UCLAMP_MAX); - - /* - * Ignore last runnable task's max clamp, as this task will - * reset it. Similarly, no need to read the rq's min clamp. - */ - if (uclamp_rq_is_idle(rq)) - goto out; - } - - min_util = max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN)); - max_util = max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX)); -out: - /* - * Since CPU's {min,max}_util clamps are MAX aggregated considering - * RUNNABLE tasks with _different_ clamps, we can end up with an - * inversion. Fix it now when the clamps are applied. - */ - if (unlikely(min_util >= max_util)) - return min_util; - - return clamp(util, min_util, max_util); -} - /* Is the rq being capped/throttled by uclamp_max? 
*/ static inline bool uclamp_rq_is_capped(struct rq *rq) { @@ -3148,13 +3095,6 @@ static inline unsigned long uclamp_eff_value(struct task_struct *p, return SCHED_CAPACITY_SCALE; } -static inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - return util; -} - static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } static inline bool uclamp_is_used(void) -- cgit From 388a1fb7da6aaa1970c7e2a7d7fcd983a87a8484 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Nov 2023 11:07:56 +0100 Subject: perf: Fix the nr_addr_filters fix Thomas reported that commit 652ffc2104ec ("perf/core: Fix narrow startup race when creating the perf nr_addr_filters sysfs file") made the entire attribute group vanish, instead of only the nr_addr_filters attribute. Additionally, a stray return statement was left behind. Insufficient coffee was involved with both writing and merging the patch. Fixes: 652ffc2104ec ("perf/core: Fix narrow startup race when creating the perf nr_addr_filters sysfs file") Reported-by: Thomas Richter Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Richter Link: https://lkml.kernel.org/r/20231122100756.GP8262@noisy.programming.kicks-ass.net --- kernel/events/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4f0c45ab8d7d..59b332cce9e7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11417,12 +11417,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int struct device *dev = kobj_to_dev(kobj); struct pmu *pmu = dev_get_drvdata(dev); - if (!pmu->nr_addr_filters) + if (n == 2 && !pmu->nr_addr_filters) return 0; return a->mode; - - return 0; } static struct attribute_group pmu_dev_attr_group = { -- cgit From 491dd8edecbc5027ee317f3f1e7e9800fb66d88f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 23 Nov 2023 19:59:35 -0800 Subject: bpf: Emit global subprog name in verifier logs We have the name, so instead of emitting just func#N to identify a global subprog, augment the verifier log messages with the actual function name to make them more user-friendly.
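The name lookup itself is a small helper (sketch of the one added by the patch below):

	static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
	{
		struct bpf_func_info *info;

		/* Without BTF func info there is no name to resolve. */
		if (!env->prog->aux->func_info)
			return "";

		info = &env->prog->aux->func_info[subprog];
		return btf_type_name(env->prog->aux->btf, info->type_id);
	}

With it, a message like "Caller passes invalid args into func#3" becomes "Caller passes invalid args into func#3 ('global_helper')" (the subprog number and name here are made up for illustration).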
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20231124035937.403208-2-andrii@kernel.org --- kernel/bpf/verifier.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 405da1f9e724..a2939ebf2638 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -339,6 +339,11 @@ struct bpf_kfunc_call_arg_meta { struct btf *btf_vmlinux; +static const char *btf_type_name(const struct btf *btf, u32 id) +{ + return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); +} + static DEFINE_MUTEX(bpf_verifier_lock); static DEFINE_MUTEX(bpf_percpu_ma_lock); @@ -418,6 +423,17 @@ static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL; } +static const char *subprog_name(const struct bpf_verifier_env *env, int subprog) +{ + struct bpf_func_info *info; + + if (!env->prog->aux->func_info) + return ""; + + info = &env->prog->aux->func_info[subprog]; + return btf_type_name(env->prog->aux->btf, info->type_id); +} + static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); @@ -587,11 +603,6 @@ static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, return stack_slot_obj_get_spi(env, reg, "iter", nr_slots); } -static const char *btf_type_name(const struct btf *btf, u32 id) -{ - return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); -} - static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) { switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { @@ -9269,13 +9280,16 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (err == -EFAULT) return err; if (subprog_is_global(env, subprog)) { + const char *sub_name = subprog_name(env, subprog); + if (err) { - verbose(env, "Caller passes invalid args into func#%d\n", subprog); + verbose(env, "Caller passes invalid args into func#%d ('%s')\n", + subprog, sub_name); return err; } - if (env->log.level & BPF_LOG_LEVEL) - verbose(env, "Func#%d is global and valid. Skipping.\n", subprog); + verbose(env, "Func#%d ('%s') is global and assumed valid.\n", + subprog, sub_name); clear_caller_saved_regs(env, caller->regs); /* All global functions return a 64-bit SCALAR_VALUE */ @@ -19893,9 +19907,8 @@ static int do_check_subprogs(struct bpf_verifier_env *env) if (ret) { return ret; } else if (env->log.level & BPF_LOG_LEVEL) { - verbose(env, - "Func#%d is safe for any args that match its prototype\n", - i); + verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n", + i, subprog_name(env, i)); } } return 0; -- cgit From 2afae08c9dcb8ac648414277cec70c2fe6a34d9e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 23 Nov 2023 19:59:36 -0800 Subject: bpf: Validate global subprogs lazily Slightly change BPF verifier logic around eagerness and order of global subprog validation. Instead of going over every global subprog eagerly and validating it before main (entry) BPF program is verified, turn it around. Validate main program first, mark subprogs that were called from main program for later verification, but otherwise assume it is valid. Afterwards, go over marked global subprogs and validate those, potentially marking some more global functions as being called. 
Continue this process until all (transitively) callable global subprogs are validated. It's a BFS traversal at its heart and will always converge. This is an important change because it allows to feature-gate some subprograms that might not be verifiable on some older kernel, depending on supported set of features. E.g., at some point, global functions were allowed to accept a pointer to memory, which size is identified by user-provided type. Unfortunately, older kernels don't support this feature. With BPF CO-RE approach, the natural way would be to still compile BPF object file once and guard calls to this global subprog with some CO-RE check or using .rodata variables. That's what people do to guard usage of new helpers or kfuncs, and any other new BPF-side feature that might be missing on old kernels. That's currently impossible to do with global subprogs, unfortunately, because they are eagerly and unconditionally validated. This patch set aims to change this, so that in the future when global funcs gain new features, those can be guarded using BPF CO-RE techniques in the same fashion as any other new kernel feature. Two selftests had to be adjusted in sync with these changes. test_global_func12 relied on eager global subprog validation failing before main program failure is detected (unknown return value). Fix by making sure that main program is always valid. verifier_subprog_precision's parent_stack_slot_precise subtest relied on verifier checkpointing heuristic to do a checkpoint at instruction #5, but that's no longer true because we don't have enough jumps validated before reaching insn #5 due to global subprogs being validated later. Other than that, no changes, as one would expect. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20231124035937.403208-3-andrii@kernel.org --- include/linux/bpf.h | 2 + kernel/bpf/verifier.c | 48 +++++++++++++++++++--- .../selftests/bpf/progs/test_global_func12.c | 4 +- .../bpf/progs/verifier_subprog_precision.c | 4 +- 4 files changed, 48 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 258ba232e302..eb447b0a9423 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1347,6 +1347,8 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) struct bpf_func_info_aux { u16 linkage; bool unreliable; + bool called : 1; + bool verified : 1; }; enum bpf_jit_poke_reason { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a2939ebf2638..8e7b6072e3f4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -434,6 +434,11 @@ static const char *subprog_name(const struct bpf_verifier_env *env, int subprog) return btf_type_name(env->prog->aux->btf, info->type_id); } +static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env, int subprog) +{ + return &env->prog->aux->func_info_aux[subprog]; +} + static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); @@ -9290,6 +9295,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, verbose(env, "Func#%d ('%s') is global and assumed valid.\n", subprog, sub_name); + /* mark global subprog for verifying after main prog */ + subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); /* All global functions return a 64-bit SCALAR_VALUE */ @@ -19873,8 +19880,11 
@@ out: return ret; } -/* Verify all global functions in a BPF program one by one based on their BTF. - * All global functions must pass verification. Otherwise the whole program is rejected. +/* Lazily verify all global functions based on their BTF, if they are called + * from main BPF program or any of subprograms transitively. + * BPF global subprogs called from dead code are not validated. + * All callable global functions must pass verification. + * Otherwise the whole program is rejected. * Consider: * int bar(int); * int foo(int f) @@ -19893,14 +19903,26 @@ out: static int do_check_subprogs(struct bpf_verifier_env *env) { struct bpf_prog_aux *aux = env->prog->aux; - int i, ret; + struct bpf_func_info_aux *sub_aux; + int i, ret, new_cnt; if (!aux->func_info) return 0; + /* exception callback is presumed to be always called */ + if (env->exception_callback_subprog) + subprog_aux(env, env->exception_callback_subprog)->called = true; + +again: + new_cnt = 0; for (i = 1; i < env->subprog_cnt; i++) { - if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL) + if (!subprog_is_global(env, i)) + continue; + + sub_aux = subprog_aux(env, i); + if (!sub_aux->called || sub_aux->verified) continue; + env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); ret = do_check_common(env, i, env->exception_callback_subprog == i); @@ -19910,7 +19932,21 @@ static int do_check_subprogs(struct bpf_verifier_env *env) verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n", i, subprog_name(env, i)); } + + /* We verified new global subprog, it might have called some + * more global subprogs that we haven't verified yet, so we + * need to do another pass over subprogs to verify those. + */ + sub_aux->verified = true; + new_cnt++; } + + /* We can't loop forever as we verify at least one global subprog on + * each pass. 
+ */ + if (new_cnt) + goto again; + return 0; } @@ -20556,8 +20592,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; - ret = do_check_subprogs(env); - ret = ret ?: do_check_main(env); + ret = do_check_main(env); + ret = ret ?: do_check_subprogs(env); if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux)) ret = bpf_prog_offload_finalize(env); diff --git a/tools/testing/selftests/bpf/progs/test_global_func12.c b/tools/testing/selftests/bpf/progs/test_global_func12.c index 7f159d83c6f6..6e03d42519a6 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func12.c +++ b/tools/testing/selftests/bpf/progs/test_global_func12.c @@ -19,5 +19,7 @@ int global_func12(struct __sk_buff *skb) { const struct S s = {.x = skb->len }; - return foo(&s); + foo(&s); + + return 1; } diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index f61d623b1ce8..b5efcaeaa1ae 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -370,12 +370,10 @@ __naked int parent_stack_slot_precise(void) SEC("?raw_tp") __success __log_level(2) __msg("9: (0f) r1 += r6") -__msg("mark_precise: frame0: last_idx 9 first_idx 6") +__msg("mark_precise: frame0: last_idx 9 first_idx 0") __msg("mark_precise: frame0: regs=r6 stack= before 8: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 7: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 6: (79) r6 = *(u64 *)(r10 -8)") -__msg("mark_precise: frame0: parent state regs= stack=-8:") -__msg("mark_precise: frame0: last_idx 5 first_idx 0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (85) call pc+6") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r1 = 0") __msg("mark_precise: frame0: regs= stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r6") -- cgit From 75a442581d05edaee168222ffbe00d4389785636 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Sat, 11 Nov 2023 12:38:21 +0800 Subject: bpf: Add missed allocation hint for bpf_mem_cache_alloc_flags() bpf_mem_cache_alloc_flags() may call __alloc() directly when there is no free object in free list, but it doesn't initialize the allocation hint for the returned pointer. It may lead to bad memory dereference when freeing the pointer, so fix it by initializing the allocation hint. Fixes: 822fb26bdb55 ("bpf: Add a hint to allocated objects.") Signed-off-by: Hou Tao Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231111043821.2258513-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 63b909d277d4..6a51cfe4c2d6 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -978,6 +978,8 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) memcg = get_memcg(c); old_memcg = set_active_memcg(memcg); ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT); + if (ret) + *(struct bpf_mem_cache **)ret = c; set_active_memcg(old_memcg); mem_cgroup_put(memcg); } -- cgit From 783822e44594639848b78d4bb61dde26fba04e05 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:44:39 +0100 Subject: mnt_idmapping: decouple from namespaces There's no reason we need to couple mnt idmapping to namespaces in the way we currently do. 
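Concretely, after this change struct mnt_idmap carries its own copies of the ID maps instead of a pointer to an owning namespace (struct taken from the patch below, comment added):

	struct mnt_idmap {
		struct uid_gid_map uid_map;	/* copied from the mount's userns */
		struct uid_gid_map gid_map;
		refcount_t count;
	};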
Copy the idmapping when an idmapped mount is created and don't take any reference on the namespace at all. We also can't easily refcount struct uid_gid_map because it needs to stay the size of a cacheline, otherwise we risk performance regressions (ignoring for a second that right now struct uid_gid_map isn't actually 64 bytes but 72; that's a fix for another patch series). Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-3-dae4abdde5bd@kernel.org Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- fs/mnt_idmapping.c | 106 +++++++++++++++++++++++++++++++++++++++++------- include/linux/uidgid.h | 13 ++++++ kernel/user_namespace.c | 4 +- 3 files changed, 106 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 35d78cb3c38a..64c5205e2b5e 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -9,8 +9,16 @@ #include "internal.h" +/* + * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t, + * never from raw values. These are just internal helpers. + */ +#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val } +#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val } + struct mnt_idmap { - struct user_namespace *owner; + struct uid_gid_map uid_map; + struct uid_gid_map gid_map; refcount_t count; }; @@ -20,7 +28,6 @@ struct mnt_idmap { * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. */ struct mnt_idmap nop_mnt_idmap = { - .owner = &init_user_ns, .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(nop_mnt_idmap); @@ -65,7 +72,6 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, kuid_t kuid) { uid_t uid; - struct user_namespace *mnt_userns = idmap->owner; if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); @@ -75,7 +81,7 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, uid = from_kuid(fs_userns, kuid); if (uid == (uid_t)-1) return INVALID_VFSUID; - return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); + return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid)); } EXPORT_SYMBOL_GPL(make_vfsuid); @@ -103,7 +109,6 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid) { gid_t gid; - struct user_namespace *mnt_userns = idmap->owner; if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); @@ -113,7 +118,7 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, gid = from_kgid(fs_userns, kgid); if (gid == (gid_t)-1) return INVALID_VFSGID; - return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); + return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid)); } EXPORT_SYMBOL_GPL(make_vfsgid); @@ -132,11 +137,10 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { uid_t uid; - struct user_namespace *mnt_userns = idmap->owner; if (idmap == &nop_mnt_idmap) return AS_KUIDT(vfsuid); - uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); + uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid)); if (uid == (uid_t)-1) return INVALID_UID; if (initial_idmapping(fs_userns)) @@ -160,11 +164,10 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { gid_t gid; - struct user_namespace *mnt_userns = idmap->owner; if (idmap == &nop_mnt_idmap) return AS_KGIDT(vfsgid); - gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); + gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid)); if (gid == (gid_t)-1) return INVALID_GID; if (initial_idmapping(fs_userns)) @@ -195,16 +198,91 @@ int vfsgid_in_group_p(vfsgid_t vfsgid) #endif EXPORT_SYMBOL_GPL(vfsgid_in_group_p); +static int copy_mnt_idmap(struct uid_gid_map *map_from, + struct
uid_gid_map *map_to) +{ + struct uid_gid_extent *forward, *reverse; + u32 nr_extents = READ_ONCE(map_from->nr_extents); + /* Pairs with smp_wmb() when writing the idmapping. */ + smp_rmb(); + + /* + * Don't blindly copy @map_to into @map_from if nr_extents is + * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we + * read @nr_extents someone could have written an idmapping and + * then we might end up with inconsistent data. So just don't do + * anything at all. + */ + if (nr_extents == 0) + return 0; + + /* + * Here we know that nr_extents is greater than zero which means + * a map has been written. Since idmappings can't be changed + * once they have been written we know that we can safely copy + * from @map_to into @map_from. + */ + + if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { + *map_to = *map_from; + return 0; + } + + forward = kmemdup(map_from->forward, + nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); + if (!forward) + return -ENOMEM; + + reverse = kmemdup(map_from->reverse, + nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); + if (!reverse) { + kfree(forward); + return -ENOMEM; + } + + /* + * The idmapping isn't exposed anywhere so we don't need to care + * about ordering between extent pointers and @nr_extents + * initialization. + */ + map_to->forward = forward; + map_to->reverse = reverse; + map_to->nr_extents = nr_extents; + return 0; +} + +static void free_mnt_idmap(struct mnt_idmap *idmap) +{ + if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(idmap->uid_map.forward); + kfree(idmap->uid_map.reverse); + } + if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(idmap->gid_map.forward); + kfree(idmap->gid_map.reverse); + } + kfree(idmap); +} + struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) { struct mnt_idmap *idmap; + int ret; idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); if (!idmap) return ERR_PTR(-ENOMEM); - idmap->owner = get_user_ns(mnt_userns); refcount_set(&idmap->count, 1); + ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map); + if (!ret) + ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map); + if (ret) { + free_mnt_idmap(idmap); + idmap = ERR_PTR(ret); + } return idmap; } @@ -234,9 +312,7 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get); */ void mnt_idmap_put(struct mnt_idmap *idmap) { - if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { - put_user_ns(idmap->owner); - kfree(idmap); - } + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) + free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index b0542cd11aeb..415a7ca2b882 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -17,6 +17,7 @@ struct user_namespace; extern struct user_namespace init_user_ns; +struct uid_gid_map; typedef struct { uid_t val; @@ -138,6 +139,9 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) return from_kgid(ns, gid) != (gid_t) -1; } +u32 map_id_down(struct uid_gid_map *map, u32 id); +u32 map_id_up(struct uid_gid_map *map, u32 id); + #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) @@ -186,6 +190,15 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) return gid_valid(gid); } +static inline u32 map_id_down(struct uid_gid_map *map, u32 id) +{ + return id; +} + +static inline u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + return id; +} #endif /* CONFIG_USER_NS */ 
#endif /* _LINUX_UIDGID_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 625101249e4d..ce4d99df5f0e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -332,7 +332,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) return id; } -static u32 map_id_down(struct uid_gid_map *map, u32 id) +u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } @@ -375,7 +375,7 @@ map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) sizeof(struct uid_gid_extent), cmp_map_id); } -static u32 map_id_up(struct uid_gid_map *map, u32 id) +u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; -- cgit From 877c737db9355acaa1ec2fd2b8dbdaff82605df7 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 27 Nov 2023 14:51:05 -0500 Subject: cgroup/cpuset: Expose cpuset.cpus.isolated The root-only cpuset.cpus.isolated control file shows the current set of isolated CPUs in isolated partitions. This control file is currently exposed only with the cgroup_debug boot command line option which also adds the ".__DEBUG__." prefix. This is actually a useful control file if users want to find out which CPUs are currently in an isolated state by the cpuset controller. Remove CFTYPE_DEBUG flag for this control file and make it available by default without any prefix. The test_cpuset_prs.sh test script and the cgroup-v2.rst documentation file are also updated accordingly. Minor code change is also made in test_cpuset_prs.sh to avoid false test failure when running on debug kernel. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 7 +++++ kernel/cgroup/cpuset.c | 2 +- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 32 +++++++++++++---------- 3 files changed, 26 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index cf5651a11df8..30f6ff2eba47 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2316,6 +2316,13 @@ Cpuset Interface Files treated to have an implicit value of "cpuset.cpus" in the formation of local partition. + cpuset.cpus.isolated + A read-only and root cgroup only multiple values file. + + This file shows the set of all isolated CPUs used in existing + isolated partitions. It will be empty if no isolated partition + is created. + cpuset.cpus.partition A read-write single value file which exists on non-root cpuset-enabled cgroups. 
This flag is owned by the parent cgroup diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1bad4007ff4b..2a16df86c55c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3974,7 +3974,7 @@ static struct cftype dfl_files[] = { .name = "cpus.isolated", .seq_show = cpuset_common_seq_show, .private = FILE_ISOLATED_CPULIST, - .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, + .flags = CFTYPE_ONLY_ON_ROOT, }, { } /* terminate */ diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 7b7c4c2b6d85..b5eb1be2248c 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -508,7 +508,7 @@ dump_states() XECPUS=$DIR/cpuset.cpus.exclusive.effective PRS=$DIR/cpuset.cpus.partition PCPUS=$DIR/.__DEBUG__.cpuset.cpus.subpartitions - ISCPUS=$DIR/.__DEBUG__.cpuset.cpus.isolated + ISCPUS=$DIR/cpuset.cpus.isolated [[ -e $CPUS ]] && echo "$CPUS: $(cat $CPUS)" [[ -e $XCPUS ]] && echo "$XCPUS: $(cat $XCPUS)" [[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)" @@ -593,17 +593,17 @@ check_cgroup_states() # # Get isolated (including offline) CPUs by looking at -# /sys/kernel/debug/sched/domains and *cpuset.cpus.isolated control file, +# /sys/kernel/debug/sched/domains and cpuset.cpus.isolated control file, # if available, and compare that with the expected value. # # Note that isolated CPUs from the sched/domains context include offline # CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may -# not be included in the *cpuset.cpus.isolated control file which contains +# not be included in the cpuset.cpus.isolated control file which contains # only CPUs in isolated partitions. # # $1 - expected isolated cpu list(s) {,} # - expected sched/domains value -# - *cpuset.cpus.isolated value = if not defined +# - cpuset.cpus.isolated value = if not defined # check_isolcpus() { @@ -611,7 +611,7 @@ check_isolcpus() ISOLCPUS= LASTISOLCPU= SCHED_DOMAINS=/sys/kernel/debug/sched/domains - ISCPUS=${CGROUP2}/.__DEBUG__.cpuset.cpus.isolated + ISCPUS=${CGROUP2}/cpuset.cpus.isolated if [[ $EXPECT_VAL = . ]] then EXPECT_VAL= @@ -692,14 +692,18 @@ test_fail() null_isolcpus_check() { [[ $VERBOSE -gt 0 ]] || return 0 - pause 0.02 - check_isolcpus "." - if [[ $? -ne 0 ]] - then - echo "Unexpected isolated CPUs: $ISOLCPUS" - dump_states - exit 1 - fi + # Retry a few times before printing error + RETRY=0 + while [[ $RETRY -lt 5 ]] + do + pause 0.01 + check_isolcpus "." + [[ $? -eq 0 ]] && return 0 + ((RETRY++)) + done + echo "Unexpected isolated CPUs: $ISOLCPUS" + dump_states + exit 1 } # @@ -776,7 +780,7 @@ run_state_test() # NEWLIST=$(cat cpuset.cpus.effective) RETRY=0 - while [[ $NEWLIST != $CPULIST && $RETRY -lt 5 ]] + while [[ $NEWLIST != $CPULIST && $RETRY -lt 8 ]] do # Wait a bit longer & recheck a few times pause 0.01 -- cgit From cff5f49d433fcd0063c8be7dd08fa5bf190c6c37 Mon Sep 17 00:00:00 2001 From: Tim Van Patten Date: Wed, 15 Nov 2023 09:20:43 -0700 Subject: cgroup_freezer: cgroup_freezing: Check if not frozen __thaw_task() was recently updated to warn if the task being thawed was part of a freezer cgroup that is still currently freezing: void __thaw_task(struct task_struct *p) { ... if (WARN_ON_ONCE(freezing(p))) goto unlock; This has exposed a bug in cgroup1 freezing where when CGROUP_FROZEN is asserted, the CGROUP_FREEZING bits are not also cleared at the same time. Meaning, when a cgroup is marked FROZEN it continues to be marked FREEZING as well. 
This causes the WARNING to trigger, because cgroup_freezing() thinks the cgroup is still freezing. There are two ways to fix this: 1. Whenever FROZEN is set, clear FREEZING for the cgroup and all child cgroups. 2. Update cgroup_freezing() to also verify that FROZEN is not set. This patch implements option (2), since it's smaller and more straightforward. Signed-off-by: Tim Van Patten Tested-by: Mark Hasemeyer Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Cc: stable@vger.kernel.org # v6.1+ Signed-off-by: Tejun Heo --- kernel/cgroup/legacy_freezer.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 122dacb3a443..66d1708042a7 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -66,9 +66,15 @@ static struct freezer *parent_freezer(struct freezer *freezer) bool cgroup_freezing(struct task_struct *task) { bool ret; + unsigned int state; rcu_read_lock(); - ret = task_freezer(task)->state & CGROUP_FREEZING; + /* Check if the cgroup is still FREEZING, but not FROZEN. The extra + * !FROZEN check is required, because the FREEZING bit is not cleared + * when the state FROZEN is reached. + */ + state = task_freezer(task)->state; + ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN); rcu_read_unlock(); return ret; -- cgit From 4930b7f53a298533bc31d7540b6ea8b79a000331 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 25 Nov 2023 20:31:26 +0100 Subject: bpf: Store ref_ctr_offsets values in bpf_uprobe array We will need to return the ref_ctr_offsets values through the link_info interface in a following change, so we need to keep them around. Store the ref_ctr_offsets values directly in the bpf_uprobe array.
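The resulting per-uprobe bookkeeping looks like this (struct taken from the patch below, comment added):

	struct bpf_uprobe {
		struct bpf_uprobe_multi_link *link;
		loff_t offset;
		unsigned long ref_ctr_offset;	/* kept so link_info can report it */
		u64 cookie;
		struct uprobe_consumer consumer;
	};

This also removes the need for the separate, temporary ref_ctr_offsets array in bpf_uprobe_multi_link_attach().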
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20231125193130.834322-3-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f0b8b7c29126..ad0323f27288 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3033,6 +3033,7 @@ struct bpf_uprobe_multi_link; struct bpf_uprobe { struct bpf_uprobe_multi_link *link; loff_t offset; + unsigned long ref_ctr_offset; u64 cookie; struct uprobe_consumer consumer; }; @@ -3172,7 +3173,6 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr { struct bpf_uprobe_multi_link *link = NULL; unsigned long __user *uref_ctr_offsets; - unsigned long *ref_ctr_offsets = NULL; struct bpf_link_primer link_primer; struct bpf_uprobe *uprobes = NULL; struct task_struct *task = NULL; @@ -3245,18 +3245,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!uprobes || !link) goto error_free; - if (uref_ctr_offsets) { - ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL); - if (!ref_ctr_offsets) - goto error_free; - } - for (i = 0; i < cnt; i++) { if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { err = -EFAULT; goto error_free; } - if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) { + if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) { err = -EFAULT; goto error_free; } @@ -3287,7 +3281,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr for (i = 0; i < cnt; i++) { err = uprobe_register_refctr(d_real_inode(link->path.dentry), uprobes[i].offset, - ref_ctr_offsets ? ref_ctr_offsets[i] : 0, + uprobes[i].ref_ctr_offset, &uprobes[i].consumer); if (err) { bpf_uprobe_unregister(&path, uprobes, i); @@ -3299,11 +3293,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (err) goto error_free; - kvfree(ref_ctr_offsets); return bpf_link_settle(&link_primer); error_free: - kvfree(ref_ctr_offsets); kvfree(uprobes); kfree(link); if (task) -- cgit From e56fdbfb06e26a7066b070967badef4148528df2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 25 Nov 2023 20:31:27 +0100 Subject: bpf: Add link_info support for uprobe multi link Adding support to get uprobe_link details through bpf_link_info interface. Adding new struct uprobe_multi to struct bpf_link_info to carry the uprobe_multi link details. The uprobe_multi.count is passed from user space to denote size of array fields (offsets/ref_ctr_offsets/cookies). The actual array size is stored back to uprobe_multi.count (allowing user to find out the actual array size) and array fields are populated up to the user passed size. All the non-array fields (path/count/flags/pid) are always set. 
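From user space this is typically consumed in two steps: one call with no buffers to learn the count, then a second call with buffers of that size. An illustrative sketch using libbpf's bpf_link_get_info_by_fd() (link_fd is assumed to be an attached uprobe-multi link; error handling elided):

	struct bpf_link_info info = {};
	__u32 len = sizeof(info);
	char path[PATH_MAX];

	/* First call: count, flags and pid are always filled in. */
	bpf_link_get_info_by_fd(link_fd, &info, &len);

	__u64 *offsets = calloc(info.uprobe_multi.count, sizeof(*offsets));

	/* Second call: count is already set from the first call; point the
	 * array and path fields at our buffers. The kernel clamps path_size
	 * to PATH_MAX and writes back the real size including the zero byte. */
	info.uprobe_multi.offsets = (__u64)(uintptr_t)offsets;
	info.uprobe_multi.path = (__u64)(uintptr_t)path;
	info.uprobe_multi.path_size = sizeof(path);
	bpf_link_get_info_by_fd(link_fd, &info, &len);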
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20231125193130.834322-4-jolsa@kernel.org --- include/uapi/linux/bpf.h | 10 ++++++ kernel/trace/bpf_trace.c | 72 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 10 ++++++ 3 files changed, 92 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7a5498242eaa..e88746ba7d21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6562,6 +6562,16 @@ struct bpf_link_info { __u32 flags; __u64 missed; } kprobe_multi; + struct { + __aligned_u64 path; + __aligned_u64 offsets; + __aligned_u64 ref_ctr_offsets; + __aligned_u64 cookies; + __u32 path_size; /* in/out: real path size on success, including zero byte */ + __u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */ + __u32 flags; + __u32 pid; + } uprobe_multi; struct { __u32 type; /* enum bpf_perf_event_type */ __u32 :32; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ad0323f27288..c284a4ad0315 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3042,6 +3042,7 @@ struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; + u32 flags; struct bpf_uprobe *uprobes; struct task_struct *task; }; @@ -3083,9 +3084,79 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) kfree(umulti_link); } +static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets); + u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies); + u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets); + u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path); + u32 upath_size = info->uprobe_multi.path_size; + struct bpf_uprobe_multi_link *umulti_link; + u32 ucount = info->uprobe_multi.count; + int err = 0, i; + long left; + + if (!upath ^ !upath_size) + return -EINVAL; + + if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount) + return -EINVAL; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + info->uprobe_multi.count = umulti_link->cnt; + info->uprobe_multi.flags = umulti_link->flags; + info->uprobe_multi.pid = umulti_link->task ? 
+ task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; + + if (upath) { + char *p, *buf; + + upath_size = min_t(u32, upath_size, PATH_MAX); + + buf = kmalloc(upath_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = d_path(&umulti_link->path, buf, upath_size); + if (IS_ERR(p)) { + kfree(buf); + return PTR_ERR(p); + } + upath_size = buf + upath_size - p; + left = copy_to_user(upath, p, upath_size); + kfree(buf); + if (left) + return -EFAULT; + info->uprobe_multi.path_size = upath_size; + } + + if (!uoffsets && !ucookies && !uref_ctr_offsets) + return 0; + + if (ucount < umulti_link->cnt) + err = -ENOSPC; + else + ucount = umulti_link->cnt; + + for (i = 0; i < ucount; i++) { + if (uoffsets && + put_user(umulti_link->uprobes[i].offset, uoffsets + i)) + return -EFAULT; + if (uref_ctr_offsets && + put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) + return -EFAULT; + if (ucookies && + put_user(umulti_link->uprobes[i].cookie, ucookies + i)) + return -EFAULT; + } + + return err; +} + static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, .dealloc = bpf_uprobe_multi_link_dealloc, + .fill_link_info = bpf_uprobe_multi_link_fill_link_info, }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, @@ -3274,6 +3345,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->uprobes = uprobes; link->path = path; link->task = task; + link->flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, &bpf_uprobe_multi_link_lops, prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7a5498242eaa..e88746ba7d21 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6562,6 +6562,16 @@ struct bpf_link_info { __u32 flags; __u64 missed; } kprobe_multi; + struct { + __aligned_u64 path; + __aligned_u64 offsets; + __aligned_u64 ref_ctr_offsets; + __aligned_u64 cookies; + __u32 path_size; /* in/out: real path size on success, including zero byte */ + __u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */ + __u32 flags; + __u32 pid; + } uprobe_multi; struct { __u32 type; /* enum bpf_perf_event_type */ __u32 :32; -- cgit From 23ab79e8e469e2605beec2e3ccb40d19c68dd2e0 Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Mon, 20 Nov 2023 09:36:31 -0800 Subject: freezer,sched: Do not restore saved_state of a thawed task It is possible for a task to be thawed multiple times when mixing the *legacy* cgroup freezer and the system-wide freezer. To do this, freeze the cgroup, do a system-wide freeze/thaw, then thaw the cgroup. When this happens, a stale saved_state can be written to the task's state and cause the task to hang indefinitely. Fix this by only trying to thaw tasks that are actually frozen. This change also has the marginal benefit of avoiding an unnecessary wake_up_state(p, TASK_FROZEN) if we know the task is already thawed. There is no possibility of a time-of-compare/time-of-use race when we skip the wake_up_state(), because entering/exiting TASK_FROZEN is guarded by freezer_lock.
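For illustration, the resulting __thaw_task() looks roughly as follows (a sketch reconstructed from the context around the one-line change, not a verbatim copy of kernel/freezer.c):

    void __thaw_task(struct task_struct *p)
    {
            unsigned long flags;

            spin_lock_irqsave(&freezer_lock, flags);
            if (WARN_ON_ONCE(freezing(p)))
                    goto unlock;

            /* Bail out for tasks that were never frozen: nothing to restore,
             * and skipping wake_up_state() avoids writing back a stale
             * saved_state. frozen(p) is stable here because entering and
             * leaving TASK_FROZEN are guarded by freezer_lock. */
            if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL))
                    goto unlock;

            wake_up_state(p, TASK_FROZEN);
    unlock:
            spin_unlock_irqrestore(&freezer_lock, flags);
    }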
Fixes: 8f0eed4a78a8 ("freezer,sched: Use saved_state to reduce some spurious wakeups") Signed-off-by: Elliot Berman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Abhijeet Dharmapurikar Link: https://lore.kernel.org/r/20231120-freezer-state-multiple-thaws-v1-1-f2e1dd7ce5a2@quicinc.com --- kernel/freezer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index c450fa8b8b5e..759006a9a910 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -201,7 +201,7 @@ void __thaw_task(struct task_struct *p) if (WARN_ON_ONCE(freezing(p))) goto unlock; - if (task_call_func(p, __restore_freezer_state, NULL)) + if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL)) goto unlock; wake_up_state(p, TASK_FROZEN); -- cgit From 382c27f4ed28f803b1f1473ac2d8db0afc795a1b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Nov 2023 15:24:52 +0100 Subject: perf: Fix perf_event_validate_size() Budimir noted that perf_event_validate_size() only checks the size of the newly added event, even though the sizes of all existing events can also change due to not all events having the same read_format. When we attach the new event, perf_group_attach(), we do re-compute the size for all events. Fixes: a723968c0ed3 ("perf: Fix u16 overflows") Reported-by: Budimir Markovic Signed-off-by: Peter Zijlstra (Intel) --- kernel/events/core.c | 61 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b704d83a28b2..c9d123e13b57 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1814,31 +1814,34 @@ static inline void perf_event__state_init(struct perf_event *event) PERF_EVENT_STATE_INACTIVE; } -static void __perf_event_read_size(struct perf_event *event, int nr_siblings) +static int __perf_event_read_size(u64 read_format, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_ID) + if (read_format & PERF_FORMAT_ID) entry += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_LOST) + if (read_format & PERF_FORMAT_LOST) entry += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_GROUP) { + if (read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } - size += entry * nr; - event->read_size = size; + /* + * Since perf_event_validate_size() limits this to 16k and inhibits + * adding more siblings, this will never overflow. 
+ */ + return size + nr * entry; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) @@ -1888,8 +1891,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) */ static void perf_event__header_size(struct perf_event *event) { - __perf_event_read_size(event, - event->group_leader->nr_siblings); + event->read_size = + __perf_event_read_size(event->attr.read_format, + event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } @@ -1920,24 +1924,35 @@ static void perf_event__id_header_size(struct perf_event *event) event->id_header_size = size; } +/* + * Check that adding an event to the group does not result in anybody + * overflowing the 64k event limit imposed by the output buffer. + * + * Specifically, check that the read_size for the event does not exceed 16k, + * read_size being the one term that grows with groups size. Since read_size + * depends on per-event read_format, also (re)check the existing events. + * + * This leaves 48k for the constant size fields and things like callchains, + * branch stacks and register sets. + */ static bool perf_event_validate_size(struct perf_event *event) { - /* - * The values computed here will be over-written when we actually - * attach the event. - */ - __perf_event_read_size(event, event->group_leader->nr_siblings + 1); - __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); - perf_event__id_header_size(event); + struct perf_event *sibling, *group_leader = event->group_leader; - /* - * Sum the lot; should not exceed the 64k limit we have on records. - * Conservative limit to allow for callchains and other variable fields. - */ - if (event->read_size + event->header_size + - event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) + if (__perf_event_read_size(event->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) return false; + if (__perf_event_read_size(group_leader->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; + + for_each_sibling_event(sibling, group_leader) { + if (__perf_event_read_size(sibling->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; + } + return true; } -- cgit From 5068d84054b766efe7c6202fc71b2350d1c326f1 Mon Sep 17 00:00:00 2001 From: Yiwei Lin Date: Fri, 17 Nov 2023 16:01:06 +0800 Subject: sched/fair: Update min_vruntime for reweight_entity() correctly Since reweight_entity() may change the weight of the cfs_rq->curr entity, we should also call update_min_vruntime() when that is the case. Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Signed-off-by: Yiwei Lin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Abel Wu Link: https://lore.kernel.org/r/20231117080106.12890-1-s921975628@gmail.com --- kernel/sched/fair.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 34fe6e9490c2..bcea3d55d95d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3815,17 +3815,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { update_load_add(&cfs_rq->load, se->load.weight); - if (!curr) { - /* - * The entity's vruntime has been adjusted, so let's check - * whether the rq-wide min_vruntime needs updated too.
Since - * the calculations above require stable min_vruntime rather - * than up-to-date one, we do the update at the end of the - * reweight process. - */ + if (!curr) __enqueue_entity(cfs_rq, se); - update_min_vruntime(cfs_rq); - } + + /* + * The entity's vruntime has been adjusted, so let's check + * whether the rq-wide min_vruntime needs updated too. Since + * the calculations above require stable min_vruntime rather + * than up-to-date one, we do the update at the end of the + * reweight process. + */ + update_min_vruntime(cfs_rq); } } -- cgit From 418146e39891ef1fb2284dee4cabbfe616cd21cf Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Mon, 20 Nov 2023 09:36:32 -0800 Subject: freezer,sched: Clean saved_state when restoring it during thaw Clean saved_state after using it during thaw. Cleaning the saved_state allows us to avoid some unnecessary branches in ttwu_state_match(). Signed-off-by: Elliot Berman Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231120-freezer-state-multiple-thaws-v1-2-f2e1dd7ce5a2@quicinc.com --- kernel/freezer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index c450fa8b8b5e..43b1d1b94d9e 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -187,6 +187,7 @@ static int __restore_freezer_state(struct task_struct *p, void *arg) if (state != TASK_RUNNING) { WRITE_ONCE(p->__state, state); + p->saved_state = TASK_RUNNING; return 1; } -- cgit From 5431fdd2c181dd2eac218e45b44deb2925fa48f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 17 Sep 2023 13:24:21 +0200 Subject: ptrace: Convert ptrace_attach() to use lock guards Created as a test of the conditional guard infrastructure. Specifically, this makes use of the following form: scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR, &task->signal->cred_guard_mutex) { ... } ...
return 0; Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20231102110706.568467727%40infradead.org --- include/linux/sched/task.h | 2 + include/linux/spinlock.h | 26 +++++++++ kernel/ptrace.c | 128 +++++++++++++++++++++------------------------ 3 files changed, 89 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index a23af225c898..4f3dca353556 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -226,4 +226,6 @@ static inline void task_unlock(struct task_struct *p) spin_unlock(&p->alloc_lock); } +DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T)) + #endif /* _LINUX_SCHED_TASK_H */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index ceb56b39c70f..90bc853cafb6 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -548,5 +548,31 @@ DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t, DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try, spin_trylock_irqsave(_T->lock, _T->flags)) +DEFINE_LOCK_GUARD_1(read_lock, rwlock_t, + read_lock(_T->lock), + read_unlock(_T->lock)) + +DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t, + read_lock_irq(_T->lock), + read_unlock_irq(_T->lock)) + +DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t, + read_lock_irqsave(_T->lock, _T->flags), + read_unlock_irqrestore(_T->lock, _T->flags), + unsigned long flags) + +DEFINE_LOCK_GUARD_1(write_lock, rwlock_t, + write_lock(_T->lock), + write_unlock(_T->lock)) + +DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t, + write_lock_irq(_T->lock), + write_unlock_irq(_T->lock)) + +DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t, + write_lock_irqsave(_T->lock, _T->flags), + write_unlock_irqrestore(_T->lock, _T->flags), + unsigned long flags) + #undef __LINUX_INSIDE_SPINLOCK_H #endif /* __LINUX_SPINLOCK_H */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d8b5e13a2229..5c579fb9a5e3 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -386,6 +386,34 @@ static int check_ptrace_options(unsigned long data) return 0; } +static inline void ptrace_set_stopped(struct task_struct *task) +{ + guard(spinlock)(&task->sighand->siglock); + + /* + * If the task is already STOPPED, set JOBCTL_TRAP_STOP and + * TRAPPING, and kick it so that it transits to TRACED. TRAPPING + * will be cleared if the child completes the transition or any + * event which clears the group stop states happens. We'll wait + * for the transition to complete before returning from this + * function. + * + * This hides STOPPED -> RUNNING -> TRACED transition from the + * attaching thread but a different thread in the same group can + * still observe the transient RUNNING state. IOW, if another + * thread's WNOHANG wait(2) on the stopped tracee races against + * ATTACH, the wait(2) may fail due to the transient RUNNING. + * + * The following task_is_stopped() test is safe as both transitions + * in and out of STOPPED are protected by siglock. 
+ */ + if (task_is_stopped(task) && + task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) { + task->jobctl &= ~JOBCTL_STOPPED; + signal_wake_up_state(task, __TASK_STOPPED); + } +} + static int ptrace_attach(struct task_struct *task, long request, unsigned long addr, unsigned long flags) @@ -393,17 +421,17 @@ static int ptrace_attach(struct task_struct *task, long request, bool seize = (request == PTRACE_SEIZE); int retval; - retval = -EIO; if (seize) { if (addr != 0) - goto out; + return -EIO; /* * This duplicates the check in check_ptrace_options() because * ptrace_attach() and ptrace_setoptions() have historically * used different error codes for unknown ptrace options. */ if (flags & ~(unsigned long)PTRACE_O_MASK) - goto out; + return -EIO; + retval = check_ptrace_options(flags); if (retval) return retval; @@ -414,88 +442,54 @@ static int ptrace_attach(struct task_struct *task, long request, audit_ptrace(task); - retval = -EPERM; if (unlikely(task->flags & PF_KTHREAD)) - goto out; + return -EPERM; if (same_thread_group(task, current)) - goto out; + return -EPERM; /* * Protect exec's credential calculations against our interference; * SUID, SGID and LSM creds get determined differently * under ptrace. */ - retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(&task->signal->cred_guard_mutex)) - goto out; + scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR, + &task->signal->cred_guard_mutex) { - task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); - task_unlock(task); - if (retval) - goto unlock_creds; + scoped_guard (task_lock, task) { + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); + if (retval) + return retval; + } - write_lock_irq(&tasklist_lock); - retval = -EPERM; - if (unlikely(task->exit_state)) - goto unlock_tasklist; - if (task->ptrace) - goto unlock_tasklist; + scoped_guard (write_lock_irq, &tasklist_lock) { + if (unlikely(task->exit_state)) + return -EPERM; + if (task->ptrace) + return -EPERM; - task->ptrace = flags; + task->ptrace = flags; - ptrace_link(task, current); + ptrace_link(task, current); - /* SEIZE doesn't trap tracee on attach */ - if (!seize) - send_sig_info(SIGSTOP, SEND_SIG_PRIV, task); + /* SEIZE doesn't trap tracee on attach */ + if (!seize) + send_sig_info(SIGSTOP, SEND_SIG_PRIV, task); - spin_lock(&task->sighand->siglock); + ptrace_set_stopped(task); + } + } /* - * If the task is already STOPPED, set JOBCTL_TRAP_STOP and - * TRAPPING, and kick it so that it transits to TRACED. TRAPPING - * will be cleared if the child completes the transition or any - * event which clears the group stop states happens. We'll wait - * for the transition to complete before returning from this - * function. - * - * This hides STOPPED -> RUNNING -> TRACED transition from the - * attaching thread but a different thread in the same group can - * still observe the transient RUNNING state. IOW, if another - * thread's WNOHANG wait(2) on the stopped tracee races against - * ATTACH, the wait(2) may fail due to the transient RUNNING. - * - * The following task_is_stopped() test is safe as both transitions - * in and out of STOPPED are protected by siglock. + * We do not bother to change retval or clear JOBCTL_TRAPPING + * if wait_on_bit() was interrupted by SIGKILL. The tracer will + * not return to user-mode, it will exit and clear this bit in + * __ptrace_unlink() if it wasn't already cleared by the tracee; + * and until then nobody can ptrace this task. 
*/ - if (task_is_stopped(task) && - task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) { - task->jobctl &= ~JOBCTL_STOPPED; - signal_wake_up_state(task, __TASK_STOPPED); - } - - spin_unlock(&task->sighand->siglock); - - retval = 0; -unlock_tasklist: - write_unlock_irq(&tasklist_lock); -unlock_creds: - mutex_unlock(&task->signal->cred_guard_mutex); -out: - if (!retval) { - /* - * We do not bother to change retval or clear JOBCTL_TRAPPING - * if wait_on_bit() was interrupted by SIGKILL. The tracer will - * not return to user-mode, it will exit and clear this bit in - * __ptrace_unlink() if it wasn't already cleared by the tracee; - * and until then nobody can ptrace this task. - */ - wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); - proc_ptrace_connector(task, PTRACE_ATTACH); - } + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); + proc_ptrace_connector(task, PTRACE_ATTACH); - return retval; + return 0; } /** -- cgit From d839a656d0f3caca9f96e9bf912fd394ac6a11bc Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Fri, 1 Dec 2023 14:53:55 +0900 Subject: kprobes: consistent rcu api usage for kretprobe holder It seems that the pointer-to-kretprobe "rp" within the kretprobe_holder is RCU-managed, based on the (non-rethook) implementation of get_kretprobe(). The thought behind this patch is to make use of the RCU API where possible when accessing this pointer so that the needed barriers are always in place and to self-document the code. The __rcu annotation to "rp" allows for sparse RCU checking. Plain writes done to the "rp" pointer are changed to make use of the RCU macro for assignment. For the single read, the implementation of get_kretprobe() is simplified by making use of an RCU macro which accomplishes the same, but note that the log warning text will be more generic. I did find that there is a difference in assembly generated between the usage of the RCU macros vs without. For example, on arm64, when using rcu_assign_pointer(), the corresponding store instruction is a store-release (STLR) which has an implicit barrier. When normal assignment is done, a regular store (STR) is found. In the macro case, this seems to be a result of rcu_assign_pointer() using smp_store_release() when the value to write is not NULL. 
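As a minimal sketch of the difference (illustrative, not patch code): on the writer side, rcu_assign_pointer() publishes with release semantics, so a reader that observes the new pointer also observes the writes that initialized the pointed-to object:

    /* plain store: no ordering between initializing *rp and publishing it */
    rp->rph->rp = rp;

    /* RCU publish: store-release (e.g. STLR on arm64); a reader doing
     * rcu_dereference() is guaranteed to see an initialized *rp */
    rcu_assign_pointer(rp->rph->rp, rp);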
Link: https://lore.kernel.org/all/20231122132058.3359-1-inwardvessel@gmail.com/ Fixes: d741bf41d7c7 ("kprobes: Remove kretprobe hash") Cc: stable@vger.kernel.org Signed-off-by: JP Kobryn Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- include/linux/kprobes.h | 7 ++----- kernel/kprobes.c | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index ab1da3142b06..64672bace560 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -139,7 +139,7 @@ static inline bool kprobe_ftrace(struct kprobe *p) * */ struct kretprobe_holder { - struct kretprobe *rp; + struct kretprobe __rcu *rp; struct objpool_head pool; }; @@ -245,10 +245,7 @@ unsigned long kretprobe_trampoline_handler(struct pt_regs *regs, static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri) { - RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), - "Kretprobe is accessed from instance under preemptive context"); - - return READ_ONCE(ri->rph->rp); + return rcu_dereference_check(ri->rph->rp, rcu_read_lock_any_held()); } static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 075a632e6c7c..d5a0ee40bf66 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2252,7 +2252,7 @@ int register_kretprobe(struct kretprobe *rp) rp->rph = NULL; return -ENOMEM; } - rp->rph->rp = rp; + rcu_assign_pointer(rp->rph->rp, rp); rp->nmissed = 0; /* Establish function entry probe point */ ret = register_kprobe(&rp->kp); @@ -2300,7 +2300,7 @@ void unregister_kretprobes(struct kretprobe **rps, int num) #ifdef CONFIG_KRETPROBE_ON_RETHOOK rethook_free(rps[i]->rh); #else - rps[i]->rph->rp = NULL; + rcu_assign_pointer(rps[i]->rph->rp, NULL); #endif } mutex_unlock(&kprobe_mutex); -- cgit From a1461f1fd6cfdc4b8917c9d4a91e92605d1f28dc Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 1 Dec 2023 14:53:56 +0900 Subject: rethook: Use __rcu pointer for rethook::handler Since rethook::handler is an RCU-managed pointer, which lets readers notice whether the rethook has been stopped (unregistered), it should be an __rcu pointer and be accessed via the appropriate RCU functions. This ensures the appropriate memory barriers are used when accessing it. OTOH, rethook::data never changes, so we don't need to check it in get_kretprobe(). NOTE: To avoid a sparse warning, rethook::handler is defined as a raw function pointer type with __rcu instead of rethook_handler_t.
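A condensed sketch of the publish/retract protocol this enforces (illustration only, assembled from the hunks below):

    /* writer: publish the handler, later retract it and free the rethook */
    rcu_assign_pointer(rh->handler, handler);   /* rethook_alloc() */
    ...
    rcu_assign_pointer(rh->handler, NULL);      /* rethook_stop() */
    call_rcu(&rh->rcu, rethook_free_rcu);       /* free after a grace period */

    /* reader: any flavour of RCU read-side protection is sufficient */
    handler = rcu_dereference_check(rh->handler, rcu_read_lock_any_held());
    if (handler)
            handler(node, rh->data, ret_addr, regs);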
Link: https://lore.kernel.org/all/170126066201.398836.837498688669005979.stgit@devnote2/ Fixes: 54ecbe6f1ed5 ("rethook: Add a generic return hook") Cc: stable@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311241808.rv9ceuAh-lkp@intel.com/ Tested-by: JP Kobryn Signed-off-by: Masami Hiramatsu (Google) --- include/linux/kprobes.h | 6 ++---- include/linux/rethook.h | 7 ++++++- kernel/trace/rethook.c | 23 ++++++++++++++--------- 3 files changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 64672bace560..0ff44d6633e3 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -197,10 +197,8 @@ extern int arch_trampoline_kprobe(struct kprobe *p); #ifdef CONFIG_KRETPROBE_ON_RETHOOK static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri) { - RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), - "Kretprobe is accessed from instance under preemptive context"); - - return (struct kretprobe *)READ_ONCE(ri->node.rethook->data); + /* rethook::data is non-changed field, so that you can access it freely. */ + return (struct kretprobe *)ri->node.rethook->data; } static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri) { diff --git a/include/linux/rethook.h b/include/linux/rethook.h index ce69b2b7bc35..ba60962805f6 100644 --- a/include/linux/rethook.h +++ b/include/linux/rethook.h @@ -28,7 +28,12 @@ typedef void (*rethook_handler_t) (struct rethook_node *, void *, unsigned long, */ struct rethook { void *data; - rethook_handler_t handler; + /* + * To avoid sparse warnings, this uses a raw function pointer with + * __rcu, instead of rethook_handler_t. But this must be same as + * rethook_handler_t. + */ + void (__rcu *handler) (struct rethook_node *, void *, unsigned long, struct pt_regs *); struct objpool_head pool; struct rcu_head rcu; }; diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 6fd7d4ecbbc6..fa03094e9e69 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -48,7 +48,7 @@ static void rethook_free_rcu(struct rcu_head *head) */ void rethook_stop(struct rethook *rh) { - WRITE_ONCE(rh->handler, NULL); + rcu_assign_pointer(rh->handler, NULL); } /** @@ -63,7 +63,7 @@ void rethook_stop(struct rethook *rh) */ void rethook_free(struct rethook *rh) { - WRITE_ONCE(rh->handler, NULL); + rethook_stop(rh); call_rcu(&rh->rcu, rethook_free_rcu); } @@ -82,6 +82,12 @@ static int rethook_fini_pool(struct objpool_head *head, void *context) return 0; } +static inline rethook_handler_t rethook_get_handler(struct rethook *rh) +{ + return (rethook_handler_t)rcu_dereference_check(rh->handler, + rcu_read_lock_any_held()); +} + /** * rethook_alloc() - Allocate struct rethook. * @data: a data to pass the @handler when hooking the return. 
@@ -107,7 +113,7 @@ struct rethook *rethook_alloc(void *data, rethook_handler_t handler, return ERR_PTR(-ENOMEM); rh->data = data; - rh->handler = handler; + rcu_assign_pointer(rh->handler, handler); /* initialize the objpool for rethook nodes */ if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh, @@ -135,9 +141,10 @@ static void free_rethook_node_rcu(struct rcu_head *head) */ void rethook_recycle(struct rethook_node *node) { - lockdep_assert_preemption_disabled(); + rethook_handler_t handler; - if (likely(READ_ONCE(node->rethook->handler))) + handler = rethook_get_handler(node->rethook); + if (likely(handler)) objpool_push(node, &node->rethook->pool); else call_rcu(&node->rcu, free_rethook_node_rcu); @@ -153,9 +160,7 @@ NOKPROBE_SYMBOL(rethook_recycle); */ struct rethook_node *rethook_try_get(struct rethook *rh) { - rethook_handler_t handler = READ_ONCE(rh->handler); - - lockdep_assert_preemption_disabled(); + rethook_handler_t handler = rethook_get_handler(rh); /* Check whether @rh is going to be freed. */ if (unlikely(!handler)) @@ -300,7 +305,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs, rhn = container_of(first, struct rethook_node, llist); if (WARN_ON_ONCE(rhn->frame != frame)) break; - handler = READ_ONCE(rhn->rethook->handler); + handler = rethook_get_handler(rhn->rethook); if (handler) handler(rhn, rhn->rethook->data, correct_ret_addr, regs); -- cgit From a51749ab34d9e5dec548fe38ede7e01e8bb26454 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 30 Nov 2023 21:48:17 +0100 Subject: locking/mutex: Document that mutex_unlock() is non-atomic I have seen several cases of attempts to use mutex_unlock() to release an object such that the object can then be freed by another task. This is not safe because mutex_unlock(), in the MUTEX_FLAG_WAITERS && !MUTEX_FLAG_HANDOFF case, accesses the mutex structure after having marked it as unlocked; so mutex_unlock() requires its caller to ensure that the mutex stays alive until mutex_unlock() returns. If MUTEX_FLAG_WAITERS is set and there are real waiters, those waiters have to keep the mutex alive, but we could have a spurious MUTEX_FLAG_WAITERS left if an interruptible/killable waiter bailed between the points where __mutex_unlock_slowpath() did the cmpxchg reading the flags and where it acquired the wait_lock. ( With spinlocks, that kind of code pattern is allowed and, from what I remember, used in several places in the kernel. ) Document this, such a semantic difference between mutexes and spinlocks is fairly unintuitive. [ mingo: Made the changelog a bit more assertive, refined the comments. ] Signed-off-by: Jann Horn Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231130204817.2031407-1-jannh@google.com --- Documentation/locking/mutex-design.rst | 6 ++++++ kernel/locking/mutex.c | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/Documentation/locking/mutex-design.rst b/Documentation/locking/mutex-design.rst index 78540cd7f54b..7572339b2f12 100644 --- a/Documentation/locking/mutex-design.rst +++ b/Documentation/locking/mutex-design.rst @@ -101,6 +101,12 @@ features that make lock debugging easier and faster: - Detects multi-task circular deadlocks and prints out all affected locks and tasks (and only those tasks). +Releasing a mutex is not an atomic operation: Once a mutex release operation +has begun, another context may be able to acquire the mutex before the release +operation has fully completed. 
The mutex user must ensure that the mutex is not +destroyed while a release operation is still in progress - in other words, +callers of mutex_unlock() must ensure that the mutex stays alive until +mutex_unlock() has returned. Interfaces ---------- diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2deeeca3e71b..cbae8c0b89ab 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -532,6 +532,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne * This function must not be used in interrupt context. Unlocking * of a not locked mutex is not allowed. * + * The caller must ensure that the mutex stays alive until this function has + * returned - mutex_unlock() can NOT directly be used to release an object such + * that another concurrent task can free it. + * Mutexes are different from spinlocks & refcounts in this aspect. + * * This function is similar to (but not equivalent to) up(). */ void __sched mutex_unlock(struct mutex *lock) -- cgit From d499fd418fa15949d86d28bb5442ab88203fc513 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Nov 2023 15:43:26 -0500 Subject: cgroup/rstat: Optimize cgroup_rstat_updated_list() The current design of cgroup_rstat_cpu_pop_updated() is to traverse the updated tree in a way that pops out the leaf nodes before their parents. This can cause traversal of multiple nodes before a leaf node can be found and popped out. IOW, a given node in the tree can be visited multiple times before the whole operation is done. So it is not very efficient and the code can be hard to read. With the introduction of cgroup_rstat_updated_list() to build a list of cgroups to be flushed first before any flushing operation is being done, we can optimize the way the updated tree nodes are popped, by pushing the parents to the tail end of the list before their children. In this way, most updated tree nodes will be visited only once, with the exception of the subtree root, as we still need to go back to its parent and pop it out of its updated_children list. This also makes the code easier to read. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 153 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 91 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 1f300bf4dc40..4ec29e6b1d8d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -74,64 +74,109 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) } /** - * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree - * @pos: current position - * @root: root of the tree to traversal + * cgroup_rstat_push_children - push children cgroups into the given list + * @head: current head of the list (= subtree root) + * @child: first child of the root * @cpu: target cpu + * Return: A new singly linked list of cgroups to be flush - * - * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts - * the traversal and %NULL return indicates the end. During traversal, - * each returned cgroup is unlinked from the tree. Must be called with the - * matching cgroup_rstat_cpu_lock held. + * Iteratively traverse down the cgroup_rstat_cpu updated tree level by + * level and push all the parents first before their next level children + * into a singly linked list built from the tail backward like "pushing" + * cgroups into a stack. The root is pushed by the caller.
+ */ +static struct cgroup *cgroup_rstat_push_children(struct cgroup *head, + struct cgroup *child, int cpu) +{ + struct cgroup *chead = child; /* Head of child cgroup level */ + struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */ + struct cgroup *parent, *grandchild; + struct cgroup_rstat_cpu *crstatc; + + child->rstat_flush_next = NULL; + +next_level: + while (chead) { + child = chead; + chead = child->rstat_flush_next; + parent = cgroup_parent(child); + + /* updated_next is parent cgroup terminated */ + while (child != parent) { + child->rstat_flush_next = head; + head = child; + crstatc = cgroup_rstat_cpu(child, cpu); + grandchild = crstatc->updated_children; + if (grandchild != child) { + /* Push the grand child to the next level */ + crstatc->updated_children = child; + grandchild->rstat_flush_next = ghead; + ghead = grandchild; + } + child = crstatc->updated_next; + crstatc->updated_next = NULL; + } + } + + if (ghead) { + chead = ghead; + ghead = NULL; + goto next_level; + } + return head; +} + +/** + * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed + * @root: root of the cgroup subtree to traverse + * @cpu: target cpu + * Return: A singly linked list of cgroups to be flushed + * + * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, + * each returned cgroup is unlinked from the updated tree. * * The only ordering guarantee is that, for a parent and a child pair - * covered by a given traversal, if a child is visited, its parent is - * guaranteed to be visited afterwards. + * covered by a given traversal, the child is before its parent in + * the list. + * + * Note that updated_children is self terminated and points to a list of + * child cgroups if not empty. Whereas updated_next is like a sibling link + * within the children list and terminated by the parent cgroup. An exception + * here is the cgroup root whose updated_next can be self terminated. */ -static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, - struct cgroup *root, int cpu) +static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) { - struct cgroup_rstat_cpu *rstatc; - struct cgroup *parent; - - if (pos == root) - return NULL; + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu); + struct cgroup *head = NULL, *parent, *child; + unsigned long flags; /* - * We're gonna walk down to the first leaf and visit/remove it. We - * can pick whatever unvisited node as the starting point. + * The _irqsave() is needed because cgroup_rstat_lock is + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring + * this lock with the _irq() suffix only disables interrupts on + * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables + * interrupts on both configurations. The _irqsave() ensures + * that interrupts are always disabled and later restored. */ - if (!pos) { - pos = root; - /* return NULL if this subtree is not on-list */ - if (!cgroup_rstat_cpu(pos, cpu)->updated_next) - return NULL; - } else { - pos = cgroup_parent(pos); - } + raw_spin_lock_irqsave(cpu_lock, flags); - /* walk down to the first leaf */ - while (true) { - rstatc = cgroup_rstat_cpu(pos, cpu); - if (rstatc->updated_children == pos) - break; - pos = rstatc->updated_children; - } + /* Return NULL if this subtree is not on-list */ + if (!rstatc->updated_next) + goto unlock_ret; /* - * Unlink @pos from the tree. As the updated_children list is + * Unlink @root from its parent. 
As the updated_children list is * singly linked, we have to walk it to find the removal point. - * However, due to the way we traverse, @pos will be the first - * child in most cases. The only exception is @root. */ - parent = cgroup_parent(pos); + parent = cgroup_parent(root); if (parent) { struct cgroup_rstat_cpu *prstatc; struct cgroup **nextp; prstatc = cgroup_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; - while (*nextp != pos) { + while (*nextp != root) { struct cgroup_rstat_cpu *nrstatc; nrstatc = cgroup_rstat_cpu(*nextp, cpu); @@ -142,31 +187,15 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, } rstatc->updated_next = NULL; - return pos; -} -/* Return a list of updated cgroups to be flushed */ -static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) -{ - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); - struct cgroup *head, *tail, *next; - unsigned long flags; - - /* - * The _irqsave() is needed because cgroup_rstat_lock is - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring - * this lock with the _irq() suffix only disables interrupts on - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables - * interrupts on both configurations. The _irqsave() ensures - * that interrupts are always disabled and later restored. - */ - raw_spin_lock_irqsave(cpu_lock, flags); - head = tail = cgroup_rstat_cpu_pop_updated(NULL, root, cpu); - while (tail) { - next = cgroup_rstat_cpu_pop_updated(tail, root, cpu); - tail->rstat_flush_next = next; - tail = next; - } + /* Push @root to the list first before pushing the children */ + head = root; + root->rstat_flush_next = NULL; + child = rstatc->updated_children; + rstatc->updated_children = root; + if (child != root) + head = cgroup_rstat_push_children(head, child, cpu); +unlock_ret: raw_spin_unlock_irqrestore(cpu_lock, flags); return head; } -- cgit From 12cd3cd8c797e07afcc47bc4afa760e4ec75e9d7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Nov 2023 17:11:42 +0200 Subject: params: Introduce the param_unknown_fn type Introduce a new type for the callback to parse an unknown argument. This unifies function prototypes which takes that as a parameter. Reviewed-by: Luis Chamberlain Reviewed-by: Kees Cook Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231120151419.1661807-2-andriy.shevchenko@linux.intel.com Signed-off-by: Kees Cook --- include/linux/moduleparam.h | 6 +++--- kernel/params.c | 8 ++------ 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 4fa9726bc328..bfb85fd13e1f 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -385,6 +385,8 @@ extern bool parameq(const char *name1, const char *name2); */ extern bool parameqn(const char *name1, const char *name2, size_t n); +typedef int (*parse_unknown_fn)(char *param, char *val, const char *doing, void *arg); + /* Called on module insert or kernel boot */ extern char *parse_args(const char *name, char *args, @@ -392,9 +394,7 @@ extern char *parse_args(const char *name, unsigned num, s16 level_min, s16 level_max, - void *arg, - int (*unknown)(char *param, char *val, - const char *doing, void *arg)); + void *arg, parse_unknown_fn unknown); /* Called by module remove. 
*/ #ifdef CONFIG_SYSFS diff --git a/kernel/params.c b/kernel/params.c index 2d4a0564697e..626fa8265932 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -120,9 +120,7 @@ static int parse_one(char *param, unsigned num_params, s16 min_level, s16 max_level, - void *arg, - int (*handle_unknown)(char *param, char *val, - const char *doing, void *arg)) + void *arg, parse_unknown_fn handle_unknown) { unsigned int i; int err; @@ -165,9 +163,7 @@ char *parse_args(const char *doing, unsigned num, s16 min_level, s16 max_level, - void *arg, - int (*unknown)(char *param, char *val, - const char *doing, void *arg)) + void *arg, parse_unknown_fn unknown) { char *param, *val, *err = NULL; -- cgit From fd0cd057a1b7351604daa6ffc91dfe28adf7225d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Nov 2023 17:11:43 +0200 Subject: params: Do not go over the limit when getting the string length We can use strnlen() even in the early boot stages, and it prevents us from running past the string boundaries in case the string is already too long. Reviewed-by: Luis Chamberlain Reviewed-by: Kees Cook Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231120151419.1661807-3-andriy.shevchenko@linux.intel.com Signed-off-by: Kees Cook --- kernel/params.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 626fa8265932..f8e3c4139854 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -260,7 +260,10 @@ EXPORT_SYMBOL_GPL(param_set_uint_minmax); int param_set_charp(const char *val, const struct kernel_param *kp) { - if (strlen(val) > 1024) { + size_t len, maxlen = 1024; + + len = strnlen(val, maxlen + 1); + if (len == maxlen + 1) { pr_err("%s: string parameter too long\n", kp->name); return -ENOSPC; } @@ -270,7 +273,7 @@ int param_set_charp(const char *val, const struct kernel_param *kp) /* This is a hack. We can't kmalloc in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { - *(char **)kp->arg = kmalloc_parameter(strlen(val)+1); + *(char **)kp->arg = kmalloc_parameter(len + 1); if (!*(char **)kp->arg) return -ENOMEM; strcpy(*(char **)kp->arg, val); @@ -508,7 +511,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - if (strlen(val)+1 > kps->maxlen) { + if (strnlen(val, kps->maxlen) == kps->maxlen) { pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); return -ENOSPC; -- cgit From 0fc79cbc937f2a754a302a710a94b68c61d0a89a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Nov 2023 17:11:44 +0200 Subject: params: Use size_add() for kmalloc() Protect the allocation size against integer overflow by using size_add().
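A brief illustration of why this matters (not from the patch itself): size_add() saturates at SIZE_MAX instead of wrapping, so an overly large size cannot turn the allocation into a too-small buffer:

    /* sizeof(*p) + size can wrap to a small value if size is near SIZE_MAX */
    p = kmalloc(sizeof(*p) + size, GFP_KERNEL);

    /* size_add() saturates at SIZE_MAX, so kmalloc() simply fails instead */
    p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL);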
Reviewed-by: Luis Chamberlain Reviewed-by: Kees Cook Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231120151419.1661807-4-andriy.shevchenko@linux.intel.com Signed-off-by: Kees Cook --- kernel/params.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index f8e3c4139854..c3a029fe183d 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -48,7 +49,7 @@ static void *kmalloc_parameter(unsigned int size) { struct kmalloced_param *p; - p = kmalloc(sizeof(*p) + size, GFP_KERNEL); + p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL); if (!p) return NULL; -- cgit From a05f096c2c0ca52e8fd34740c7d4b53ab3e7123e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Nov 2023 17:11:45 +0200 Subject: params: Sort headers Sort the headers in alphabetic order in order to ease the maintenance for this part. Reviewed-by: Luis Chamberlain Reviewed-by: Kees Cook Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231120151419.1661807-5-andriy.shevchenko@linux.intel.com Signed-off-by: Kees Cook --- kernel/params.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index c3a029fe183d..eb55b32399b4 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -3,18 +3,18 @@ Copyright (C) 2001 Rusty Russell. */ +#include +#include +#include +#include #include #include -#include -#include #include #include -#include -#include #include -#include -#include #include +#include +#include #ifdef CONFIG_SYSFS /* Protects all built-in parameters, modules use their own param_lock */ -- cgit From b5e3f86a47d34f7b8af899f8cc70520f6daf8b53 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Nov 2023 17:11:46 +0200 Subject: params: Fix multi-line comment style The multi-line comment style in the file is rather arbitrary. Make it follow the standard one. Reviewed-by: Luis Chamberlain Reviewed-by: Kees Cook Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231120151419.1661807-6-andriy.shevchenko@linux.intel.com Signed-off-by: Kees Cook --- kernel/params.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index eb55b32399b4..2e447f8ae183 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* Helpers for initial module or kernel cmdline parsing - Copyright (C) 2001 Rusty Russell. - -*/ +/* + * Helpers for initial module or kernel cmdline parsing + * Copyright (C) 2001 Rusty Russell. + */ #include #include #include @@ -271,8 +271,10 @@ int param_set_charp(const char *val, const struct kernel_param *kp) maybe_kfree_parameter(*(char **)kp->arg); - /* This is a hack. We can't kmalloc in early boot, and we - * don't need to; this mangled commandline is preserved. */ + /* + * This is a hack. We can't kmalloc() in early boot, and we + * don't need to; this mangled commandline is preserved. + */ if (slab_is_available()) { *(char **)kp->arg = kmalloc_parameter(len + 1); if (!*(char **)kp->arg) @@ -743,8 +745,10 @@ void module_param_sysfs_remove(struct module *mod) { if (mod->mkobj.mp) { sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); - /* We are positive that no one is using any param - * attrs at this point. Deallocate immediately. 
*/ + /* + * We are positive that no one is using any param + * attrs at this point. Deallocate immediately. + */ free_module_param_attrs(&mod->mkobj); } } -- cgit From 8a3750ecf8104de55c569ffbe844a85aa9d5deaa Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 30 Nov 2023 12:56:08 -0800 Subject: tracing/uprobe: Replace strlcpy() with strscpy() strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated[1]. Additionally, it returns the size of the source string, not the resulting size of the destination string. In an effort to remove strlcpy() completely[2], replace strlcpy() here with strscpy(). The negative return value is already handled by this code so no new handling is needed here. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [1] Link: https://github.com/KSPP/linux/issues/89 [2] Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: linux-trace-kernel@vger.kernel.org Acked-by: "Masami Hiramatsu (Google)" Link: https://lore.kernel.org/r/20231130205607.work.463-kees@kernel.org Signed-off-by: Kees Cook --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 99c051de412a..a84b85d8aac1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -151,7 +151,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base) return -ENOMEM; if (addr == FETCH_TOKEN_COMM) - ret = strlcpy(dst, current->comm, maxlen); + ret = strscpy(dst, current->comm, maxlen); else ret = strncpy_from_user(dst, src, maxlen); if (ret >= 0) { -- cgit From dfce9cb3140592b886838e06f3e0c25fea2a9cae Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 30 Nov 2023 18:46:40 -0800 Subject: bpf: Fix a verifier bug due to incorrect branch offset comparison with cpu=v4 BPF cpu=v4 support was introduced in [1], and commit 4cd58e9af8b9 ("bpf: Support new 32bit offset jmp instruction") added support for the new 32-bit offset jmp instruction. Unfortunately, in function bpf_adj_delta_to_off(), for the new branch insn with a 32-bit offset, the offset (plus/minus a small delta) was compared against the 16-bit offset bound [S16_MIN, S16_MAX], which caused the following verification failure: $ ./test_progs-cpuv4 -t verif_scale_pyperf180 ... insn 10 cannot be patched due to 16-bit range ... libbpf: failed to load object 'pyperf180.bpf.o' scale_test:FAIL:expect_success unexpected error: -12 (errno 12) #405 verif_scale_pyperf180:FAIL Note that due to recent llvm18 development, the patch [2] (already applied in bpf-next) needs to be applied to the bpf tree for testing purposes. The fix is rather simple: for the 32-bit offset branch insn, compare the adjusted offset against [S32_MIN, S32_MAX], after which verification succeeds.
[1] https://lore.kernel.org/all/20230728011143.3710005-1-yonghong.song@linux.dev [2] https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev Fixes: 4cd58e9af8b9 ("bpf: Support new 32bit offset jmp instruction") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231201024640.3417057-1-yonghong.song@linux.dev --- kernel/bpf/core.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cd3afe57ece3..fe254ae035fe 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -371,14 +371,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { - const s32 off_min = S16_MIN, off_max = S16_MAX; + s64 off_min, off_max, off; s32 delta = end_new - end_old; - s32 off; - if (insn->code == (BPF_JMP32 | BPF_JA)) + if (insn->code == (BPF_JMP32 | BPF_JA)) { off = insn->imm; - else + off_min = S32_MIN; + off_max = S32_MAX; + } else { off = insn->off; + off_min = S16_MIN; + off_max = S16_MAX; + } if (curr < pos && curr + off + 1 >= end_old) off += delta; -- cgit From ac9c05e0e453cfcab2866f6d28f257590e4f66e5 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 29 Nov 2023 15:44:12 -0800 Subject: bpf: Add kfunc bpf_get_file_xattr It is common practice for security solutions to store tags/labels in xattrs. To implement similar functionalities in BPF LSM, add a new kfunc, bpf_get_file_xattr(). The first use case of bpf_get_file_xattr() is to implement file verifications with asymmetric keys. Specifically, security applications could use fsverity for file hashes and use xattrs to store file signatures. (A kfunc for the fsverity hash will be added in a separate commit.) Currently, only xattrs with the "user." prefix can be read with kfunc bpf_get_file_xattr(). As use cases evolve, we may add a dedicated prefix for bpf_get_file_xattr(). To avoid recursion, bpf_get_file_xattr() can only be called from LSM hooks. Signed-off-by: Song Liu Acked-by: Christian Brauner Acked-by: KP Singh Link: https://lore.kernel.org/r/20231129234417.856536-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c284a4ad0315..1648bde28f01 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -1431,6 +1432,72 @@ static int __init bpf_key_sig_kfuncs_init(void) late_initcall(bpf_key_sig_kfuncs_init); #endif /* CONFIG_KEYS */ +/* filesystem kfuncs */ +__bpf_kfunc_start_defs(); + +/** + * bpf_get_file_xattr - get xattr of a file + * @file: file to get xattr from + * @name__str: name of the xattr + * @value_ptr: output buffer of the xattr value + * + * Get xattr *name__str* of *file* and store the output in *value_ptr*. + * + * For security reasons, only *name__str* with prefix "user." is allowed. + * + * Return: 0 on success, a negative value on error.
+ */ +__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, + struct bpf_dynptr_kern *value_ptr) +{ + struct dentry *dentry; + u32 value_len; + void *value; + int ret; + + if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + return -EPERM; + + value_len = __bpf_dynptr_size(value_ptr); + value = __bpf_dynptr_data_rw(value_ptr, value_len); + if (!value) + return -EINVAL; + + dentry = file_dentry(file); + ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ); + if (ret) + return ret; + return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len); +} + +__bpf_kfunc_end_defs(); + +BTF_SET8_START(fs_kfunc_set_ids) +BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_SET8_END(fs_kfunc_set_ids) + +static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id)) + return 0; + + /* Only allow to attach from LSM hooks, to avoid recursion */ + return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0; +} + +static const struct btf_kfunc_id_set bpf_fs_kfunc_set = { + .owner = THIS_MODULE, + .set = &fs_kfunc_set_ids, + .filter = bpf_get_file_xattr_filter, +}; + +static int __init bpf_fs_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set); +} + +late_initcall(bpf_fs_kfuncs_init); + static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { -- cgit From 5fad52bee30414270104525e3a0266327a6e9d11 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:56:56 -0800 Subject: bpf: provide correct register name for exception callback retval check bpf_throw() is checking R1, so let's report R1 in the log. Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 12 ++++++------ tools/testing/selftests/bpf/progs/exceptions_assert.c | 2 +- tools/testing/selftests/bpf/progs/exceptions_fail.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e7b6072e3f4..25b9d470957e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11805,7 +11805,7 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, return 0; } -static int check_return_code(struct bpf_verifier_env *env, int regno); +static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) @@ -11942,7 +11942,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, * to bpf_throw becomes the return value of the program. 
*/ if (!env->exception_callback_subprog) { - err = check_return_code(env, BPF_REG_1); + err = check_return_code(env, BPF_REG_1, "R1"); if (err < 0) return err; } @@ -14972,7 +14972,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static int check_return_code(struct bpf_verifier_env *env, int regno) +static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name) { struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; @@ -15026,7 +15026,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno) } if (!tnum_in(const_0, reg->var_off)) { - verbose_invalid_scalar(env, reg, &const_0, "async callback", "R0"); + verbose_invalid_scalar(env, reg, &const_0, "async callback", reg_name); return -EINVAL; } return 0; @@ -15126,7 +15126,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno) } if (!tnum_in(range, reg->var_off)) { - verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); + verbose_invalid_scalar(env, reg, &range, "program exit", reg_name); if (prog->expected_attach_type == BPF_LSM_CGROUP && prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) @@ -17410,7 +17410,7 @@ process_bpf_exit_full: continue; } - err = check_return_code(env, BPF_REG_0); + err = check_return_code(env, BPF_REG_0, "R0"); if (err) return err; process_bpf_exit: diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c index 49efaed143fc..575e7dd719c4 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_assert.c +++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c @@ -125,7 +125,7 @@ int check_assert_generic(struct __sk_buff *ctx) } SEC("?fentry/bpf_check") -__failure __msg("At program exit the register R0 has value (0x40; 0x0)") +__failure __msg("At program exit the register R1 has value (0x40; 0x0)") int check_assert_with_return(void *ctx) { bpf_assert_with(!ctx, 64); diff --git a/tools/testing/selftests/bpf/progs/exceptions_fail.c b/tools/testing/selftests/bpf/progs/exceptions_fail.c index 8c0ef2742208..81ead7512ba2 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_fail.c +++ b/tools/testing/selftests/bpf/progs/exceptions_fail.c @@ -308,7 +308,7 @@ int reject_set_exception_cb_bad_ret1(void *ctx) } SEC("?fentry/bpf_check") -__failure __msg("At program exit the register R0 has value (0x40; 0x0) should") +__failure __msg("At program exit the register R1 has value (0x40; 0x0) should") int reject_set_exception_cb_bad_ret2(void *ctx) { bpf_throw(64); -- cgit From 0acd03a5bd188b0c501d285d938439618bd855c4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:56:57 -0800 Subject: bpf: enforce precision of R0 on callback return Given the verifier checks the actual value, r0 has to be precise, so we need to propagate precision properly. r0 also has to be marked as read; otherwise, subsequent state comparisons will ignore such a register as unimportant, and precision won't really help here.
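As a toy illustration of why the read/precise marks matter (a standalone sketch, not the verifier's actual regsafe()/pruning code; all names below are made up for the example), a cached state whose r0 is not marked precise is treated as "don't care" during state comparison, so a wider current r0 would be wrongly pruned:

  #include <stdbool.h>
  #include <stdio.h>

  /* Toy model of a tracked scalar register. */
  struct toy_reg {
          long long smin, smax; /* known signed bounds */
          bool precise;         /* must not be ignored when comparing */
  };

  /* A cached (old) state may prune the current state only if every
   * register it cared about is at least as narrow now. An imprecise
   * register is skipped entirely, which is exactly the hole that
   * skipping mark_chain_precision() would leave for r0.
   */
  static bool state_prunes(const struct toy_reg *old, const struct toy_reg *cur)
  {
          if (!old->precise)
                  return true; /* ignored during comparison */
          return old->smin <= cur->smin && cur->smax <= old->smax;
  }

  int main(void)
  {
          struct toy_reg old = { .smin = 0, .smax = 1, .precise = false };
          struct toy_reg cur = { .smin = 0, .smax = 100, .precise = false };

          printf("imprecise r0: pruned=%d\n", state_prunes(&old, &cur));
          old.precise = true;
          printf("precise r0:   pruned=%d\n", state_prunes(&old, &cur));
          return 0;
  }

Here the first comparison wrongly prunes a state whose r0 may be as large as 100 against a cached state that only ever saw [0, 1]; marking r0 precise forces the real bounds comparison.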
Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 25b9d470957e..849fbf47b5f3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9590,6 +9590,13 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) verbose(env, "R0 not a scalar value\n"); return -EACCES; } + + /* we are going to rely on register's precise value */ + err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64); + err = err ?: mark_chain_precision(env, BPF_REG_0); + if (err) + return err; + if (!tnum_in(range, r0->var_off)) { verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); return -EINVAL; } -- cgit From 8fa4ecd49b81ccd9d1d87f1c8b2260e218644878 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:56:58 -0800 Subject: bpf: enforce exact retval range on subprog/callback exit Instead of relying on a potentially imprecise tnum representation of the expected return value range for callbacks and subprogs, validate that the smin/smax range satisfies the exact expected range of return values. E.g., if a callback needs to return values in the [0, 2] range, a tnum can't represent this precisely and will instead allow the [0, 3] range. By checking the smin/smax range, we can make sure that the subprog/callback indeed returns only the valid [0, 2] range. Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 ++++++- kernel/bpf/verifier.c | 33 ++++++++++++++++++++++----------- 2 files changed, 28 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0c0e1bccad45..3378cc753061 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -275,6 +275,11 @@ struct bpf_reference_state { int callback_ref; }; +struct bpf_retval_range { + s32 minval; + s32 maxval; +}; + /* state of the program: * type of all registers and stack info */ @@ -297,7 +302,7 @@ struct bpf_func_state { * void foo(void) { bpf_timer_set_callback(,foo); } */ u32 async_entry_cnt; - struct tnum callback_ret_range; + struct bpf_retval_range callback_ret_range; bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 849fbf47b5f3..f3d9d7de68da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2305,6 +2305,11 @@ static void init_reg_state(struct bpf_verifier_env *env, regs[BPF_REG_FP].frameno = state->frameno; } +static struct bpf_retval_range retval_range(s32 minval, s32 maxval) +{ + return (struct bpf_retval_range){ minval, maxval }; +} + #define BPF_MAIN_FUNC (-1) static void init_func_state(struct bpf_verifier_env *env, struct bpf_func_state *state, @@ -2313,7 +2318,7 @@ static void init_func_state(struct bpf_verifier_env *env, state->callsite = callsite; state->frameno = frameno; state->subprogno = subprogno; - state->callback_ret_range = tnum_range(0, 0); + state->callback_ret_range = retval_range(0, 0); init_reg_state(env, state); mark_verifier_state_scratched(env); } @@ -9396,7 +9401,7 @@ static int
set_map_elem_callback_state(struct bpf_verifier_env *env, return err; callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9418,7 +9423,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9448,7 +9453,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9476,7 +9481,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9499,7 +9504,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9531,7 +9536,7 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9560,6 +9565,11 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) return is_rbtree_lock_required_kfunc(kfunc_btf_id); } +static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) +{ + return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state, *prev_st; @@ -9583,9 +9593,6 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller = state->frame[state->curframe - 1]; if (callee->in_callback_fn) { - /* enforce R0 return value range [0, 1]. */ - struct tnum range = callee->callback_ret_range; - if (r0->type != SCALAR_VALUE) { verbose(env, "R0 not a scalar value\n"); return -EACCES; @@ -9597,7 +9604,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) if (err) return err; - if (!tnum_in(range, r0->var_off)) { + /* enforce R0 return value range */ + if (!retval_range_within(callee->callback_ret_range, r0)) { + struct tnum range = tnum_range(callee->callback_ret_range.minval, + callee->callback_ret_range.maxval); + verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); return -EINVAL; } -- cgit From c871d0e00f0e8c207ce8ff89025e35cc49a8a3c3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:57:00 -0800 Subject: bpf: enforce precise retval range on program exit Similarly to subprog/callback logic, enforce return value of BPF program using more precise smin/smax range. We need to adjust a bunch of tests due to a changed format of an error message. 
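The tnum-vs-range gap described in the previous commit is easy to reproduce; the following standalone sketch mirrors (but does not copy) the kernel's tnum_range() and retval_range_within() logic to show why [0, 2] degrades to [0, 3] in tnum form while the smin/smax check stays exact:

  #include <stdbool.h>
  #include <stdio.h>

  /* Minimal tnum model: value holds the known bits, mask the unknown. */
  struct tnum { unsigned long long value, mask; };

  /* Smallest tnum covering [min, max]: every bit that can differ
   * between min and max becomes unknown. (Sketch only; assumes
   * min ^ max fits in 63 bits, unlike the guarded kernel version.)
   */
  static struct tnum tnum_range(unsigned long long min, unsigned long long max)
  {
          unsigned long long chi = min ^ max, delta;
          unsigned char bits = 0;

          while (chi) { /* find the highest differing bit */
                  chi >>= 1;
                  bits++;
          }
          delta = (1ULL << bits) - 1;
          return (struct tnum){ .value = min & ~delta, .mask = delta };
  }

  static bool tnum_allows(struct tnum t, unsigned long long v)
  {
          return (v & ~t.mask) == t.value;
  }

  int main(void)
  {
          struct tnum t = tnum_range(0, 2);
          long long smin = 3, smax = 3; /* constant return value 3 */

          /* tnum over-approximates [0, 2] as (value=0, mask=3): 3 passes */
          printf("tnum allows 3: %d\n", tnum_allows(t, 3));
          /* the smin/smax check is exact and rejects it */
          printf("range allows 3: %d\n", 0 <= smin && smax <= 2);
          return 0;
  }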
Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 56 +++++++++++----------- .../selftests/bpf/progs/exceptions_assert.c | 2 +- .../testing/selftests/bpf/progs/exceptions_fail.c | 2 +- .../selftests/bpf/progs/test_global_func15.c | 2 +- tools/testing/selftests/bpf/progs/timer_failure.c | 2 +- .../selftests/bpf/progs/user_ringbuf_fail.c | 2 +- .../bpf/progs/verifier_cgroup_inv_retcode.c | 8 ++-- .../bpf/progs/verifier_netfilter_retcode.c | 2 +- .../bpf/progs/verifier_subprog_precision.c | 2 +- 9 files changed, 40 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3d9d7de68da..9411c1046268 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -362,20 +362,23 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) static void verbose_invalid_scalar(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - struct tnum *range, const char *ctx, + struct bpf_retval_range range, const char *ctx, const char *reg_name) { - char tn_buf[48]; + bool unknown = true; - verbose(env, "At %s the register %s ", ctx, reg_name); - if (!tnum_is_unknown(reg->var_off)) { - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "has value %s", tn_buf); - } else { - verbose(env, "has unknown scalar value"); + verbose(env, "At %s the register %s has", ctx, reg_name); + if (reg->smin_value > S64_MIN) { + verbose(env, " smin=%lld", reg->smin_value); + unknown = false; } - tnum_strn(tn_buf, sizeof(tn_buf), *range); - verbose(env, " should have been in %s\n", tn_buf); + if (reg->smax_value < S64_MAX) { + verbose(env, " smax=%lld", reg->smax_value); + unknown = false; + } + if (unknown) + verbose(env, " unknown scalar value"); + verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval); } static bool type_may_be_null(u32 type) @@ -9606,10 +9609,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) /* enforce R0 return value range */ if (!retval_range_within(callee->callback_ret_range, r0)) { - struct tnum range = tnum_range(callee->callback_ret_range.minval, - callee->callback_ret_range.maxval); - - verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); + verbose_invalid_scalar(env, r0, callee->callback_ret_range, + "callback return", "R0"); return -EINVAL; } if (!calls_callback(env, callee->callsite)) { @@ -14995,7 +14996,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; - struct tnum range = tnum_range(0, 1), const_0 = tnum_const(0); + struct bpf_retval_range range = retval_range(0, 1); + struct bpf_retval_range const_0 = retval_range(0, 0); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; struct bpf_func_state *frame = env->cur_state->frame[0]; @@ -15043,8 +15045,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return -EINVAL; } - if (!tnum_in(const_0, reg->var_off)) { - verbose_invalid_scalar(env, reg, &const_0, "async callback", reg_name); + if (!retval_range_within(const_0, reg)) { + verbose_invalid_scalar(env, reg, const_0, "async callback", reg_name); return -EINVAL; } return 0; @@ -15070,14 +15072,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, 
const char env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME || env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME) - range = tnum_range(1, 1); + range = retval_range(1, 1); if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) - range = tnum_range(0, 3); + range = retval_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { - range = tnum_range(0, 3); + range = retval_range(0, 3); enforce_attach_type_range = tnum_range(2, 3); } break; @@ -15090,13 +15092,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char case BPF_PROG_TYPE_RAW_TRACEPOINT: if (!env->prog->aux->attach_btf_id) return 0; - range = tnum_const(0); + range = retval_range(0, 0); break; case BPF_PROG_TYPE_TRACING: switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: - range = tnum_const(0); + range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: case BPF_MODIFY_RETURN: @@ -15108,7 +15110,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char } break; case BPF_PROG_TYPE_SK_LOOKUP: - range = tnum_range(SK_DROP, SK_PASS); + range = retval_range(SK_DROP, SK_PASS); break; case BPF_PROG_TYPE_LSM: @@ -15122,12 +15124,12 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char /* Make sure programs that attach to void * hooks don't try to modify return value. */ - range = tnum_range(1, 1); + range = retval_range(1, 1); } break; case BPF_PROG_TYPE_NETFILTER: - range = tnum_range(NF_DROP, NF_ACCEPT); + range = retval_range(NF_DROP, NF_ACCEPT); break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value @@ -15143,8 +15145,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return -EINVAL; } - if (!tnum_in(range, reg->var_off)) { - verbose_invalid_scalar(env, reg, &range, "program exit", reg_name); + if (!retval_range_within(range, reg)) { + verbose_invalid_scalar(env, reg, range, "program exit", reg_name); if (prog->expected_attach_type == BPF_LSM_CGROUP && prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c index 575e7dd719c4..0ef81040da59 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_assert.c +++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c @@ -125,7 +125,7 @@ int check_assert_generic(struct __sk_buff *ctx) } SEC("?fentry/bpf_check") -__failure __msg("At program exit the register R1 has value (0x40; 0x0)") +__failure __msg("At program exit the register R1 has smin=64 smax=64") int check_assert_with_return(void *ctx) { bpf_assert_with(!ctx, 64); diff --git a/tools/testing/selftests/bpf/progs/exceptions_fail.c b/tools/testing/selftests/bpf/progs/exceptions_fail.c index 81ead7512ba2..9cceb6521143 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_fail.c +++ b/tools/testing/selftests/bpf/progs/exceptions_fail.c @@ -308,7 +308,7 @@ int reject_set_exception_cb_bad_ret1(void *ctx) } SEC("?fentry/bpf_check") -__failure __msg("At program exit the register R1 has value (0x40; 0x0) should") +__failure __msg("At program exit the register R1 has smin=64 smax=64 should") int reject_set_exception_cb_bad_ret2(void *ctx) { bpf_throw(64); diff --git 
a/tools/testing/selftests/bpf/progs/test_global_func15.c b/tools/testing/selftests/bpf/progs/test_global_func15.c index b512d6a6c75e..f80207480e8a 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func15.c +++ b/tools/testing/selftests/bpf/progs/test_global_func15.c @@ -13,7 +13,7 @@ __noinline int foo(unsigned int *v) } SEC("cgroup_skb/ingress") -__failure __msg("At program exit the register R0 has value") +__failure __msg("At program exit the register R0 has ") int global_func15(struct __sk_buff *skb) { unsigned int v = 1; diff --git a/tools/testing/selftests/bpf/progs/timer_failure.c b/tools/testing/selftests/bpf/progs/timer_failure.c index 226d33b5a05c..9000da1e2120 100644 --- a/tools/testing/selftests/bpf/progs/timer_failure.c +++ b/tools/testing/selftests/bpf/progs/timer_failure.c @@ -30,7 +30,7 @@ static int timer_cb_ret1(void *map, int *key, struct bpf_timer *timer) } SEC("fentry/bpf_fentry_test1") -__failure __msg("should have been in (0x0; 0x0)") +__failure __msg("should have been in [0, 0]") int BPF_PROG2(test_ret_1, int, a) { int key = 0; diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c index 03ee946c6bf7..11ab25c42c36 100644 --- a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c +++ b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c @@ -184,7 +184,7 @@ invalid_drain_callback_return(struct bpf_dynptr *dynptr, void *context) * not be able to write to that pointer. */ SEC("?raw_tp") -__failure __msg("At callback return the register R0 has value") +__failure __msg("At callback return the register R0 has ") int user_ringbuf_callback_invalid_return(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, invalid_drain_callback_return, NULL, 0); diff --git a/tools/testing/selftests/bpf/progs/verifier_cgroup_inv_retcode.c b/tools/testing/selftests/bpf/progs/verifier_cgroup_inv_retcode.c index d6c4a7f3f790..6e0f349f8f15 100644 --- a/tools/testing/selftests/bpf/progs/verifier_cgroup_inv_retcode.c +++ b/tools/testing/selftests/bpf/progs/verifier_cgroup_inv_retcode.c @@ -7,7 +7,7 @@ SEC("cgroup/sock") __description("bpf_exit with invalid return code. test1") -__failure __msg("R0 has value (0x0; 0xffffffff)") +__failure __msg("smin=0 smax=4294967295 should have been in [0, 1]") __naked void with_invalid_return_code_test1(void) { asm volatile (" \ @@ -30,7 +30,7 @@ __naked void with_invalid_return_code_test2(void) SEC("cgroup/sock") __description("bpf_exit with invalid return code. test3") -__failure __msg("R0 has value (0x0; 0x3)") +__failure __msg("smin=0 smax=3 should have been in [0, 1]") __naked void with_invalid_return_code_test3(void) { asm volatile (" \ @@ -53,7 +53,7 @@ __naked void with_invalid_return_code_test4(void) SEC("cgroup/sock") __description("bpf_exit with invalid return code. test5") -__failure __msg("R0 has value (0x2; 0x0)") +__failure __msg("smin=2 smax=2 should have been in [0, 1]") __naked void with_invalid_return_code_test5(void) { asm volatile (" \ @@ -75,7 +75,7 @@ __naked void with_invalid_return_code_test6(void) SEC("cgroup/sock") __description("bpf_exit with invalid return code. 
test7") -__failure __msg("R0 has unknown scalar value") +__failure __msg("R0 has unknown scalar value should have been in [0, 1]") __naked void with_invalid_return_code_test7(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_netfilter_retcode.c b/tools/testing/selftests/bpf/progs/verifier_netfilter_retcode.c index 353ae6da00e1..e1ffa5d32ff0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_netfilter_retcode.c +++ b/tools/testing/selftests/bpf/progs/verifier_netfilter_retcode.c @@ -39,7 +39,7 @@ __naked void with_valid_return_code_test3(void) SEC("netfilter") __description("bpf_exit with invalid return code. test4") -__failure __msg("R0 has value (0x2; 0x0)") +__failure __msg("R0 has smin=2 smax=2 should have been in [0, 1]") __naked void with_invalid_return_code_test4(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index d41d2a8bb97e..0dfe3f8b69ac 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -148,7 +148,7 @@ __msg("mark_precise: frame1: regs=r0 stack= before 11: (b7) r0 = 0") __msg("from 10 to 12: frame1: R0=scalar(smin=umin=1001") /* check that branch code path marks r0 as precise, before failing */ __msg("mark_precise: frame1: regs=r0 stack= before 9: (85) call bpf_get_prandom_u32#7") -__msg("At callback return the register R0 has value (0x0; 0x7fffffffffffffff) should have been in (0x0; 0x1)") +__msg("At callback return the register R0 has smin=1001 should have been in [0, 1]") __naked int callback_precise_return_fail(void) { asm volatile ( -- cgit From 0ef24c8dfae24a4b8aa2e92eac20faecdc5502e5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:57:01 -0800 Subject: bpf: unify async callback and program retval checks Use common logic to verify program return values and async callback return values. This allows to avoid duplication of any extra steps necessary, like precision marking, which will be added in the next patch. 
Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9411c1046268..c54944af1bcc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -367,7 +367,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, { bool unknown = true; - verbose(env, "At %s the register %s has", ctx, reg_name); + verbose(env, "%s the register %s has", ctx, reg_name); if (reg->smin_value > S64_MIN) { verbose(env, " smin=%lld", reg->smin_value); unknown = false; @@ -9610,7 +9610,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) /* enforce R0 return value range */ if (!retval_range_within(callee->callback_ret_range, r0)) { verbose_invalid_scalar(env, r0, callee->callback_ret_range, - "callback return", "R0"); + "At callback return", "R0"); return -EINVAL; } if (!calls_callback(env, callee->callsite)) { @@ -14993,11 +14993,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name) { + const char *exit_ctx = "At program exit"; struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; struct bpf_retval_range range = retval_range(0, 1); - struct bpf_retval_range const_0 = retval_range(0, 0); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; struct bpf_func_state *frame = env->cur_state->frame[0]; @@ -15039,17 +15039,9 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char if (frame->in_async_callback_fn) { /* enforce return zero from async callbacks like timer */ - if (reg->type != SCALAR_VALUE) { - verbose(env, "In async callback the register R%d is not a known value (%s)\n", - regno, reg_type_str(env, reg->type)); - return -EINVAL; - } - - if (!retval_range_within(const_0, reg)) { - verbose_invalid_scalar(env, reg, const_0, "async callback", reg_name); - return -EINVAL; - } - return 0; + exit_ctx = "At async callback return"; + range = retval_range(0, 0); + goto enforce_retval; } if (is_subprog && !frame->in_exception_callback_fn) { @@ -15139,15 +15131,17 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return 0; } +enforce_retval: if (reg->type != SCALAR_VALUE) { - verbose(env, "At program exit the register R%d is not a known value (%s)\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s the register R%d is not a known value (%s)\n", + exit_ctx, regno, reg_type_str(env, reg->type)); return -EINVAL; } if (!retval_range_within(range, reg)) { - verbose_invalid_scalar(env, reg, range, "program exit", reg_name); - if (prog->expected_attach_type == BPF_LSM_CGROUP && + verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name); + if (!is_subprog && + prog->expected_attach_type == BPF_LSM_CGROUP && prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); -- cgit From eabe518de533a4291996020977054a7a7b78c7d3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:57:02 -0800 Subject: bpf: enforce precision of R0 on program/async callback return Given 
we enforce a valid range for program and async callback return value, we must mark R0 as precise to avoid incorrect state pruning. Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c54944af1bcc..2cd150d6d141 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15138,6 +15138,10 @@ enforce_retval: return -EINVAL; } + err = mark_chain_precision(env, regno); + if (err) + return err; + if (!retval_range_within(range, reg)) { verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name); if (!is_subprog && -- cgit From 81eff2e36481c5cf4a2ac906ae56c3fbd3e6f305 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:57:05 -0800 Subject: bpf: simplify tnum output if a fully known constant Emit tnum representation as just a constant if all bits are known. Use decimal-vs-hex logic to determine exact format of emitted constant value, just like it's done for register range values. For that move tnum_strn() to kernel/bpf/log.c to reuse decimal-vs-hex determination logic and constants. Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-12-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 13 +++++++++++++ kernel/bpf/tnum.c | 6 ------ .../selftests/bpf/progs/verifier_direct_packet_access.c | 2 +- tools/testing/selftests/bpf/progs/verifier_int_ptr.c | 2 +- tools/testing/selftests/bpf/progs/verifier_stack_ptr.c | 4 ++-- 5 files changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 3505f3e5ae96..55d019f30e91 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -539,6 +539,19 @@ static void verbose_snum(struct bpf_verifier_env *env, s64 num) verbose(env, "%#llx", num); } +int tnum_strn(char *str, size_t size, struct tnum a) +{ + /* print as a constant, if tnum is fully known */ + if (a.mask == 0) { + if (is_unum_decimal(a.value)) + return snprintf(str, size, "%llu", a.value); + else + return snprintf(str, size, "%#llx", a.value); + } + return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); +} +EXPORT_SYMBOL_GPL(tnum_strn); + static void print_scalar_ranges(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, const char **sep) diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index f4c91c9b27d7..9dbc31b25e3d 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -172,12 +172,6 @@ bool tnum_in(struct tnum a, struct tnum b) return a.value == b.value; } -int tnum_strn(char *str, size_t size, struct tnum a) -{ - return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); -} -EXPORT_SYMBOL_GPL(tnum_strn); - int tnum_sbin(char *str, size_t size, struct tnum a) { size_t n; diff --git a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c index 99a23dea8233..be95570ab382 100644 --- a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c @@ -411,7 +411,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("direct packet access: test17 (pruning, alignment)") -__failure __msg("misaligned packet access off 2+(0x0; 0x0)+15+-4 size 4") +__failure 
__msg("misaligned packet access off 2+0+15+-4 size 4") __flag(BPF_F_STRICT_ALIGNMENT) __naked void packet_access_test17_pruning_alignment(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_int_ptr.c b/tools/testing/selftests/bpf/progs/verifier_int_ptr.c index b054f9c48143..74d9cad469d9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_int_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_int_ptr.c @@ -67,7 +67,7 @@ __naked void ptr_to_long_half_uninitialized(void) SEC("cgroup/sysctl") __description("ARG_PTR_TO_LONG misaligned") -__failure __msg("misaligned stack access off (0x0; 0x0)+-20+0 size 8") +__failure __msg("misaligned stack access off 0+-20+0 size 8") __naked void arg_ptr_to_long_misaligned(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c b/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c index e0f77e3e7869..417c61cd4b19 100644 --- a/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c @@ -37,7 +37,7 @@ __naked void ptr_to_stack_store_load(void) SEC("socket") __description("PTR_TO_STACK store/load - bad alignment on off") -__failure __msg("misaligned stack access off (0x0; 0x0)+-8+2 size 8") +__failure __msg("misaligned stack access off 0+-8+2 size 8") __failure_unpriv __naked void load_bad_alignment_on_off(void) { @@ -53,7 +53,7 @@ __naked void load_bad_alignment_on_off(void) SEC("socket") __description("PTR_TO_STACK store/load - bad alignment on reg") -__failure __msg("misaligned stack access off (0x0; 0x0)+-10+8 size 8") +__failure __msg("misaligned stack access off 0+-10+8 size 8") __failure_unpriv __naked void load_bad_alignment_on_reg(void) { -- cgit From 5bd90cdc65ef9ef5e13c9ff23620079db5c608a0 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Sun, 3 Dec 2023 20:12:48 -0500 Subject: bpf: Minor logging improvement One place where we were logging a register was only logging the variable part, not also the fixed part. Signed-off-by: Andrei Matei Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20231204011248.2040084-1-andreimatei1@gmail.com --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cd150d6d141..cdb4f5f0ba79 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6585,8 +6585,8 @@ static int check_stack_access_within_bounds( char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n", - err_extra, regno, tn_buf, access_size); + verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n", + err_extra, regno, tn_buf, off, access_size); } } return err; -- cgit From 659aa050a53817157b7459529538598a6449c1d3 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Mon, 13 Nov 2023 14:13:24 -0800 Subject: kernel/resource: Increment by align value in get_free_mem_region() Currently get_free_mem_region() searches for available capacity in increments equal to the region size being requested. This can cause the search to take giant steps through the resource leaving needless gaps and missing available space. Specifically 'cxl create-region' fails with ERANGE even though capacity of the given size and CXL's expected 256M x InterleaveWays alignment can be satisfied. Replace the total-request-size increment with a next alignment increment so that the next possible address is always examined for availability. 
Fixes: 14b80582c43e ("resource: Introduce alloc_free_mem_region()") Reported-by: Dmytro Adamenko Reported-by: Dan Williams Signed-off-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20231113221324.1118092-1-alison.schofield@intel.com Cc: Jason Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 866ef3663a0b..91be1bc50b60 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1844,8 +1844,8 @@ get_free_mem_region(struct device *dev, struct resource *base, write_lock(&resource_lock); for (addr = gfr_start(base, size, align, flags); - gfr_continue(base, addr, size, flags); - addr = gfr_next(addr, size, flags)) { + gfr_continue(base, addr, align, flags); + addr = gfr_next(addr, align, flags)) { if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) != REGION_DISJOINT) continue; -- cgit From 169410eba271afc9f0fb476d996795aa26770c6d Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:19 +0800 Subject: bpf: Check rcu_read_lock_trace_held() before calling bpf map helpers These three bpf_map_{lookup,update,delete}_elem() helpers are also available for sleepable bpf program, so add the corresponding lock assertion for sleepable bpf program, otherwise the following warning will be reported when a sleepable bpf program manipulates bpf map under interpreter mode (aka bpf_jit_enable=0): WARNING: CPU: 3 PID: 4985 at kernel/bpf/helpers.c:40 ...... CPU: 3 PID: 4985 Comm: test_progs Not tainted 6.6.0+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...... RIP: 0010:bpf_map_lookup_elem+0x54/0x60 ...... Call Trace: ? __warn+0xa5/0x240 ? bpf_map_lookup_elem+0x54/0x60 ? report_bug+0x1ba/0x1f0 ? handle_bug+0x40/0x80 ? exc_invalid_op+0x18/0x50 ? asm_exc_invalid_op+0x1b/0x20 ? __pfx_bpf_map_lookup_elem+0x10/0x10 ? rcu_lockdep_current_cpu_online+0x65/0xb0 ? rcu_is_watching+0x23/0x50 ? bpf_map_lookup_elem+0x54/0x60 ? __pfx_bpf_map_lookup_elem+0x10/0x10 ___bpf_prog_run+0x513/0x3b70 __bpf_prog_run32+0x9d/0xd0 ? __bpf_prog_enter_sleepable_recur+0xad/0x120 ? __bpf_prog_enter_sleepable_recur+0x3e/0x120 bpf_trampoline_6442580665+0x4d/0x1000 __x64_sys_getpgid+0x5/0x30 ? do_syscall_64+0x36/0xb0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b45a8381f9bd..ee9bdf29246a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -32,12 +32,13 @@ * * Different map implementations will rely on rcu in map methods * lookup/update/delete, therefore eBPF programs must run under rcu lock - * if program is allowed to access maps, so check rcu_read_lock_held in - * all three functions. + * if program is allowed to access maps, so check rcu_read_lock_held() or + * rcu_read_lock_trace_held() in all three functions. 
*/ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } @@ -53,7 +54,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return map->ops->map_update_elem(map, key, value, flags); } @@ -70,7 +72,8 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return map->ops->map_delete_elem(map, key); } -- cgit From 20c20bd11a0702ce4dc9300c3da58acf551d9725 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:20 +0800 Subject: bpf: Add map and need_defer parameters to .map_fd_put_ptr() map is the pointer to the outer map, and need_defer needs some explanation. need_defer tells the implementation to defer the reference release of the passed element and ensure that the element is still alive before the bpf program, which may manipulate it, exits. The following three cases will invoke map_fd_put_ptr() and different need_defer values will be passed to these callers: 1) release the reference of the old element in the map during map update or map deletion. The release must be deferred, otherwise the bpf program may incur a use-after-free problem, so need_defer needs to be true. 2) release the reference of the to-be-added element in the error path of map update. The to-be-added element is not visible to any bpf program, so it is OK to pass false for the need_defer parameter. 3) release the references of all elements in the map during map release. Any bpf program which has access to the map must have exited and been released, so need_defer=false will be OK. These two parameters will be used by the following patches to fix the potential use-after-free problem for map-in-map. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++++- kernel/bpf/arraymap.c | 12 +++++++----- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/map_in_map.h | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb447b0a9423..d273348cfb2f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -106,7 +106,11 @@ struct bpf_map_ops { /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); - void (*map_fd_put_ptr)(void *ptr); + /* If need_defer is true, the implementation should guarantee that + * the to-be-put element is still alive before the bpf program, which + * may manipulate it, exits.
+ */ + void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2058e89b5ddd..f9aed5909d6e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -867,7 +867,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, } if (old_ptr) - map->ops->map_fd_put_ptr(old_ptr); + map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } @@ -890,7 +890,7 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key) } if (old_ptr) { - map->ops->map_fd_put_ptr(old_ptr); + map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } else { return -ENOENT; @@ -913,8 +913,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, return prog; } -static void prog_fd_array_put_ptr(void *ptr) +static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + /* bpf_prog is freed after one RCU or tasks trace grace period */ bpf_prog_put(ptr); } @@ -1239,8 +1240,9 @@ err_out: return ee; } -static void perf_event_fd_array_put_ptr(void *ptr) +static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + /* bpf_perf_event is freed after one RCU grace period */ bpf_event_entry_free_rcu(ptr); } @@ -1294,7 +1296,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map, return cgroup_get_from_fd(fd); } -static void cgroup_fd_array_put_ptr(void *ptr) +static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* cgroup_put free cgrp after a rcu grace period */ cgroup_put(ptr); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fd8d4b0addfc..5b9146fa825f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -897,7 +897,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) if (map->ops->map_fd_put_ptr) { ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, true); } } @@ -2484,7 +2484,7 @@ static void fd_htab_map_free(struct bpf_map *map) hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { void *ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, false); } } @@ -2525,7 +2525,7 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, ret = htab_map_update_elem(map, key, &ptr, map_flags); if (ret) - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, false); return ret; } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index cd5eafaba97e..2dfeb5835e16 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -127,7 +127,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, return inner_map; } -void bpf_map_fd_put_ptr(void *ptr) +void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* ptr->ops->map_free() has to go through one * rcu grace period by itself. 
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index bcb7534afb3c..7d61602354de 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); void bpf_map_meta_free(struct bpf_map *map_meta); void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); -void bpf_map_fd_put_ptr(void *ptr); +void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer); u32 bpf_map_fd_sys_lookup_elem(void *ptr); #endif -- cgit From 79d93b3c6ffd79abcd8e43345980aa1e904879c4 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:21 +0800 Subject: bpf: Set need_defer as false when clearing fd array during map free The map deletion, map release and map free operations all use fd_array_map_delete_elem() to remove the element from the fd array, and need_defer is always true in fd_array_map_delete_elem(). For the map deletion and map release operations, need_defer=true is necessary, because the bpf program, which accesses the element in the fd array, may still be alive. However, for the map free operation, it is certain that the bpf program which owns the fd array has already exited, so setting need_defer as false is appropriate for the map free operation. So fix it by adding a need_defer parameter to bpf_fd_array_map_clear() and adding a new helper __fd_array_map_delete_elem() to handle the map deletion, map release and map free operations correspondingly. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f9aed5909d6e..4a4a67956e21 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -871,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, return 0; } -static long fd_array_map_delete_elem(struct bpf_map *map, void *key) +static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; @@ -890,13 +890,18 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key) } if (old_ptr) { - map->ops->map_fd_put_ptr(map, old_ptr, true); + map->ops->map_fd_put_ptr(map, old_ptr, need_defer); return 0; } else { return -ENOENT; } } +static long fd_array_map_delete_elem(struct bpf_map *map, void *key) +{ + return __fd_array_map_delete_elem(map, key, true); +} + static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { @@ -925,13 +930,13 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr) } /* decrement refcnt of all bpf_progs that are stored in this map */ -static void bpf_fd_array_map_clear(struct bpf_map *map) +static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; for (i = 0; i < array->map.max_entries; i++) - fd_array_map_delete_elem(map, &i); + __fd_array_map_delete_elem(map, &i, need_defer); } static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, @@ -1110,7 +1115,7 @@ static void prog_array_map_clear_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_array_aux, work)->map; - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, true); bpf_map_put(map); } @@ -1260,7
+1265,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); if (ee && ee->map_file == map_file) - fd_array_map_delete_elem(map, &i); + __fd_array_map_delete_elem(map, &i, true); } rcu_read_unlock(); } @@ -1268,7 +1273,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, static void perf_event_fd_array_map_free(struct bpf_map *map) { if (map->map_flags & BPF_F_PRESERVE_ELEMS) - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } @@ -1304,7 +1309,7 @@ static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_de static void cgroup_fd_array_free(struct bpf_map *map) { - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } @@ -1349,7 +1354,7 @@ static void array_of_map_free(struct bpf_map *map) * is protected by fdget/fdput. */ bpf_map_meta_free(map->inner_map_meta); - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } -- cgit From 876673364161da50eed6b472d746ef88242b2368 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:22 +0800 Subject: bpf: Defer the free of inner map when necessary When updating or deleting an inner map in a map array or map htab, the map may still be accessed by a non-sleepable program or a sleepable program. However, bpf_map_fd_put_ptr() decreases the ref-counter of the inner map directly through bpf_map_put(); if the ref-counter is the last one (which is true in most cases), the inner map will be freed by ops->map_free() in a kworker. But for now, most .map_free() callbacks don't use synchronize_rcu() or its variants to wait for the elapse of an RCU grace period, so after the invocation of ops->map_free completes, the bpf program which is accessing the inner map may incur a use-after-free problem. Fix the free of the inner map by invoking bpf_map_free_deferred() after both one RCU grace period and one tasks trace RCU grace period if the inner map has been removed from the outer map before. The deferment is accomplished by using call_rcu() or call_rcu_tasks_trace() when releasing the last ref-counter of the bpf map. The newly-added rcu_head field in bpf_map shares the same storage space with the work field to reduce the size of bpf_map.
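The grace-period chaining used here is the standard pattern for waiting for two different RCU flavors with a single rcu_head: the tasks-trace callback either frees directly (when a tasks trace GP already implies a normal GP) or re-arms the same head via call_rcu(). A generic sketch of the pattern with illustrative names (the real implementation is in the bpf_map_put() hunk below):

  #include <linux/rcupdate.h>
  #include <linux/slab.h>

  struct my_obj {
          struct rcu_head rcu;
          /* ... payload ... */
  };

  static void my_obj_free(struct rcu_head *rcu)
  {
          kfree(container_of(rcu, struct my_obj, rcu));
  }

  static void my_obj_free_mult_gp(struct rcu_head *rcu)
  {
          /* If a tasks trace GP implies a normal GP, both have elapsed. */
          if (rcu_trace_implies_rcu_gp())
                  my_obj_free(rcu);
          else
                  call_rcu(rcu, my_obj_free); /* chain the second GP */
  }

  static void my_obj_put_deferred(struct my_obj *obj)
  {
          call_rcu_tasks_trace(&obj->rcu, my_obj_free_mult_gp);
  }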
Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.") Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++++++- kernel/bpf/map_in_map.c | 11 ++++++++--- kernel/bpf/syscall.c | 32 +++++++++++++++++++++++++++----- 3 files changed, 41 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d273348cfb2f..de3bd03cbeea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -276,7 +276,11 @@ struct bpf_map { */ atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; - struct work_struct work; + /* rcu is used before freeing and work is only used during freeing */ + union { + struct work_struct work; + struct rcu_head rcu; + }; struct mutex freeze_mutex; atomic64_t writecnt; /* 'Ownership' of program-containing map is claimed by the first program @@ -292,6 +296,7 @@ struct bpf_map { } owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ + bool free_after_mult_rcu_gp; s64 __percpu *elem_count; }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 2dfeb5835e16..3248ff5d8161 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -129,10 +129,15 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { - /* ptr->ops->map_free() has to go through one - * rcu grace period by itself. + struct bpf_map *inner_map = ptr; + + /* The inner map may still be used by both non-sleepable and sleepable + * bpf program, so free it after one RCU grace period and one tasks + * trace RCU grace period. */ - bpf_map_put(ptr); + if (need_defer) + WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + bpf_map_put(inner_map); } u32 bpf_map_fd_sys_lookup_elem(void *ptr) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5e43ddd1b83f..dd515f6b9741 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -719,6 +719,28 @@ static void bpf_map_put_uref(struct bpf_map *map) } } +static void bpf_map_free_in_work(struct bpf_map *map) +{ + INIT_WORK(&map->work, bpf_map_free_deferred); + /* Avoid spawning kworkers, since they all might contend + * for the same mutex like slab_mutex. + */ + queue_work(system_unbound_wq, &map->work); +} + +static void bpf_map_free_rcu_gp(struct rcu_head *rcu) +{ + bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); +} + +static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_map_free_rcu_gp(rcu); + else + call_rcu(rcu, bpf_map_free_rcu_gp); +} + /* decrement map refcnt and schedule it for freeing via workqueue * (underlying map implementation ops->map_free() might sleep) */ @@ -728,11 +750,11 @@ void bpf_map_put(struct bpf_map *map) /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); btf_put(map->btf); - INIT_WORK(&map->work, bpf_map_free_deferred); - /* Avoid spawning kworkers, since they all might contend - * for the same mutex like slab_mutex. 
- */ - queue_work(system_unbound_wq, &map->work); + + if (READ_ONCE(map->free_after_mult_rcu_gp)) + call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); + else + bpf_map_free_in_work(map); } } EXPORT_SYMBOL_GPL(bpf_map_put); -- cgit From af66bfd3c8538ed21cf72af18426fc4a408665cf Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:23 +0800 Subject: bpf: Optimize the free of inner map When removing the inner map from the outer map, the inner map will be freed after one RCU grace period and one RCU tasks trace grace period, so it is certain that the bpf program, which may access the inner map, has exited before the inner map is freed. However, there is no need to wait for one RCU tasks trace grace period if the outer map is only accessed by non-sleepable programs. So add sleepable_refcnt to bpf_map and increase sleepable_refcnt when adding the outer map into env->used_maps for a sleepable program. Although the max number of bpf programs is INT_MAX - 1, the number of bpf programs which are being loaded may be greater than INT_MAX, so use atomic64_t instead of atomic_t for sleepable_refcnt. When removing the inner map from the outer map, use sleepable_refcnt to decide whether or not an RCU tasks trace grace period is needed before freeing the inner map. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/core.c | 4 ++++ kernel/bpf/map_in_map.c | 14 +++++++++----- kernel/bpf/syscall.c | 8 ++++++++ kernel/bpf/verifier.c | 4 +++- 5 files changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index de3bd03cbeea..10e5e4d8a00f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -297,6 +297,8 @@ struct bpf_map { bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ bool free_after_mult_rcu_gp; + bool free_after_rcu_gp; + atomic64_t sleepable_refcnt; s64 __percpu *elem_count; }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cd3afe57ece3..4b813da8d6c0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2664,12 +2664,16 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; + bool sleepable; u32 i; + sleepable = aux->sleepable; for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); + if (sleepable) + atomic64_dec(&map->sleepable_refcnt); bpf_map_put(map); } } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3248ff5d8161..8ef269e66ba5 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -131,12 +131,16 @@ void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { struct bpf_map *inner_map = ptr; - /* The inner map may still be used by both non-sleepable and sleepable - * bpf program, so free it after one RCU grace period and one tasks - * trace RCU grace period. + /* Defer the freeing of inner map according to the sleepable attribute + * of bpf program which owns the outer map, so unnecessary waiting for + * RCU tasks trace grace period can be avoided.
*/ - if (need_defer) - WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + if (need_defer) { + if (atomic64_read(&map->sleepable_refcnt)) + WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + else + WRITE_ONCE(inner_map->free_after_rcu_gp, true); + } bpf_map_put(inner_map); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dd515f6b9741..ebaccf77d56e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -751,8 +751,11 @@ void bpf_map_put(struct bpf_map *map) bpf_map_free_id(map); btf_put(map->btf); + WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); if (READ_ONCE(map->free_after_mult_rcu_gp)) call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); + else if (READ_ONCE(map->free_after_rcu_gp)) + call_rcu(&map->rcu, bpf_map_free_rcu_gp); else bpf_map_free_in_work(map); } @@ -5345,6 +5348,11 @@ static int bpf_prog_bind_map(union bpf_attr *attr) goto out_unlock; } + /* The bpf program will not access the bpf map, but for the sake of + * simplicity, increase sleepable_refcnt for sleepable program as well. + */ + if (prog->aux->sleepable) + atomic64_inc(&map->sleepable_refcnt); memcpy(used_maps_new, used_maps_old, sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); used_maps_new[prog->aux->used_map_cnt] = map; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cdb4f5f0ba79..1ed39665f802 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17889,10 +17889,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -E2BIG; } + if (env->prog->aux->sleepable) + atomic64_inc(&map->sleepable_refcnt); /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded - * and all maps are released in free_used_maps() + * and all maps are released in bpf_free_used_maps() */ bpf_map_inc(map); -- cgit From 70da1d01edf6da3fde1df98b2125a77083a0fb82 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 16:36:55 +0200 Subject: cpu/hotplug: remove CPUHP_SLAB_PREPARE hooks The CPUHP_SLAB_PREPARE hooks were only used by SLAB, which has been removed. SLUB defines them as NULL, so we can remove those altogether.
Acked-by: Thomas Gleixner Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/cpuhotplug.h | 1 - include/linux/slab.h | 8 -------- kernel/cpu.c | 5 ----- 3 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d305db70674b..07cb8f7030b6 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -108,7 +108,6 @@ enum cpuhp_state { CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, - CPUHP_SLAB_PREPARE, CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, CPUHP_CPUIDLE_COUPLED_PREPARE, diff --git a/include/linux/slab.h b/include/linux/slab.h index d6d6ffeeb9a2..34e43cddc520 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -788,12 +788,4 @@ size_t kmalloc_size_roundup(size_t size); void __init kmem_cache_init_late(void); -#if defined(CONFIG_SMP) && defined(CONFIG_SLAB) -int slab_prepare_cpu(unsigned int cpu); -int slab_dead_cpu(unsigned int cpu); -#else -#define slab_prepare_cpu NULL -#define slab_dead_cpu NULL -#endif - #endif /* _LINUX_SLAB_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 9e4c6780adde..530b026d95a1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2125,11 +2125,6 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = relay_prepare_cpu, .teardown.single = NULL, }, - [CPUHP_SLAB_PREPARE] = { - .name = "slab:prepare", - .startup.single = slab_prepare_cpu, - .teardown.single = slab_dead_cpu, - }, [CPUHP_RCUTREE_PREP] = { .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, -- cgit From 6ac805d13870925c787a28e3fe5cc73610cacd03 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 4 Dec 2023 10:47:49 -0700 Subject: iov_iter: remove unused 'iov' argument from import_single_range() It is entirely unused, just get rid of it. 
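For illustration, a sketch of the conversion at a typical call site (the 'ubuf' and 'len' variables are hypothetical; the real conversions are in the diff below). The on-stack iovec was only ever written by import_single_range(), which builds the iterator via iov_iter_ubuf(), so the caller-side scratch storage can simply go away:

  struct iov_iter iter;
  int ret;

  /* before: caller supplies an iovec it never reads back */
  struct iovec iov;
  ret = import_single_range(ITER_DEST, ubuf, len, &iov, &iter);

  /* after: same behavior, no scratch iovec */
  ret = import_single_range(ITER_DEST, ubuf, len, &iter);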
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20231204174827.1258875-2-axboe@kernel.dk Signed-off-by: Christian Brauner --- drivers/block/ublk_drv.c | 6 ++---- drivers/char/random.c | 6 ++---- fs/aio.c | 2 +- include/linux/uio.h | 2 +- kernel/trace/trace_events_user.c | 3 +-- lib/iov_iter.c | 2 +- net/ipv4/tcp.c | 6 ++---- net/socket.c | 6 ++---- security/keys/keyctl.c | 3 +-- 9 files changed, 13 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 83600b45e12a..5656b0a1233d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -893,11 +893,10 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, */ if (ublk_need_map_req(req)) { struct iov_iter iter; - struct iovec iov; const int dir = ITER_DEST; import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes, - &iov, &iter); + &iter); return ublk_copy_user_pages(req, 0, &iter, dir); } @@ -915,13 +914,12 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, if (ublk_need_unmap_req(req)) { struct iov_iter iter; - struct iovec iov; const int dir = ITER_SOURCE; WARN_ON_ONCE(io->res > rq_bytes); import_single_range(dir, u64_to_user_ptr(io->addr), io->res, - &iov, &iter); + &iter); return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; diff --git a/drivers/char/random.c b/drivers/char/random.c index 4a9c79391dee..e79ae238b30d 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1364,7 +1364,6 @@ static void __cold try_to_generate_entropy(void) SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags) { struct iov_iter iter; - struct iovec iov; int ret; if (flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE)) @@ -1385,7 +1384,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags return ret; } - ret = import_single_range(ITER_DEST, ubuf, len, &iov, &iter); + ret = import_single_range(ITER_DEST, ubuf, len, &iter); if (unlikely(ret)) return ret; return get_random_bytes_user(&iter); @@ -1491,7 +1490,6 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) return 0; case RNDADDENTROPY: { struct iov_iter iter; - struct iovec iov; ssize_t ret; int len; @@ -1503,7 +1501,7 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) return -EINVAL; if (get_user(len, p++)) return -EFAULT; - ret = import_single_range(ITER_SOURCE, p, len, &iov, &iter); + ret = import_single_range(ITER_SOURCE, p, len, &iter); if (unlikely(ret)) return ret; ret = write_pool_user(&iter); diff --git a/fs/aio.c b/fs/aio.c index f8589caef9c1..251eeaef7fbf 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1498,7 +1498,7 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, size_t len = iocb->aio_nbytes; if (!vectored) { - ssize_t ret = import_single_range(rw, buf, len, *iovec, iter); + ssize_t ret = import_single_range(rw, buf, len, iter); *iovec = NULL; return ret; } diff --git a/include/linux/uio.h b/include/linux/uio.h index b6214cbf2a43..bfafd3542fa7 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -348,7 +348,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); int import_single_range(int type, void __user *buf, size_t len, - struct iovec *iov, struct iov_iter *i); + struct iov_iter *i); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void 
iov_iter_ubuf(struct iov_iter *i, unsigned int direction, diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 9365ce407426..4efc75d90a0d 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -2177,14 +2177,13 @@ static int user_events_open(struct inode *node, struct file *file) static ssize_t user_events_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) { - struct iovec iov; struct iov_iter i; if (unlikely(*ppos != 0)) return -EFAULT; if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf, - count, &iov, &i))) + count, &i))) return -EFAULT; return user_events_write_core(file, &i); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index de7d11cf4c63..d60c73354e1f 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1370,7 +1370,7 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec, EXPORT_SYMBOL(import_iovec); int import_single_range(int rw, void __user *buf, size_t len, - struct iovec *iov, struct iov_iter *i) + struct iov_iter *i) { if (len > MAX_RW_COUNT) len = MAX_RW_COUNT; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 53bcc17c91e4..57cf3adb191a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1849,7 +1849,6 @@ static int receive_fallback_to_copy(struct sock *sk, { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; - struct iovec iov; int err; zc->length = 0; @@ -1859,7 +1858,7 @@ static int receive_fallback_to_copy(struct sock *sk, return -EINVAL; err = import_single_range(ITER_DEST, (void __user *)copy_address, - inq, &iov, &msg.msg_iter); + inq, &msg.msg_iter); if (err) return err; @@ -1886,14 +1885,13 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; - struct iovec iov; int err; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_single_range(ITER_DEST, (void __user *)copy_address, - copylen, &iov, &msg.msg_iter); + copylen, &msg.msg_iter); if (err) return err; err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); diff --git a/net/socket.c b/net/socket.c index 3379c64217a4..1f0d0e8d0a50 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2161,10 +2161,9 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr_storage address; int err; struct msghdr msg; - struct iovec iov; int fput_needed; - err = import_single_range(ITER_SOURCE, buff, len, &iov, &msg.msg_iter); + err = import_single_range(ITER_SOURCE, buff, len, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); @@ -2226,11 +2225,10 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, .msg_name = addr ? 
(struct sockaddr *)&address : NULL, }; struct socket *sock; - struct iovec iov; int err, err2; int fput_needed; - err = import_single_range(ITER_DEST, ubuf, size, &iov, &msg.msg_iter); + err = import_single_range(ITER_DEST, ubuf, size, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 19be69fa4d05..193df7ca3ca8 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1252,12 +1252,11 @@ long keyctl_instantiate_key(key_serial_t id, key_serial_t ringid) { if (_payload && plen) { - struct iovec iov; struct iov_iter from; int ret; ret = import_single_range(ITER_SOURCE, (void __user *)_payload, plen, - &iov, &from); + &from); if (unlikely(ret)) return ret; -- cgit From 9fd7874c0e5c89d7da0b4442271696ec0f8edcba Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 4 Dec 2023 10:47:50 -0700 Subject: iov_iter: replace import_single_range() with import_ubuf() With the removal of the 'iov' argument to import_single_range(), the two functions are now fully identical. Convert the import_single_range() callers to import_ubuf(), and remove the former fully. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20231204174827.1258875-3-axboe@kernel.dk Signed-off-by: Christian Brauner --- drivers/block/ublk_drv.c | 7 ++----- drivers/char/random.c | 4 ++-- fs/aio.c | 2 +- include/linux/uio.h | 2 -- kernel/trace/trace_events_user.c | 3 +-- lib/iov_iter.c | 13 ------------- net/ipv4/tcp.c | 8 ++++---- net/socket.c | 4 ++-- security/keys/keyctl.c | 4 ++-- 9 files changed, 14 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 5656b0a1233d..3eaf02ebeebe 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -895,9 +895,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, struct iov_iter iter; const int dir = ITER_DEST; - import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes, - &iter); - + import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter); return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; @@ -918,8 +916,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, WARN_ON_ONCE(io->res > rq_bytes); - import_single_range(dir, u64_to_user_ptr(io->addr), io->res, - &iter); + import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter); return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; diff --git a/drivers/char/random.c b/drivers/char/random.c index e79ae238b30d..456be28ba67c 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1384,7 +1384,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags return ret; } - ret = import_single_range(ITER_DEST, ubuf, len, &iter); + ret = import_ubuf(ITER_DEST, ubuf, len, &iter); if (unlikely(ret)) return ret; return get_random_bytes_user(&iter); @@ -1501,7 +1501,7 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) return -EINVAL; if (get_user(len, p++)) return -EFAULT; - ret = import_single_range(ITER_SOURCE, p, len, &iter); + ret = import_ubuf(ITER_SOURCE, p, len, &iter); if (unlikely(ret)) return ret; ret = write_pool_user(&iter); diff --git a/fs/aio.c b/fs/aio.c index 251eeaef7fbf..4ea639509d41 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1498,7 +1498,7 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, size_t len = iocb->aio_nbytes; if (!vectored) { - ssize_t ret = import_single_range(rw, buf, len, 
iter); + ssize_t ret = import_ubuf(rw, buf, len, iter); *iovec = NULL; return ret; } diff --git a/include/linux/uio.h b/include/linux/uio.h index bfafd3542fa7..bea9c89922d9 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -347,8 +347,6 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec, ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); -int import_single_range(int type, void __user *buf, size_t len, - struct iov_iter *i); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 4efc75d90a0d..e76f5e1efdf2 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -2182,8 +2182,7 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf, if (unlikely(*ppos != 0)) return -EFAULT; - if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf, - count, &i))) + if (unlikely(import_ubuf(ITER_SOURCE, (char __user *)ubuf, count, &i))) return -EFAULT; return user_events_write_core(file, &i); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index d60c73354e1f..009875bc95aa 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1369,19 +1369,6 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec, } EXPORT_SYMBOL(import_iovec); -int import_single_range(int rw, void __user *buf, size_t len, - struct iov_iter *i) -{ - if (len > MAX_RW_COUNT) - len = MAX_RW_COUNT; - if (unlikely(!access_ok(buf, len))) - return -EFAULT; - - iov_iter_ubuf(i, rw, buf, len); - return 0; -} -EXPORT_SYMBOL(import_single_range); - int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { if (len > MAX_RW_COUNT) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 57cf3adb191a..54d3c762d400 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1857,8 +1857,8 @@ static int receive_fallback_to_copy(struct sock *sk, if (copy_address != zc->copybuf_address) return -EINVAL; - err = import_single_range(ITER_DEST, (void __user *)copy_address, - inq, &msg.msg_iter); + err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq, + &msg.msg_iter); if (err) return err; @@ -1890,8 +1890,8 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, if (copy_address != zc->copybuf_address) return -EINVAL; - err = import_single_range(ITER_DEST, (void __user *)copy_address, - copylen, &msg.msg_iter); + err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen, + &msg.msg_iter); if (err) return err; err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); diff --git a/net/socket.c b/net/socket.c index 1f0d0e8d0a50..5d49ae0c1b79 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2163,7 +2163,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct msghdr msg; int fput_needed; - err = import_single_range(ITER_SOURCE, buff, len, &msg.msg_iter); + err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); @@ -2228,7 +2228,7 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, int err, err2; int fput_needed; - err = import_single_range(ITER_DEST, ubuf, size, &msg.msg_iter); + err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, 
&fput_needed); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 193df7ca3ca8..10ba439968f7 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1255,8 +1255,8 @@ long keyctl_instantiate_key(key_serial_t id, struct iov_iter from; int ret; - ret = import_single_range(ITER_SOURCE, (void __user *)_payload, plen, - &from); + ret = import_ubuf(ITER_SOURCE, (void __user *)_payload, plen, + &from); if (unlikely(ret)) return ret; -- cgit From 41f6f64e6999a837048b1bd13a2f8742964eca6b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:39 -0800 Subject: bpf: support non-r10 register spill/fill to/from stack in precision tracking Use instruction (jump) history to record instructions that performed register spill/fill to/from stack, regardless of whether this was done through the read-only r10 register or any other register after copying r10 into it *and* potentially adjusting the offset. To make this work reliably, we push extra per-instruction flags into the instruction history, encoding the stack slot index (spi) and stack frame number in 10 extra flag bits that we take away from prev_idx in the instruction history. We don't touch the idx field for maximum performance, as it's checked most frequently during backtracking. This change removes basically the last remaining practical limitation of the precision backtracking logic in the BPF verifier. It fixes known deficiencies, but also opens up new opportunities to reduce the number of verified states, explored in the subsequent patches. There are only three differences in selftests' BPF object files according to veristat, all in the positive direction (fewer states). File Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) -------------------------------------- ------------- --------- --------- ------------- ---------- ---------- ------------- test_cls_redirect_dynptr.bpf.linked3.o cls_redirect 2987 2864 -123 (-4.12%) 240 231 -9 (-3.75%) xdp_synproxy_kern.bpf.linked3.o syncookie_tc 82848 82661 -187 (-0.23%) 5107 5073 -34 (-0.67%) xdp_synproxy_kern.bpf.linked3.o syncookie_xdp 85116 84964 -152 (-0.18%) 5162 5130 -32 (-0.62%) Note, I avoided renaming jmp_history to the more generic insn_hist to minimize the number of lines changed and potential merge conflicts between the bpf and bpf-next trees. Notice also the cur_hist_entry pointer reset to NULL at the beginning of the instruction verification loop. This pointer avoids the problem of relying on the last jump history entry's insn_idx to determine whether we already have an entry for the current instruction or not. It can happen that we added a jump history entry because the current instruction is_jmp_point(), but we also need to add instruction flags for stack access. In this case, we don't want two entries, so we need to reuse the last added entry, if it is present. Relying on insn_idx comparison has the same ambiguity problem as the one that was fixed recently in [0], so we avoid that.
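For reference, the 10-bit flag layout can be sanity-checked with a standalone sketch (the constants are the ones this patch adds to bpf_verifier.h; the pack/unpack logic mirrors the insn_stack_access_*() helpers in the diff below):

  #include <assert.h>

  #define INSN_F_FRAMENO_MASK 0x7      /* 3 bits: frame number (MAX_CALL_FRAMES is 8) */
  #define INSN_F_SPI_MASK     0x3f     /* 6 bits: stack slot index, 0..63 */
  #define INSN_F_SPI_SHIFT    3
  #define INSN_F_STACK_ACCESS (1 << 9) /* 10th bit: insn did a stack spill/fill */

  static int pack_flags(int frameno, int spi)
  {
          return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
  }

  int main(void)
  {
          int flags = pack_flags(2, 63); /* frame 2, highest stack slot */

          assert(flags & INSN_F_STACK_ACCESS);
          assert(((flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK) == 63);
          assert((flags & INSN_F_FRAMENO_MASK) == 2);
          return 0;
  }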
[0] https://patchwork.kernel.org/project/netdevbpf/patch/20231110002638.4168352-3-andrii@kernel.org/ Acked-by: Eduard Zingerman Reported-by: Tao Lyu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 31 +++- kernel/bpf/verifier.c | 175 ++++++++++++--------- .../bpf/progs/verifier_subprog_precision.c | 23 ++- tools/testing/selftests/bpf/verifier/precise.c | 38 +++-- 4 files changed, 169 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3378cc753061..bada59812e00 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -325,12 +325,34 @@ struct bpf_func_state { int allocated_stack; }; -struct bpf_idx_pair { - u32 prev_idx; +#define MAX_CALL_FRAMES 8 + +/* instruction history flags, used in bpf_jmp_history_entry.flags field */ +enum { + /* instruction references stack slot through PTR_TO_STACK register; + * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) + * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512, + * 8 bytes per slot, so slot index (spi) is [0, 63]) + */ + INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */ + + INSN_F_SPI_MASK = 0x3f, /* 6 bits */ + INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ + + INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */ +}; + +static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); +static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); + +struct bpf_jmp_history_entry { u32 idx; + /* insn idx can't be bigger than 1 million */ + u32 prev_idx : 22; + /* special flags, e.g., whether insn is doing register stack spill/load */ + u32 flags : 10; }; -#define MAX_CALL_FRAMES 8 /* Maximum number of register states that can exist at once */ #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { @@ -413,7 +435,7 @@ struct bpf_verifier_state { * For most states jmp_history_cnt is [0-3]. * For loops can go up to ~40. 
*/ - struct bpf_idx_pair *jmp_history; + struct bpf_jmp_history_entry *jmp_history; u32 jmp_history_cnt; u32 dfs_depth; u32 callback_unroll_depth; @@ -656,6 +678,7 @@ struct bpf_verifier_env { int cur_stack; } cfg; struct backtrack_state bt; + struct bpf_jmp_history_entry *cur_hist_ent; u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1ed39665f802..9bc16dc66465 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1355,8 +1355,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, int i, err; dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, - src->jmp_history_cnt, sizeof(struct bpf_idx_pair), - GFP_USER); + src->jmp_history_cnt, sizeof(*dst_state->jmp_history), + GFP_USER); if (!dst_state->jmp_history) return -ENOMEM; dst_state->jmp_history_cnt = src->jmp_history_cnt; @@ -3221,6 +3221,21 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return __check_reg_arg(env, state->regs, regno, t); } +static int insn_stack_access_flags(int frameno, int spi) +{ + return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno; +} + +static int insn_stack_access_spi(int insn_flags) +{ + return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK; +} + +static int insn_stack_access_frameno(int insn_flags) +{ + return insn_flags & INSN_F_FRAMENO_MASK; +} + static void mark_jmp_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].jmp_point = true; @@ -3232,28 +3247,51 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) } /* for any branch, call, exit record the history of jmps in the given state */ -static int push_jmp_history(struct bpf_verifier_env *env, - struct bpf_verifier_state *cur) +static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags) { u32 cnt = cur->jmp_history_cnt; - struct bpf_idx_pair *p; + struct bpf_jmp_history_entry *p; size_t alloc_size; - if (!is_jmp_point(env, env->insn_idx)) + /* combine instruction flags if we already recorded this instruction */ + if (env->cur_hist_ent) { + /* atomic instructions push insn_flags twice, for READ and + * WRITE sides, but they should agree on stack slot + */ + WARN_ONCE((env->cur_hist_ent->flags & insn_flags) && + (env->cur_hist_ent->flags & insn_flags) != insn_flags, + "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n", + env->insn_idx, env->cur_hist_ent->flags, insn_flags); + env->cur_hist_ent->flags |= insn_flags; return 0; + } cnt++; alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); p = krealloc(cur->jmp_history, alloc_size, GFP_USER); if (!p) return -ENOMEM; - p[cnt - 1].idx = env->insn_idx; - p[cnt - 1].prev_idx = env->prev_insn_idx; cur->jmp_history = p; + + p = &cur->jmp_history[cnt - 1]; + p->idx = env->insn_idx; + p->prev_idx = env->prev_insn_idx; + p->flags = insn_flags; cur->jmp_history_cnt = cnt; + env->cur_hist_ent = p; + return 0; } +static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, + u32 hist_end, int insn_idx) +{ + if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) + return &st->jmp_history[hist_end - 1]; + return NULL; +} + /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. * Return -ENOENT if we exhausted all instructions within given state. 
@@ -3415,9 +3453,14 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) return bt->reg_masks[bt->frame] & (1 << reg); } +static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) +{ + return bt->stack_masks[frame] & (1ull << slot); +} + static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot) { - return bt->stack_masks[bt->frame] & (1ull << slot); + return bt_is_frame_slot_set(bt, bt->frame, slot); } /* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */ @@ -3471,7 +3514,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); * - *was* processed previously during backtracking. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct backtrack_state *bt) + struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) { const struct bpf_insn_cbs cbs = { .cb_call = disasm_kfunc_name, @@ -3484,7 +3527,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, u8 mode = BPF_MODE(insn->code); u32 dreg = insn->dst_reg; u32 sreg = insn->src_reg; - u32 spi, i; + u32 spi, i, fr; if (insn->code == 0) return 0; @@ -3545,20 +3588,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * by 'precise' mark in corresponding register of this state. * No further tracking necessary. */ - if (insn->src_reg != BPF_REG_FP) + if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; - /* dreg = *(u64 *)[fp - off] was a fill from the stack. * that [fp - off] slot contains scalar that needs to be * tracked with precision */ - spi = (-insn->off - 1) / BPF_REG_SIZE; - if (spi >= 64) { - verbose(env, "BUG spi %d\n", spi); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - bt_set_slot(bt, spi); + spi = insn_stack_access_spi(hist->flags); + fr = insn_stack_access_frameno(hist->flags); + bt_set_frame_slot(bt, fr, spi); } else if (class == BPF_STX || class == BPF_ST) { if (bt_is_reg_set(bt, dreg)) /* stx & st shouldn't be using _scalar_ dst_reg @@ -3567,17 +3605,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, */ return -ENOTSUPP; /* scalars can only be spilled into stack */ - if (insn->dst_reg != BPF_REG_FP) + if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; - spi = (-insn->off - 1) / BPF_REG_SIZE; - if (spi >= 64) { - verbose(env, "BUG spi %d\n", spi); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - if (!bt_is_slot_set(bt, spi)) + spi = insn_stack_access_spi(hist->flags); + fr = insn_stack_access_frameno(hist->flags); + if (!bt_is_frame_slot_set(bt, fr, spi)) return 0; - bt_clear_slot(bt, spi); + bt_clear_frame_slot(bt, fr, spi); if (class == BPF_STX) bt_set_reg(bt, sreg); } else if (class == BPF_JMP || class == BPF_JMP32) { @@ -3621,10 +3655,14 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } - /* we don't track register spills perfectly, - * so fallback to force-precise instead of failing */ - if (bt_stack_mask(bt) != 0) - return -ENOTSUPP; + /* we are now tracking register spills correctly, + * so any instance of leftover slots is a bug + */ + if (bt_stack_mask(bt) != 0) { + verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)"); + return -EFAULT; + } /* propagate r1-r5 to the caller */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) { if (bt_is_reg_set(bt, i)) { @@ 
-3649,8 +3687,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } - if (bt_stack_mask(bt) != 0) - return -ENOTSUPP; + if (bt_stack_mask(bt) != 0) { + verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)"); + return -EFAULT; + } /* clear r1-r5 in callback subprog's mask */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); @@ -4087,6 +4128,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for (;;) { DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; + struct bpf_jmp_history_entry *hist; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", @@ -4150,7 +4192,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = 0; skip_first = false; } else { - err = backtrack_insn(env, i, subseq_idx, bt); + hist = get_jmp_hist_entry(st, history, i); + err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { mark_all_scalars_precise(env, env->cur_state); @@ -4203,22 +4246,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); for_each_set_bit(i, mask, 64) { if (i >= func->allocated_stack / BPF_REG_SIZE) { - /* the sequence of instructions: - * 2: (bf) r3 = r10 - * 3: (7b) *(u64 *)(r3 -8) = r0 - * 4: (79) r4 = *(u64 *)(r10 -8) - * doesn't contain jmps. It's backtracked - * as a single block. - * During backtracking insn 3 is not recognized as - * stack access, so at the end of backtracking - * stack slot fp-8 is still marked in stack_mask. - * However the parent state may not have accessed - * fp-8 and it's "unallocated" stack space. - * In such case fallback to conservative. - */ - mark_all_scalars_precise(env, env->cur_state); - bt_reset(bt); - return 0; + verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n", + i, func->allocated_stack / BPF_REG_SIZE); + WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)"); + return -EFAULT; } if (!is_spilled_scalar_reg(&func->stack[i])) { @@ -4391,7 +4422,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; struct bpf_reg_state *reg = NULL; - u32 dst_reg = insn->dst_reg; + int insn_flags = insn_stack_access_flags(state->frameno, spi); err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE)); if (err) @@ -4432,17 +4463,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { - if (dst_reg != BPF_REG_FP) { - /* The backtracking logic can only recognize explicit - * stack slot address like [fp - 8]. Other spill of - * scalar via different register has to be conservative. - * Backtrack from here and mark all registers as precise - * that contributed into 'reg' being a constant. - */ - err = mark_chain_precision(env, value_regno); - if (err) - return err; - } save_register_state(state, spi, reg, size); /* Break the relation on a narrowing spill. 
*/ if (fls64(reg->umax_value) > BITS_PER_BYTE * size) @@ -4454,6 +4474,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, __mark_reg_known(&fake_reg, insn->imm); fake_reg.type = SCALAR_VALUE; save_register_state(state, spi, &fake_reg, size); + insn_flags = 0; /* not a register spill */ } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -4499,9 +4520,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, /* Mark slots affected by this stack write. */ for (i = 0; i < size; i++) - state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = - type; + state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type; + insn_flags = 0; /* not a register spill */ } + + if (insn_flags) + return push_jmp_history(env, env->cur_state, insn_flags); return 0; } @@ -4694,6 +4718,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; struct bpf_reg_state *reg; u8 *stype, type; + int insn_flags = insn_stack_access_flags(reg_state->frameno, spi); stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; @@ -4739,12 +4764,10 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, return -EACCES; } mark_reg_unknown(env, state->regs, dst_regno); + insn_flags = 0; /* not restoring original register state */ } state->regs[dst_regno].live |= REG_LIVE_WRITTEN; - return 0; - } - - if (dst_regno >= 0) { + } else if (dst_regno >= 0) { /* restore register state from stack */ copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely @@ -4780,7 +4803,10 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno >= 0) mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); + insn_flags = 0; /* we are not restoring spilled register */ } + if (insn_flags) + return push_jmp_history(env, env->cur_state, insn_flags); return 0; } @@ -6940,7 +6966,6 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; - return 0; } @@ -16910,7 +16935,8 @@ hit: * the precision needs to be propagated back in * the current state. */ - err = err ? : push_jmp_history(env, cur); + if (is_jmp_point(env, env->insn_idx)) + err = err ? : push_jmp_history(env, cur, 0); err = err ? 
: propagate_precision(env, &sl->state); if (err) return err; @@ -17135,6 +17161,9 @@ static int do_check(struct bpf_verifier_env *env) u8 class; int err; + /* reset current history entry on each new instruction */ + env->cur_hist_ent = NULL; + env->prev_insn_idx = prev_insn_idx; if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", @@ -17174,7 +17203,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_jmp_history(env, state); + err = push_jmp_history(env, state, 0); if (err) return err; } diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 0dfe3f8b69ac..eba98fab2f54 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -589,11 +589,24 @@ static __u64 subprog_spill_reg_precise(void) SEC("?raw_tp") __success __log_level(2) -/* precision backtracking can't currently handle stack access not through r10, - * so we won't be able to mark stack slot fp-8 as precise, and so will - * fallback to forcing all as precise - */ -__msg("mark_precise: frame0: falling back to forcing all scalars precise") +__msg("10: (0f) r1 += r7") +__msg("mark_precise: frame0: last_idx 10 first_idx 7 subseq_idx -1") +__msg("mark_precise: frame0: regs=r7 stack= before 9: (bf) r1 = r8") +__msg("mark_precise: frame0: regs=r7 stack= before 8: (27) r7 *= 4") +__msg("mark_precise: frame0: regs=r7 stack= before 7: (79) r7 = *(u64 *)(r10 -8)") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=2 R6_w=1 R8_rw=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8_rw=P1") +__msg("mark_precise: frame0: last_idx 18 first_idx 0 subseq_idx 7") +__msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit") +__msg("mark_precise: frame1: regs= stack= before 17: (0f) r0 += r2") +__msg("mark_precise: frame1: regs= stack= before 16: (79) r2 = *(u64 *)(r1 +0)") +__msg("mark_precise: frame1: regs= stack= before 15: (79) r0 = *(u64 *)(r10 -16)") +__msg("mark_precise: frame1: regs= stack= before 14: (7b) *(u64 *)(r10 -16) = r2") +__msg("mark_precise: frame1: regs= stack= before 13: (7b) *(u64 *)(r1 +0) = r2") +__msg("mark_precise: frame1: regs=r2 stack= before 6: (85) call pc+6") +__msg("mark_precise: frame0: regs=r2 stack= before 5: (bf) r2 = r6") +__msg("mark_precise: frame0: regs=r6 stack= before 4: (07) r1 += -8") +__msg("mark_precise: frame0: regs=r6 stack= before 3: (bf) r1 = r10") +__msg("mark_precise: frame0: regs=r6 stack= before 2: (b7) r6 = 1") __naked int subprog_spill_into_parent_stack_slot_precise(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 0d84dd1f38b6..8a2ff81d8350 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -140,10 +140,11 @@ .result = REJECT, }, { - "precise: ST insn causing spi > allocated_stack", + "precise: ST zero to stack insn is supported", .insns = { BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 123, 0), + /* not a register spill, so we stop precision propagation for R4 here */ BPF_ST_MEM(BPF_DW, BPF_REG_3, -8, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), BPF_MOV64_IMM(BPF_REG_0, -1), @@ -157,11 +158,11 @@ mark_precise: frame0: last_idx 4 first_idx 2\ mark_precise: frame0: regs=r4 stack= before 4\ mark_precise: frame0: regs=r4 stack= before 3\ 
- mark_precise: frame0: regs= stack=-8 before 2\ - mark_precise: frame0: falling back to forcing all scalars precise\ - force_precise: frame0: forcing r0 to be precise\ mark_precise: frame0: last_idx 5 first_idx 5\ - mark_precise: frame0: parent state regs= stack=:", + mark_precise: frame0: parent state regs=r0 stack=:\ + mark_precise: frame0: last_idx 4 first_idx 2\ + mark_precise: frame0: regs=r0 stack= before 4\ + 5: R0=-1 R4=0", .result = VERBOSE_ACCEPT, .retval = -1, }, @@ -169,6 +170,8 @@ "precise: STX insn causing spi > allocated_stack", .insns = { BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), + /* make later reg spill more interesting by having somewhat known scalar */ + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xff), BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 123, 0), BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, -8), @@ -179,18 +182,21 @@ }, .prog_type = BPF_PROG_TYPE_XDP, .flags = BPF_F_TEST_STATE_FREQ, - .errstr = "mark_precise: frame0: last_idx 6 first_idx 6\ + .errstr = "mark_precise: frame0: last_idx 7 first_idx 7\ mark_precise: frame0: parent state regs=r4 stack=:\ - mark_precise: frame0: last_idx 5 first_idx 3\ - mark_precise: frame0: regs=r4 stack= before 5\ - mark_precise: frame0: regs=r4 stack= before 4\ - mark_precise: frame0: regs= stack=-8 before 3\ - mark_precise: frame0: falling back to forcing all scalars precise\ - force_precise: frame0: forcing r0 to be precise\ - force_precise: frame0: forcing r0 to be precise\ - force_precise: frame0: forcing r0 to be precise\ - force_precise: frame0: forcing r0 to be precise\ - mark_precise: frame0: last_idx 6 first_idx 6\ + mark_precise: frame0: last_idx 6 first_idx 4\ + mark_precise: frame0: regs=r4 stack= before 6: (b7) r0 = -1\ + mark_precise: frame0: regs=r4 stack= before 5: (79) r4 = *(u64 *)(r10 -8)\ + mark_precise: frame0: regs= stack=-8 before 4: (7b) *(u64 *)(r3 -8) = r0\ + mark_precise: frame0: parent state regs=r0 stack=:\ + mark_precise: frame0: last_idx 3 first_idx 3\ + mark_precise: frame0: regs=r0 stack= before 3: (55) if r3 != 0x7b goto pc+0\ + mark_precise: frame0: regs=r0 stack= before 2: (bf) r3 = r10\ + mark_precise: frame0: regs=r0 stack= before 1: (57) r0 &= 255\ + mark_precise: frame0: parent state regs=r0 stack=:\ + mark_precise: frame0: last_idx 0 first_idx 0\ + mark_precise: frame0: regs=r0 stack= before 0: (85) call bpf_get_prandom_u32#7\ + mark_precise: frame0: last_idx 7 first_idx 7\ mark_precise: frame0: parent state regs= stack=:", .result = VERBOSE_ACCEPT, .retval = -1, -- cgit From ab125ed3ec1c10ccc36bc98c7a4256ad114a3dae Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:41 -0800 Subject: bpf: fix check for attempt to corrupt spilled pointer When a register is spilled onto the stack as a 1/2/4-byte value, we set slot_type[BPF_REG_SIZE - 1] (plus potentially a few more below it, depending on the actual spill size). So to check whether some stack slot holds a spilled register, we need to consult slot_type[7], not slot_type[0]. To avoid the need to remember and double-check this in the future, just use the is_spilled_reg() helper.
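A standalone toy model of the bug (not kernel code; the two loops mimic the marker-writing logic in save_register_state()): after a 4-byte spill, bytes 7..4 of the slot carry STACK_SPILL while bytes 3..0 are scrubbed to STACK_MISC, so a check of slot_type[0] misses the spill:

  #include <stdio.h>

  #define BPF_REG_SIZE 8
  enum { STACK_MISC = 'm', STACK_SPILL = 'r' };

  int main(void)
  {
          char slot_type[BPF_REG_SIZE];
          int size = 4, i;

          for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
                  slot_type[i - 1] = STACK_SPILL; /* bytes 7..4 */
          for (; i; i--)
                  slot_type[i - 1] = STACK_MISC;  /* bytes 3..0 */

          /* prints slot_type[0]=m slot_type[7]=r */
          printf("slot_type[0]=%c slot_type[7]=%c\n",
                 slot_type[0], slot_type[BPF_REG_SIZE - 1]);
          return 0;
  }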
Fixes: 27113c59b6d0 ("bpf: Check the other end of slot_type for STACK_SPILL") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9bc16dc66465..3edca06de9fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4431,7 +4431,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, * so it's aligned access and [off, off + size) are within stack limits */ if (!env->allow_ptr_leaks && - state->stack[spi].slot_type[0] == STACK_SPILL && + is_spilled_reg(&state->stack[spi]) && size != BPF_REG_SIZE) { verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; -- cgit From eaf18febd6ebc381aeb61543705148b3e28c7c47 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:42 -0800 Subject: bpf: preserve STACK_ZERO slots on partial reg spills Instead of always forcing STACK_ZERO slots to STACK_MISC, preserve them in situations where this is possible. E.g., when spilling a register as 1/2/4-byte subslots on the stack, all the remaining bytes in the stack slot do not automatically become unknown. If we knew they contained zeroes, we can preserve those STACK_ZERO markers. Add a helper mark_stack_slot_misc(), similar to scrub_spilled_slot(), but that doesn't overwrite either STACK_INVALID or STACK_ZERO. Note that we need to take into account the possibility of being in unprivileged mode, in which case STACK_INVALID is forced to STACK_MISC for correctness, as treating STACK_INVALID as equivalent to STACK_MISC is only enabled in privileged mode. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 28 +++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3edca06de9fd..93de39a6e36e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1144,6 +1144,21 @@ static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack) stack->spilled_ptr.type == SCALAR_VALUE; } +/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which + * case they are equivalent, or it's STACK_ZERO, in which case we preserve + * more precise STACK_ZERO. + * Note, in unprivileged mode leaving STACK_INVALID is wrong, so we take + * env->allow_ptr_leaks into account and force STACK_MISC, if necessary.
+ */ +static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype) +{ + if (*stype == STACK_ZERO) + return; + if (env->allow_ptr_leaks && *stype == STACK_INVALID) + return; + *stype = STACK_MISC; +} + static void scrub_spilled_slot(u8 *stype) { if (*stype != STACK_INVALID) @@ -4386,7 +4401,8 @@ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_ dst->live = live; } -static void save_register_state(struct bpf_func_state *state, +static void save_register_state(struct bpf_verifier_env *env, + struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, int size) { @@ -4401,7 +4417,7 @@ static void save_register_state(struct bpf_func_state *state, /* size < 8 bytes spill */ for (; i; i--) - scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]); + mark_stack_slot_misc(env, &state->stack[spi].slot_type[i - 1]); } static bool is_bpf_st_mem(struct bpf_insn *insn) @@ -4463,7 +4479,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { - save_register_state(state, spi, reg, size); + save_register_state(env, state, spi, reg, size); /* Break the relation on a narrowing spill. */ if (fls64(reg->umax_value) > BITS_PER_BYTE * size) state->stack[spi].spilled_ptr.id = 0; @@ -4473,7 +4489,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, __mark_reg_known(&fake_reg, insn->imm); fake_reg.type = SCALAR_VALUE; - save_register_state(state, spi, &fake_reg, size); + save_register_state(env, state, spi, &fake_reg, size); insn_flags = 0; /* not a register spill */ } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ @@ -4486,7 +4502,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - save_register_state(state, spi, reg, size); + save_register_state(env, state, spi, reg, size); } else { u8 type = STACK_MISC; @@ -4757,6 +4773,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, continue; if (type == STACK_MISC) continue; + if (type == STACK_ZERO) + continue; if (type == STACK_INVALID && env->allow_uninit_stack) continue; verbose(env, "invalid read from stack off %d+%d size %d\n", -- cgit From e322f0bcb8d371f4606eaf141c7f967e1a79bcb7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:44 -0800 Subject: bpf: preserve constant zero when doing partial register restore Similar to the special handling of STACK_ZERO, when reading 1/2/4 bytes from a stack slot that has a register spilled into it and that register has a constant value zero, preserve that zero and mark the spilled register as precise for that. This makes the spilled const zero register and STACK_ZERO cases equivalent in their behavior.
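A hypothetical fragment in the selftests' inline-asm style illustrates the effect; with this patch the narrow fill below yields a known zero in r1 rather than an unknown scalar:

  asm volatile (
          "r0 = 0;"
          "*(u64 *)(r10 - 8) = r0;" /* spill a known-zero register */
          "r1 = *(u32 *)(r10 - 8);" /* partial 4-byte fill: r1 is const zero */
          "exit;"
  );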
Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 93de39a6e36e..1ebe76c98451 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4767,22 +4767,39 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; } else { + int spill_cnt = 0, zero_cnt = 0; + for (i = 0; i < size; i++) { type = stype[(slot - i) % BPF_REG_SIZE]; - if (type == STACK_SPILL) + if (type == STACK_SPILL) { + spill_cnt++; continue; + } if (type == STACK_MISC) continue; - if (type == STACK_ZERO) + if (type == STACK_ZERO) { + zero_cnt++; continue; + } if (type == STACK_INVALID && env->allow_uninit_stack) continue; verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } - mark_reg_unknown(env, state->regs, dst_regno); - insn_flags = 0; /* not restoring original register state */ + + if (spill_cnt == size && + tnum_is_const(reg->var_off) && reg->var_off.value == 0) { + __mark_reg_const_zero(&state->regs[dst_regno]); + /* this IS register fill, so keep insn_flags */ + } else if (zero_cnt == size) { + /* similarly to mark_reg_stack_read(), preserve zeroes */ + __mark_reg_const_zero(&state->regs[dst_regno]); + insn_flags = 0; /* not restoring original register state */ + } else { + mark_reg_unknown(env, state->regs, dst_regno); + insn_flags = 0; /* not restoring original register state */ + } } state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (dst_regno >= 0) { -- cgit From 18a433b62061e3d787bfc3e670fa711fecbd7cb4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:46 -0800 Subject: bpf: track aligned STACK_ZERO cases as imprecise spilled registers Now that precision backtracking supports register spill/fill to/from the stack, there is another opportunity to be exploited here: minimizing precise STACK_ZERO cases. With a simple code change we can rely on initially imprecise register spill tracking for cases when the register spilled to the stack was a known zero. This is a very common case for initializing on-the-stack variables, including rather large structures. Oftentimes zero has no special meaning for the subsequent BPF program logic and is often overwritten with non-zero values soon afterwards. But due to STACK_ZERO vs STACK_MISC tracking, such initial zero initialization actually causes duplication of verifier states, as STACK_ZERO is clearly different from STACK_MISC or a spilled SCALAR_VALUE register. The effect of this (now) trivial change is huge, as can be seen below. These are differences between BPF selftests, Cilium, and Meta-internal BPF object files relative to the previous patch in this series. You can see improvements ranging from single-digit percentage improvements for instructions and states, all the way to 50-60% reduction for some of Meta-internal host agent programs, and even some Cilium programs. For Meta-internal ones I left only the differences for the largest BPF object files by states/instructions, as there were too many differences in the overall output. All the differences were improvements, reducing the number of states and thus instructions validated. Note, Meta-internal BPF object file names are not printed below.
Many copies of balancer_ingress are actually many different configurations of Katran, so they are different BPF programs, which explains state reduction going from -16% all the way to 31%, depending on BPF program logic complexity. I also took a closer look at a few small-ish BPF programs to validate the behavior. Let's take bpf_iter_netlink.bpf.o (first row below). While it's just 8 vs 5 states, the verifier log is still too long to include here. But the reduction in states is due to the following piece of C code: unsigned long ino; ... sk = s->sk_socket; if (!sk) { ino = 0; } else { inode = SOCK_INODE(sk); bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino); } BPF_SEQ_PRINTF(seq, "%-8u %-8lu\n", s->sk_drops.counter, ino); return 0; You can see that in some situations `ino` is zero-initialized, while in others it's an unknown value filled out by bpf_probe_read_kernel(). Before this change, the code after the if/else branches had to be validated twice: once with (precise) ino == 0, due to eager STACK_ZERO logic, and then again for when ino is just STACK_MISC. But BPF_SEQ_PRINTF() doesn't care about the precise value of ino, so with the change in this patch the verifier is able to prune states from after one of the branches, reducing the total number of states (and instructions) required for successful validation. A similar principle applies to bigger real-world applications, just at a much larger scale. SELFTESTS ========= File Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) --------------------------------------- ----------------------- --------- --------- --------------- ---------- ---------- ------------- bpf_iter_netlink.bpf.linked3.o dump_netlink 148 104 -44 (-29.73%) 8 5 -3 (-37.50%) bpf_iter_unix.bpf.linked3.o dump_unix 8474 8404 -70 (-0.83%) 151 147 -4 (-2.65%) bpf_loop.bpf.linked3.o stack_check 560 324 -236 (-42.14%) 42 24 -18 (-42.86%) local_storage_bench.bpf.linked3.o get_local 120 77 -43 (-35.83%) 9 6 -3 (-33.33%) loop6.bpf.linked3.o trace_virtqueue_add_sgs 10167 9868 -299 (-2.94%) 226 206 -20 (-8.85%) pyperf600_bpf_loop.bpf.linked3.o on_event 4872 3423 -1449 (-29.74%) 322 229 -93 (-28.88%) strobemeta.bpf.linked3.o on_event 180697 176036 -4661 (-2.58%) 4780 4734 -46 (-0.96%) test_cls_redirect.bpf.linked3.o cls_redirect 65594 65401 -193 (-0.29%) 4230 4212 -18 (-0.43%) test_global_func_args.bpf.linked3.o test_cls 145 136 -9 (-6.21%) 10 9 -1 (-10.00%) test_l4lb.bpf.linked3.o balancer_ingress 4760 2612 -2148 (-45.13%) 113 102 -11 (-9.73%) test_l4lb_noinline.bpf.linked3.o balancer_ingress 4845 4877 +32 (+0.66%) 219 221 +2 (+0.91%) test_l4lb_noinline_dynptr.bpf.linked3.o balancer_ingress 2072 2087 +15 (+0.72%) 97 98 +1 (+1.03%) test_seg6_loop.bpf.linked3.o __add_egr_x 12440 9975 -2465 (-19.82%) 364 353 -11 (-3.02%) test_tcp_hdr_options.bpf.linked3.o estab 2558 2572 +14 (+0.55%) 179 180 +1 (+0.56%) test_xdp_dynptr.bpf.linked3.o _xdp_tx_iptunnel 645 596 -49 (-7.60%) 26 24 -2 (-7.69%) test_xdp_noinline.bpf.linked3.o balancer_ingress_v6 3520 3516 -4 (-0.11%) 216 216 +0 (+0.00%) xdp_synproxy_kern.bpf.linked3.o syncookie_tc 82661 81241 -1420 (-1.72%) 5073 5155 +82 (+1.62%) xdp_synproxy_kern.bpf.linked3.o syncookie_xdp 84964 82297 -2667 (-3.14%) 5130 5157 +27 (+0.53%) META-INTERNAL ============= Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) -------------------------------------- --------- --------- ----------------- ---------- ---------- --------------- balancer_ingress 27925 23608 -4317 (-15.46%) 1488 1482 -6 (-0.40%) balancer_ingress 31824 27546
-4278 (-13.44%) 1658 1652 -6 (-0.36%) balancer_ingress 32213 27935 -4278 (-13.28%) 1689 1683 -6 (-0.36%) balancer_ingress 32213 27935 -4278 (-13.28%) 1689 1683 -6 (-0.36%) balancer_ingress 31824 27546 -4278 (-13.44%) 1658 1652 -6 (-0.36%) balancer_ingress 38647 29562 -9085 (-23.51%) 2069 1835 -234 (-11.31%) balancer_ingress 38647 29562 -9085 (-23.51%) 2069 1835 -234 (-11.31%) balancer_ingress 40339 30792 -9547 (-23.67%) 2193 1934 -259 (-11.81%) balancer_ingress 37321 29055 -8266 (-22.15%) 1972 1795 -177 (-8.98%) balancer_ingress 38176 29753 -8423 (-22.06%) 2008 1831 -177 (-8.81%) balancer_ingress 29193 20910 -8283 (-28.37%) 1599 1422 -177 (-11.07%) balancer_ingress 30013 21452 -8561 (-28.52%) 1645 1447 -198 (-12.04%) balancer_ingress 28691 24290 -4401 (-15.34%) 1545 1531 -14 (-0.91%) balancer_ingress 34223 28965 -5258 (-15.36%) 1984 1875 -109 (-5.49%) balancer_ingress 35481 26158 -9323 (-26.28%) 2095 1806 -289 (-13.79%) balancer_ingress 35481 26158 -9323 (-26.28%) 2095 1806 -289 (-13.79%) balancer_ingress 35868 26455 -9413 (-26.24%) 2140 1827 -313 (-14.63%) balancer_ingress 35868 26455 -9413 (-26.24%) 2140 1827 -313 (-14.63%) balancer_ingress 35481 26158 -9323 (-26.28%) 2095 1806 -289 (-13.79%) balancer_ingress 35481 26158 -9323 (-26.28%) 2095 1806 -289 (-13.79%) balancer_ingress 34844 29485 -5359 (-15.38%) 2036 1918 -118 (-5.80%) fbflow_egress 3256 2652 -604 (-18.55%) 218 192 -26 (-11.93%) fbflow_ingress 1026 944 -82 (-7.99%) 70 63 -7 (-10.00%) sslwall_tc_egress 8424 7360 -1064 (-12.63%) 498 458 -40 (-8.03%) syar_accept_protect 15040 9539 -5501 (-36.58%) 364 220 -144 (-39.56%) syar_connect_tcp_v6 15036 9535 -5501 (-36.59%) 360 216 -144 (-40.00%) syar_connect_udp_v4 15039 9538 -5501 (-36.58%) 361 217 -144 (-39.89%) syar_connect_connect4_protect4 24805 15833 -8972 (-36.17%) 756 480 -276 (-36.51%) syar_lsm_file_open 167772 151813 -15959 (-9.51%) 1836 1667 -169 (-9.20%) syar_namespace_create_new 14805 9304 -5501 (-37.16%) 353 209 -144 (-40.79%) syar_python3_detect 17531 12030 -5501 (-31.38%) 391 247 -144 (-36.83%) syar_ssh_post_fork 16412 10911 -5501 (-33.52%) 405 261 -144 (-35.56%) syar_enter_execve 14728 9227 -5501 (-37.35%) 345 201 -144 (-41.74%) syar_enter_execveat 14728 9227 -5501 (-37.35%) 345 201 -144 (-41.74%) syar_exit_execve 16622 11121 -5501 (-33.09%) 376 232 -144 (-38.30%) syar_exit_execveat 16622 11121 -5501 (-33.09%) 376 232 -144 (-38.30%) syar_syscalls_kill 15288 9787 -5501 (-35.98%) 398 254 -144 (-36.18%) syar_task_enter_pivot_root 14898 9397 -5501 (-36.92%) 357 213 -144 (-40.34%) syar_syscalls_setreuid 16678 11177 -5501 (-32.98%) 429 285 -144 (-33.57%) syar_syscalls_setuid 16678 11177 -5501 (-32.98%) 429 285 -144 (-33.57%) syar_syscalls_process_vm_readv 14959 9458 -5501 (-36.77%) 364 220 -144 (-39.56%) syar_syscalls_process_vm_writev 15757 10256 -5501 (-34.91%) 390 246 -144 (-36.92%) do_uprobe 15519 10018 -5501 (-35.45%) 373 229 -144 (-38.61%) edgewall 179715 55783 -123932 (-68.96%) 12607 3999 -8608 (-68.28%) bictcp_state 7570 4131 -3439 (-45.43%) 496 269 -227 (-45.77%) cubictcp_state 7570 4131 -3439 (-45.43%) 496 269 -227 (-45.77%) tcp_rate_skb_delivered 447 272 -175 (-39.15%) 29 18 -11 (-37.93%) kprobe__bbr_set_state 4566 2615 -1951 (-42.73%) 209 124 -85 (-40.67%) kprobe__bictcp_state 4566 2615 -1951 (-42.73%) 209 124 -85 (-40.67%) inet_sock_set_state 1501 1337 -164 (-10.93%) 93 85 -8 (-8.60%) tcp_retransmit_skb 1145 981 -164 (-14.32%) 67 59 -8 (-11.94%) tcp_retransmit_synack 1183 951 -232 (-19.61%) 67 55 -12 (-17.91%) bpf_tcptuner 1459 1187 -272 (-18.64%) 99 80 -19 
(-19.19%) tw_egress 801 776 -25 (-3.12%) 69 66 -3 (-4.35%) tw_ingress 795 770 -25 (-3.14%) 69 66 -3 (-4.35%) ttls_tc_ingress 19025 19383 +358 (+1.88%) 470 465 -5 (-1.06%) ttls_nat_egress 490 299 -191 (-38.98%) 33 20 -13 (-39.39%) ttls_nat_ingress 448 285 -163 (-36.38%) 32 21 -11 (-34.38%) tw_twfw_egress 511127 212071 -299056 (-58.51%) 16733 8504 -8229 (-49.18%) tw_twfw_ingress 500095 212069 -288026 (-57.59%) 16223 8504 -7719 (-47.58%) tw_twfw_tc_eg 511113 212064 -299049 (-58.51%) 16732 8504 -8228 (-49.18%) tw_twfw_tc_in 500095 212069 -288026 (-57.59%) 16223 8504 -7719 (-47.58%) tw_twfw_egress 12632 12435 -197 (-1.56%) 276 260 -16 (-5.80%) tw_twfw_ingress 12631 12454 -177 (-1.40%) 278 261 -17 (-6.12%) tw_twfw_tc_eg 12595 12435 -160 (-1.27%) 274 259 -15 (-5.47%) tw_twfw_tc_in 12631 12454 -177 (-1.40%) 278 261 -17 (-6.12%) tw_xdp_dump 266 209 -57 (-21.43%) 9 8 -1 (-11.11%) CILIUM ========= File Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) ------------- -------------------------------- --------- --------- ---------------- ---------- ---------- -------------- bpf_host.o cil_to_netdev 6047 4578 -1469 (-24.29%) 362 249 -113 (-31.22%) bpf_host.o handle_lxc_traffic 2227 1585 -642 (-28.83%) 156 103 -53 (-33.97%) bpf_host.o tail_handle_ipv4_from_netdev 2244 1458 -786 (-35.03%) 163 106 -57 (-34.97%) bpf_host.o tail_handle_nat_fwd_ipv4 21022 10479 -10543 (-50.15%) 1289 670 -619 (-48.02%) bpf_host.o tail_handle_nat_fwd_ipv6 15433 11375 -4058 (-26.29%) 905 643 -262 (-28.95%) bpf_host.o tail_ipv4_host_policy_ingress 2219 1367 -852 (-38.40%) 161 96 -65 (-40.37%) bpf_host.o tail_nodeport_nat_egress_ipv4 22460 19862 -2598 (-11.57%) 1469 1293 -176 (-11.98%) bpf_host.o tail_nodeport_nat_ingress_ipv4 5526 3534 -1992 (-36.05%) 366 243 -123 (-33.61%) bpf_host.o tail_nodeport_nat_ingress_ipv6 5132 4256 -876 (-17.07%) 241 219 -22 (-9.13%) bpf_host.o tail_nodeport_nat_ipv6_egress 3702 3542 -160 (-4.32%) 215 205 -10 (-4.65%) bpf_lxc.o tail_handle_nat_fwd_ipv4 21022 10479 -10543 (-50.15%) 1289 670 -619 (-48.02%) bpf_lxc.o tail_handle_nat_fwd_ipv6 15433 11375 -4058 (-26.29%) 905 643 -262 (-28.95%) bpf_lxc.o tail_ipv4_ct_egress 5073 3374 -1699 (-33.49%) 262 172 -90 (-34.35%) bpf_lxc.o tail_ipv4_ct_ingress 5093 3385 -1708 (-33.54%) 262 172 -90 (-34.35%) bpf_lxc.o tail_ipv4_ct_ingress_policy_only 5093 3385 -1708 (-33.54%) 262 172 -90 (-34.35%) bpf_lxc.o tail_ipv6_ct_egress 4593 3878 -715 (-15.57%) 194 151 -43 (-22.16%) bpf_lxc.o tail_ipv6_ct_ingress 4606 3891 -715 (-15.52%) 194 151 -43 (-22.16%) bpf_lxc.o tail_ipv6_ct_ingress_policy_only 4606 3891 -715 (-15.52%) 194 151 -43 (-22.16%) bpf_lxc.o tail_nodeport_nat_ingress_ipv4 5526 3534 -1992 (-36.05%) 366 243 -123 (-33.61%) bpf_lxc.o tail_nodeport_nat_ingress_ipv6 5132 4256 -876 (-17.07%) 241 219 -22 (-9.13%) bpf_overlay.o tail_handle_nat_fwd_ipv4 20524 10114 -10410 (-50.72%) 1271 638 -633 (-49.80%) bpf_overlay.o tail_nodeport_nat_egress_ipv4 22718 19490 -3228 (-14.21%) 1475 1275 -200 (-13.56%) bpf_overlay.o tail_nodeport_nat_ingress_ipv4 5526 3534 -1992 (-36.05%) 366 243 -123 (-33.61%) bpf_overlay.o tail_nodeport_nat_ingress_ipv6 5132 4256 -876 (-17.07%) 241 219 -22 (-9.13%) bpf_overlay.o tail_nodeport_nat_ipv6_egress 3638 3548 -90 (-2.47%) 209 203 -6 (-2.87%) bpf_overlay.o tail_rev_nodeport_lb4 4368 3820 -548 (-12.55%) 248 215 -33 (-13.31%) bpf_overlay.o tail_rev_nodeport_lb6 2867 2428 -439 (-15.31%) 167 140 -27 (-16.17%) bpf_sock.o cil_sock6_connect 1718 1703 -15 (-0.87%) 100 99 -1 (-1.00%) bpf_xdp.o tail_handle_nat_fwd_ipv4 12917 12443 
bpf_xdp.o tail_handle_nat_fwd_ipv4 12917 12443 -474 (-3.67%) 875 849 -26 (-2.97%)
bpf_xdp.o tail_handle_nat_fwd_ipv6 13515 13264 -251 (-1.86%) 715 702 -13 (-1.82%)
bpf_xdp.o tail_lb_ipv4 39492 36367 -3125 (-7.91%) 2430 2251 -179 (-7.37%)
bpf_xdp.o tail_lb_ipv6 80441 78058 -2383 (-2.96%) 3647 3523 -124 (-3.40%)
bpf_xdp.o tail_nodeport_ipv6_dsr 1038 901 -137 (-13.20%) 61 55 -6 (-9.84%)
bpf_xdp.o tail_nodeport_nat_egress_ipv4 13027 12096 -931 (-7.15%) 868 809 -59 (-6.80%)
bpf_xdp.o tail_nodeport_nat_ingress_ipv4 7617 5900 -1717 (-22.54%) 522 413 -109 (-20.88%)
bpf_xdp.o tail_nodeport_nat_ingress_ipv6 7575 7395 -180 (-2.38%) 383 374 -9 (-2.35%)
bpf_xdp.o tail_rev_nodeport_lb4 6808 6739 -69 (-1.01%) 403 396 -7 (-1.74%)
bpf_xdp.o tail_rev_nodeport_lb6 16173 15847 -326 (-2.02%) 1010 990 -20 (-1.98%)
Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1ebe76c98451..e5ce530641ba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4477,8 +4477,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; mark_stack_slot_scratched(env, spi); - if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && - !register_is_null(reg) && env->bpf_capable) { + if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && env->bpf_capable) { save_register_state(env, state, spi, reg, size); /* Break the relation on a narrowing spill. */ if (fls64(reg->umax_value) > BITS_PER_BYTE * size) @@ -4527,7 +4526,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, /* when we zero initialize stack slots mark them as such */ if ((reg && register_is_null(reg)) || (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { - /* backtracking doesn't work for STACK_ZERO yet. */ + /* STACK_ZERO case happened because register spill + * wasn't properly aligned at the stack slot boundary, + * so it's not a register spill anymore; force + * originating register to be precise to make + * STACK_ZERO correct for subsequent states + */ err = mark_chain_precision(env, value_regno); if (err) return err; -- cgit From 7be76461f302ec05cbd62b90b2a05c64299ca01f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 5 Dec 2023 16:52:09 -0500 Subject: tracing: Always update snapshot buffer size It used to be that only the top level instance had a snapshot buffer (for latency tracers like wakeup and irqsoff). The update of the ring buffer size would check if the instance was the top level and if so, it would also update the snapshot buffer as it needs to be the same as the main buffer. Now that lower level instances also have a snapshot buffer, they too need to update their snapshot buffer sizes when the main buffer is changed, otherwise the following can be triggered:
# cd /sys/kernel/tracing
# echo 1500 > buffer_size_kb
# mkdir instances/foo
# echo irqsoff > instances/foo/current_tracer
# echo 1000 > instances/foo/buffer_size_kb
Produces: WARNING: CPU: 2 PID: 856 at kernel/trace/trace.c:1938 update_max_tr_single.part.0+0x27d/0x320 Which is: ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu); if (ret == -EBUSY) { [..] } WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); <== here That's because ring_buffer_swap_cpu() has: int ret = -EINVAL; [..]
/* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; [..] out: return ret; } Instead, update all instances' snapshot buffer sizes when their main buffer size is updated. Link: https://lkml.kernel.org/r/20231205220010.454662151@goodmis.org Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Fixes: 6d9b3fa5e7f6 ("tracing: Move tracing_max_latency into trace_array") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9aebf904ff97..231c173ec04f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6392,8 +6392,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, return ret; #ifdef CONFIG_TRACER_MAX_TRACE - if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || - !tr->current_trace->use_max_tr) + if (!tr->current_trace->use_max_tr) goto out; ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); -- cgit From d78ab792705c7be1b91243b2544d1a79406a2ad7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 5 Dec 2023 16:52:10 -0500 Subject: tracing: Stop current tracer when resizing buffer When the ring buffer is being resized, it can cause side effects to the running tracer. For instance, there's a race with the irqsoff tracer that swaps individual per cpu buffers between the main buffer and the snapshot buffer. The resize operation modifies the main buffer and then the snapshot buffer. If a swap happens in between those two operations it will break the tracer. Simply stop the running tracer before resizing the buffers and enable it again when finished. Link: https://lkml.kernel.org/r/20231205220010.748996423@goodmis.org Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Fixes: 3928a8a2d9808 ("ftrace: make work with new ring buffer") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 231c173ec04f..e978868b1a22 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6387,9 +6387,12 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, if (!tr->array_buffer.buffer) return 0; + /* Do not allow tracing while resizng ring buffer */ + tracing_stop_tr(tr); + ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu); if (ret < 0) - return ret; + goto out_start; #ifdef CONFIG_TRACER_MAX_TRACE if (!tr->current_trace->use_max_tr) goto out; @@ -6417,7 +6420,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, WARN_ON(1); tracing_disabled = 1; } - return ret; + goto out_start; } update_buffer_entries(&tr->max_buffer, cpu); @@ -6426,7 +6429,8 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, #endif /* CONFIG_TRACER_MAX_TRACE */ update_buffer_entries(&tr->array_buffer, cpu); - + out_start: + tracing_start_tr(tr); return ret; } -- cgit From b538bf7d0ec11ca49f536dfda742a5f6db90a798 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 5 Dec 2023 16:52:11 -0500 Subject: tracing: Disable snapshot buffer when stopping instance tracers It used to be that only the top level instance had a snapshot buffer (for latency tracers like wakeup and irqsoff). Stopping a tracer in an instance would not disable the snapshot buffer.
This could have some unintended consequences if the irqsoff tracer is enabled. Consolidate the tracing_start/stop() with tracing_start/stop_tr() so that all instances behave the same. The tracing_start/stop() functions will just call their respective tracing_start/stop_tr() with the global_trace array passed in. Link: https://lkml.kernel.org/r/20231205220011.041220035@goodmis.org Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Fixes: 6d9b3fa5e7f6 ("tracing: Move tracing_max_latency into trace_array") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 110 ++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e978868b1a22..2492c6c76850 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2360,13 +2360,7 @@ int is_tracing_stopped(void) return global_trace.stop_count; } -/** - * tracing_start - quick start of the tracer - * - * If tracing is enabled but was stopped by tracing_stop, - * this will start the tracer back up. - */ -void tracing_start(void) +static void tracing_start_tr(struct trace_array *tr) { struct trace_buffer *buffer; unsigned long flags; @@ -2374,119 +2368,83 @@ void tracing_start(void) if (tracing_disabled) return; - raw_spin_lock_irqsave(&global_trace.start_lock, flags); - if (--global_trace.stop_count) { - if (global_trace.stop_count < 0) { + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (--tr->stop_count) { + if (WARN_ON_ONCE(tr->stop_count < 0)) { /* Someone screwed up their debugging */ - WARN_ON_ONCE(1); - global_trace.stop_count = 0; + tr->stop_count = 0; } goto out; } /* Prevent the buffers from switching */ - arch_spin_lock(&global_trace.max_lock); + arch_spin_lock(&tr->max_lock); - buffer = global_trace.array_buffer.buffer; + buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE - buffer = global_trace.max_buffer.buffer; + buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #endif - arch_spin_unlock(&global_trace.max_lock); - - out: - raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); -} - -static void tracing_start_tr(struct trace_array *tr) -{ - struct trace_buffer *buffer; - unsigned long flags; - - if (tracing_disabled) - return; - - /* If global, we need to also start the max tracer */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) - return tracing_start(); - - raw_spin_lock_irqsave(&tr->start_lock, flags); - - if (--tr->stop_count) { - if (tr->stop_count < 0) { - /* Someone screwed up their debugging */ - WARN_ON_ONCE(1); - tr->stop_count = 0; - } - goto out; - } - - buffer = tr->array_buffer.buffer; - if (buffer) - ring_buffer_record_enable(buffer); + arch_spin_unlock(&tr->max_lock); out: raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** - * tracing_stop - quick stop of the tracer + * tracing_start - quick start of the tracer * - * Light weight way to stop tracing. Use in conjunction with - * tracing_start. + * If tracing is enabled but was stopped by tracing_stop, + * this will start the tracer back up.
*/ -void tracing_stop(void) +void tracing_start(void) + +{ + return tracing_start_tr(&global_trace); +} + +static void tracing_stop_tr(struct trace_array *tr) { struct trace_buffer *buffer; unsigned long flags; - raw_spin_lock_irqsave(&global_trace.start_lock, flags); - if (global_trace.stop_count++) + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (tr->stop_count++) goto out; /* Prevent the buffers from switching */ - arch_spin_lock(&global_trace.max_lock); + arch_spin_lock(&tr->max_lock); - buffer = global_trace.array_buffer.buffer; + buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE - buffer = global_trace.max_buffer.buffer; + buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #endif - arch_spin_unlock(&global_trace.max_lock); + arch_spin_unlock(&tr->max_lock); out: - raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); + raw_spin_unlock_irqrestore(&tr->start_lock, flags); } -static void tracing_stop_tr(struct trace_array *tr) +/** + * tracing_stop - quick stop of the tracer + * + * Light weight way to stop tracing. Use in conjunction with + * tracing_start. + */ +void tracing_stop(void) { - struct trace_buffer *buffer; - unsigned long flags; - - /* If global, we need to also stop the max tracer */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) - return tracing_stop(); - - raw_spin_lock_irqsave(&tr->start_lock, flags); - if (tr->stop_count++) - goto out; - - buffer = tr->array_buffer.buffer; - if (buffer) - ring_buffer_record_disable(buffer); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); + return tracing_stop_tr(&global_trace); } static int trace_save_cmdline(struct task_struct *tsk) -- cgit From 7fed14f7ac9cf5e38c693836fe4a874720141845 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 5 Dec 2023 17:17:34 +0100 Subject: tracing: Fix incomplete locking when disabling buffered events The following warning appears when using buffered events: [ 203.556451] WARNING: CPU: 53 PID: 10220 at kernel/trace/ring_buffer.c:3912 ring_buffer_discard_commit+0x2eb/0x420 [...] [ 203.670690] CPU: 53 PID: 10220 Comm: stress-ng-sysin Tainted: G E 6.7.0-rc2-default #4 56e6d0fcf5581e6e51eaaecbdaec2a2338c80f3a [ 203.670704] Hardware name: Intel Corp. 
GROVEPORT/GROVEPORT, BIOS GVPRCRB1.86B.0016.D04.1705030402 05/03/2017 [ 203.670709] RIP: 0010:ring_buffer_discard_commit+0x2eb/0x420 [ 203.735721] Code: 4c 8b 4a 50 48 8b 42 48 49 39 c1 0f 84 b3 00 00 00 49 83 e8 01 75 b1 48 8b 42 10 f0 ff 40 08 0f 0b e9 fc fe ff ff f0 ff 47 08 <0f> 0b e9 77 fd ff ff 48 8b 42 10 f0 ff 40 08 0f 0b e9 f5 fe ff ff [ 203.735734] RSP: 0018:ffffb4ae4f7b7d80 EFLAGS: 00010202 [ 203.735745] RAX: 0000000000000000 RBX: ffffb4ae4f7b7de0 RCX: ffff8ac10662c000 [ 203.735754] RDX: ffff8ac0c750be00 RSI: ffff8ac10662c000 RDI: ffff8ac0c004d400 [ 203.781832] RBP: ffff8ac0c039cea0 R08: 0000000000000000 R09: 0000000000000000 [ 203.781839] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 203.781842] R13: ffff8ac10662c000 R14: ffff8ac0c004d400 R15: ffff8ac10662c008 [ 203.781846] FS: 00007f4cd8a67740(0000) GS:ffff8ad798880000(0000) knlGS:0000000000000000 [ 203.781851] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 203.781855] CR2: 0000559766a74028 CR3: 00000001804c4000 CR4: 00000000001506f0 [ 203.781862] Call Trace: [ 203.781870] [ 203.851949] trace_event_buffer_commit+0x1ea/0x250 [ 203.851967] trace_event_raw_event_sys_enter+0x83/0xe0 [ 203.851983] syscall_trace_enter.isra.0+0x182/0x1a0 [ 203.851990] do_syscall_64+0x3a/0xe0 [ 203.852075] entry_SYSCALL_64_after_hwframe+0x6e/0x76 [ 203.852090] RIP: 0033:0x7f4cd870fa77 [ 203.982920] Code: 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 90 b8 89 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e9 43 0e 00 f7 d8 64 89 01 48 [ 203.982932] RSP: 002b:00007fff99717dd8 EFLAGS: 00000246 ORIG_RAX: 0000000000000089 [ 203.982942] RAX: ffffffffffffffda RBX: 0000558ea1d7b6f0 RCX: 00007f4cd870fa77 [ 203.982948] RDX: 0000000000000000 RSI: 00007fff99717de0 RDI: 0000558ea1d7b6f0 [ 203.982957] RBP: 00007fff99717de0 R08: 00007fff997180e0 R09: 00007fff997180e0 [ 203.982962] R10: 00007fff997180e0 R11: 0000000000000246 R12: 00007fff99717f40 [ 204.049239] R13: 00007fff99718590 R14: 0000558e9f2127a8 R15: 00007fff997180b0 [ 204.049256] For instance, it can be triggered by running these two commands in parallel: $ while true; do echo hist:key=id.syscall:val=hitcount > \ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger; done $ stress-ng --sysinfo $(nproc) The warning indicates that the current ring_buffer_per_cpu is not in the committing state. It happens because the active ring_buffer_event doesn't actually come from the ring_buffer_per_cpu but is allocated from trace_buffered_event. The bug is in function trace_buffered_event_disable() where the following normally happens: * The code invokes disable_trace_buffered_event() via smp_call_function_many() and follows it by synchronize_rcu(). This increments the per-CPU variable trace_buffered_event_cnt on each target CPU and grants trace_buffered_event_disable() the exclusive access to the per-CPU variable trace_buffered_event. * Maintenance is performed on trace_buffered_event, all per-CPU event buffers get freed. * The code invokes enable_trace_buffered_event() via smp_call_function_many(). This decrements trace_buffered_event_cnt and releases the access to trace_buffered_event. A problem is that smp_call_function_many() runs a given function on all target CPUs except on the current one. The following can then occur: * Task X executing trace_buffered_event_disable() runs on CPU 0. * The control reaches synchronize_rcu() and the task gets rescheduled on another CPU 1. * The RCU synchronization finishes. 
At this point, trace_buffered_event_disable() has the exclusive access to all trace_buffered_event variables except trace_buffered_event[CPU0] because trace_buffered_event_cnt[CPU0] is never incremented and if the buffer is currently unused, remains set to 0. * A different task Y is scheduled on CPU 0 and hits a trace event. The code in trace_event_buffer_lock_reserve() sees that trace_buffered_event_cnt[CPU0] is set to 0 and decides to use the buffer provided by trace_buffered_event[CPU0]. * Task X continues its execution in trace_buffered_event_disable(). The code incorrectly frees the event buffer pointed to by trace_buffered_event[CPU0] and resets the variable to NULL. * Task Y writes event data to the now freed buffer and later detects the created inconsistency. The issue is observable since commit dea499781a11 ("tracing: Fix warning in trace_buffered_event_disable()") which moved the call of trace_buffered_event_disable() in __ftrace_event_enable_disable() earlier, prior to invoking call->class->reg(.. TRACE_REG_UNREGISTER ..). The underlying problem in trace_buffered_event_disable() is however present since the original implementation in commit 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events"). Fix the problem by replacing the two smp_call_function_many() calls with on_each_cpu_mask() which invokes a given callback on all CPUs. Link: https://lore.kernel.org/all/20231127151248.7232-2-petr.pavlu@suse.com/ Link: https://lkml.kernel.org/r/20231205161736.19663-2-petr.pavlu@suse.com Cc: stable@vger.kernel.org Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events") Fixes: dea499781a11 ("tracing: Fix warning in trace_buffered_event_disable()") Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2492c6c76850..6aeffa4a6994 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2781,11 +2781,9 @@ void trace_buffered_event_disable(void) if (--trace_buffered_event_ref) return; - preempt_disable(); /* For each CPU, set the buffer as used. */ - smp_call_function_many(tracing_buffer_mask, - disable_trace_buffered_event, NULL, 1); - preempt_enable(); + on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event, + NULL, true); /* Wait for all current users to finish */ synchronize_rcu(); @@ -2800,11 +2798,9 @@ void trace_buffered_event_disable(void) */ smp_wmb(); - preempt_disable(); /* Do the work on each cpu */ - smp_call_function_many(tracing_buffer_mask, - enable_trace_buffered_event, NULL, 1); - preempt_enable(); + on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, + true); } static struct trace_buffer *temp_buffer; -- cgit From 34209fe83ef8404353f91ab4ea4035dbc9922d04 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 5 Dec 2023 17:17:35 +0100 Subject: tracing: Fix a warning when allocating buffered events fails Function trace_buffered_event_disable() produces an unexpected warning when the previous call to trace_buffered_event_enable() fails to allocate pages for buffered events. The situation can occur as follows: * The counter trace_buffered_event_ref is at 0. * The soft mode gets enabled for some event and trace_buffered_event_enable() is called. The function increments trace_buffered_event_ref to 1 and starts allocating event pages. * The allocation fails for some page and trace_buffered_event_disable() is called for cleanup.
* Function trace_buffered_event_disable() decrements trace_buffered_event_ref back to 0, recognizes that it was the last use of buffered events and frees all allocated pages. * The control goes back to trace_buffered_event_enable() which returns. The caller of trace_buffered_event_enable() has no information that the function actually failed. * Some time later, the soft mode is disabled for the same event. Function trace_buffered_event_disable() is called. It warns on "WARN_ON_ONCE(!trace_buffered_event_ref)" and returns. Buffered events are just an optimization and can handle failures. Make trace_buffered_event_enable() exit on the first failure and leave any cleanup to when trace_buffered_event_disable() is called. Link: https://lore.kernel.org/all/20231127151248.7232-2-petr.pavlu@suse.com/ Link: https://lkml.kernel.org/r/20231205161736.19663-3-petr.pavlu@suse.com Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events") Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6aeffa4a6994..ef72354f61ce 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2728,8 +2728,11 @@ void trace_buffered_event_enable(void) for_each_tracing_cpu(cpu) { page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); - if (!page) - goto failed; + /* This is just an optimization and can handle failures */ + if (!page) { + pr_err("Failed to allocate event buffer\n"); + break; + } event = page_address(page); memset(event, 0, sizeof(*event)); @@ -2743,10 +2746,6 @@ void trace_buffered_event_enable(void) WARN_ON_ONCE(1); preempt_enable(); } - - return; - failed: - trace_buffered_event_disable(); } static void enable_trace_buffered_event(void *data) -- cgit From c0591b1cccf708a47bc465c62436d669a4213323 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 5 Dec 2023 17:17:36 +0100 Subject: tracing: Fix a possible race when disabling buffered events Function trace_buffered_event_disable() is responsible for freeing pages backing buffered events and this process can run concurrently with trace_event_buffer_lock_reserve(). The following race is currently possible: * Function trace_buffered_event_disable() is called on CPU 0. It increments trace_buffered_event_cnt on each CPU and waits via synchronize_rcu() for each user of trace_buffered_event to complete. * After synchronize_rcu() is finished, function trace_buffered_event_disable() has the exclusive access to trace_buffered_event. All counters trace_buffered_event_cnt are at 1 and all pointers trace_buffered_event are still valid. * At this point, on a different CPU 1, the execution reaches trace_event_buffer_lock_reserve(). The function calls preempt_disable_notrace() and only now enters an RCU read-side critical section. The function proceeds and reads a still valid pointer from trace_buffered_event[CPU1] into the local variable "entry". However, it doesn't yet read trace_buffered_event_cnt[CPU1] which happens later. * Function trace_buffered_event_disable() continues. It frees trace_buffered_event[CPU1] and decrements trace_buffered_event_cnt[CPU1] back to 0. * Function trace_event_buffer_lock_reserve() continues. It reads and increments trace_buffered_event_cnt[CPU1] from 0 to 1. This makes it believe that it can use the "entry" that it already obtained but the pointer is now invalid and any access results in a use-after-free.
Fix the problem by making a second synchronize_rcu() call after all trace_buffered_event values are set to NULL. This waits on all potential users in trace_event_buffer_lock_reserve() that still read a previous pointer from trace_buffered_event. Link: https://lore.kernel.org/all/20231127151248.7232-2-petr.pavlu@suse.com/ Link: https://lkml.kernel.org/r/20231205161736.19663-4-petr.pavlu@suse.com Cc: stable@vger.kernel.org Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events") Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ef72354f61ce..fbcd3bafb93e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2791,13 +2791,17 @@ void trace_buffered_event_disable(void) free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); per_cpu(trace_buffered_event, cpu) = NULL; } + /* - * Make sure trace_buffered_event is NULL before clearing - * trace_buffered_event_cnt. + * Wait for all CPUs that potentially started checking if they can use + * their event buffer only after the previous synchronize_rcu() call and + * they still read a valid pointer from trace_buffered_event. It must be + * ensured they don't see cleared trace_buffered_event_cnt else they + * could wrongly decide to use the pointed-to buffer which is now freed. */ - smp_wmb(); + synchronize_rcu(); - /* Do the work on each cpu */ + /* For each CPU, relinquish the buffer */ on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, true); } -- cgit From 909fa05dd3c181e5b403912889057f7cdbf3906c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:13 -0800 Subject: bpf: align CAP_NET_ADMIN checks with bpf_capable() approach Within BPF syscall handling code, CAP_NET_ADMIN checks stand out a bit compared to CAP_BPF and CAP_PERFMON checks. For the latter, CAP_BPF or CAP_PERFMON are checked first, but if they are not set, CAP_SYS_ADMIN takes over and grants whatever part of BPF syscall is required. Similar checks that involve CAP_NET_ADMIN are not so consistent. One out of four uses does follow the CAP_BPF/CAP_PERFMON model: during BPF_PROG_LOAD, if the type of BPF program is "network-related", either CAP_NET_ADMIN or CAP_SYS_ADMIN is required to proceed. But in three other cases CAP_NET_ADMIN is required even if CAP_SYS_ADMIN is set:
- when creating DEVMAP/XSKMAP/CPU_MAP maps;
- when attaching CGROUP_SKB programs;
- when handling BPF_PROG_QUERY command.
This patch changes the latter three cases to follow the BPF_PROG_LOAD model, that is, allowing them to proceed under either CAP_NET_ADMIN or CAP_SYS_ADMIN. This also makes it cleaner in subsequent BPF token patches to switch wholesale to a generic bpf_token_capable(int cap) check, which always falls back to CAP_SYS_ADMIN if the requested capability is missing.
Cc: Jakub Kicinski Acked-by: Yafang Shao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ebaccf77d56e..ee33a52abf18 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1121,6 +1121,11 @@ free_map_tab: return ret; } +static bool bpf_net_capable(void) +{ + return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); +} + #define BPF_MAP_CREATE_LAST_FIELD map_extra /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -1224,7 +1229,7 @@ static int map_create(union bpf_attr *attr) case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: case BPF_MAP_TYPE_XSKMAP: - if (!capable(CAP_NET_ADMIN)) + if (!bpf_net_capable()) return -EPERM; break; default: @@ -2625,7 +2630,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) !bpf_capable()) return -EPERM; - if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) + if (is_net_admin_prog_type(type) && !bpf_net_capable()) return -EPERM; if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; @@ -3777,7 +3782,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: - if (!capable(CAP_NET_ADMIN)) + if (!bpf_net_capable()) /* cg-skb progs can be loaded by unpriv user. * check permissions at attach time. */ @@ -3980,7 +3985,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - if (!capable(CAP_NET_ADMIN)) + if (!bpf_net_capable()) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) return -EINVAL; -- cgit From 40bba140c60fbb3ee8df6203c82fbd3de9f19d95 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:14 -0800 Subject: bpf: add BPF token delegation mount options to BPF FS Add a few new mount options to BPF FS that allow specifying that a given BPF FS instance allows creation of BPF tokens (added in the next patch), and what sort of operations are allowed under a BPF token. As such, we get 4 new mount options, each of which is a bit mask:
- `delegate_cmds` allows specifying which bpf() syscall commands are allowed with a BPF token derived from this BPF FS instance;
- if the BPF_MAP_CREATE command is allowed, `delegate_maps` specifies a set of allowable BPF map types that could be created with a BPF token;
- if the BPF_PROG_LOAD command is allowed, `delegate_progs` specifies a set of allowable BPF program types that could be loaded with a BPF token;
- if the BPF_PROG_LOAD command is allowed, `delegate_attachs` specifies a set of allowable BPF program attach types that could be loaded with a BPF token;
delegate_progs and delegate_attachs are meant to be used together, as the full BPF program type is, in general, determined through both program type and program attach type. Currently, these mount options accept the following forms of values:
- a special value "any", which enables all possible values of a given bit set;
- a numeric value (decimal or hexadecimal, determined by the kernel automatically) that specifies a bit mask value directly;
- if a mount option is specified multiple times, all of its values are combined.
E.g., `mount -t bpf nodev /path/to/mount -o delegate_maps=0x1 -o delegate_maps=0x2` will result in a combined 0x3 mask. Ideally, a more convenient (for humans) symbolic form derived from the corresponding UAPI enums would be accepted (e.g., `-o delegate_progs=kprobe|tracepoint`) and I intend to implement this, but it requires a bunch of UAPI header churn, so I postponed it until this feature lands upstream or at least there is a definite consensus that this feature is acceptable and is going to make it, just to minimize the amount of wasted effort and not increase the amount of non-essential code to be reviewed. An attentive reader will notice that BPF FS is now marked as FS_USERNS_MOUNT, which theoretically makes it mountable inside a non-init user namespace as long as the process has sufficient *namespaced* capabilities within that user namespace. But in reality we still restrict BPF FS to be mountable only by processes with CAP_SYS_ADMIN *in init userns* (extra check in bpf_fill_super()). FS_USERNS_MOUNT is added to allow creating a BPF FS context object (i.e., fsopen("bpf")) from inside an unprivileged process inside a non-init userns, to capture that userns as the owning userns. It will still be required to pass this context object back to a privileged process to instantiate and mount it. This manipulation is important, because capturing a non-init userns as the owning userns of a BPF FS instance (super block) allows using that userns to constrain the BPF token to that userns later on (see next patch). So creating a BPF FS with delegation inside an unprivileged userns will restrict derived BPF token objects to only "work" inside that intended userns, making it scoped to an intended "container". Also, setting these delegation options requires capable(CAP_SYS_ADMIN), so an unprivileged process cannot set this up without the involvement of a privileged process. There is a set of selftests at the end of the patch set that simulates this sequence of steps and validates that everything works as intended. But careful review is requested to make sure there are no missed gaps in the implementation and testing. This somewhat subtle set of aspects is the result of previous discussions ([0]) about various user namespace implications and interactions with BPF token functionality and is necessary to contain the BPF token inside the intended user namespace.
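To make the delegation setup concrete, here is a minimal userspace sketch (not part of the patch) of a privileged helper mounting a delegating BPF FS instance via the legacy mount(2) API; the /mnt/bpffs target directory and the choice to delegate all commands and map types are assumptions for illustration:

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Needs CAP_SYS_ADMIN: non-zero delegate_* masks are rejected
		 * for unprivileged callers, and "any" sets all bits in a mask.
		 */
		if (mount("none", "/mnt/bpffs", "bpf", 0,
			  "delegate_cmds=any,delegate_maps=any") < 0) {
			perror("mount bpffs");
			return 1;
		}
		return 0;
	}

The comma-separated data string is parsed option by option through bpf_parse_param(), so this should be equivalent to passing the same options with repeated -o flags to mount(8).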
[0] https://lore.kernel.org/bpf/20230704-hochverdient-lehne-eeb9eeef785e@brauner/ Acked-by: Christian Brauner Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++ kernel/bpf/inode.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 88 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 10e5e4d8a00f..d3c9acc593ea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1581,6 +1581,16 @@ struct bpf_link_primer { u32 id; }; +struct bpf_mount_opts { + umode_t mode; + + /* BPF token-related delegation options */ + u64 delegate_cmds; + u64 delegate_maps; + u64 delegate_progs; + u64 delegate_attachs; +}; + struct bpf_struct_ops_value; struct btf_member; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 1aafb2ff2e95..220fe0f99095 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "preload/bpf_preload.h" enum bpf_type { @@ -599,10 +600,31 @@ EXPORT_SYMBOL(bpf_prog_get_type_path); */ static int bpf_show_options(struct seq_file *m, struct dentry *root) { + struct bpf_mount_opts *opts = root->d_sb->s_fs_info; umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); + + if (opts->delegate_cmds == ~0ULL) + seq_printf(m, ",delegate_cmds=any"); + else if (opts->delegate_cmds) + seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds); + + if (opts->delegate_maps == ~0ULL) + seq_printf(m, ",delegate_maps=any"); + else if (opts->delegate_maps) + seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps); + + if (opts->delegate_progs == ~0ULL) + seq_printf(m, ",delegate_progs=any"); + else if (opts->delegate_progs) + seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs); + + if (opts->delegate_attachs == ~0ULL) + seq_printf(m, ",delegate_attachs=any"); + else if (opts->delegate_attachs) + seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs); return 0; } @@ -626,22 +648,27 @@ static const struct super_operations bpf_super_ops = { enum { OPT_MODE, + OPT_DELEGATE_CMDS, + OPT_DELEGATE_MAPS, + OPT_DELEGATE_PROGS, + OPT_DELEGATE_ATTACHS, }; static const struct fs_parameter_spec bpf_fs_parameters[] = { fsparam_u32oct ("mode", OPT_MODE), + fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS), + fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS), + fsparam_string ("delegate_progs", OPT_DELEGATE_PROGS), + fsparam_string ("delegate_attachs", OPT_DELEGATE_ATTACHS), {} }; -struct bpf_mount_opts { - umode_t mode; -}; - static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct bpf_mount_opts *opts = fc->fs_private; + struct bpf_mount_opts *opts = fc->s_fs_info; struct fs_parse_result result; - int opt; + int opt, err; + u64 msk; opt = fs_parse(fc, bpf_fs_parameters, param, &result); if (opt < 0) { @@ -665,6 +692,28 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_MODE: opts->mode = result.uint_32 & S_IALLUGO; break; + case OPT_DELEGATE_CMDS: + case OPT_DELEGATE_MAPS: + case OPT_DELEGATE_PROGS: + case OPT_DELEGATE_ATTACHS: + if (strcmp(param->string, "any") == 0) { + msk = ~0ULL; + } else { + err = kstrtou64(param->string, 0, &msk); + if (err) + return err; + } + /* Setting delegation mount options requires privileges */ + if (msk && !capable(CAP_SYS_ADMIN)) + return -EPERM; + 
switch (opt) { + case OPT_DELEGATE_CMDS: opts->delegate_cmds |= msk; break; + case OPT_DELEGATE_MAPS: opts->delegate_maps |= msk; break; + case OPT_DELEGATE_PROGS: opts->delegate_progs |= msk; break; + case OPT_DELEGATE_ATTACHS: opts->delegate_attachs |= msk; break; + default: return -EINVAL; + } + break; } return 0; @@ -739,10 +788,14 @@ out: static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; - struct bpf_mount_opts *opts = fc->fs_private; + struct bpf_mount_opts *opts = sb->s_fs_info; struct inode *inode; int ret; + /* Mounting an instance of BPF FS requires privileges */ + if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN)) + return -EPERM; + ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -764,7 +817,7 @@ static int bpf_get_tree(struct fs_context *fc) static void bpf_free_fc(struct fs_context *fc) { - kfree(fc->fs_private); + kfree(fc->s_fs_info); } static const struct fs_context_operations bpf_context_ops = { @@ -786,17 +839,32 @@ static int bpf_init_fs_context(struct fs_context *fc) opts->mode = S_IRWXUGO; - fc->fs_private = opts; + /* start out with no BPF token delegation enabled */ + opts->delegate_cmds = 0; + opts->delegate_maps = 0; + opts->delegate_progs = 0; + opts->delegate_attachs = 0; + + fc->s_fs_info = opts; fc->ops = &bpf_context_ops; return 0; } +static void bpf_kill_super(struct super_block *sb) +{ + struct bpf_mount_opts *opts = sb->s_fs_info; + + kill_litter_super(sb); + kfree(opts); +} + static struct file_system_type bpf_fs_type = { .owner = THIS_MODULE, .name = "bpf", .init_fs_context = bpf_init_fs_context, .parameters = bpf_fs_parameters, - .kill_sb = kill_litter_super, + .kill_sb = bpf_kill_super, + .fs_flags = FS_USERNS_MOUNT, }; static int __init bpf_init(void) -- cgit From 4527358b76861dfd64ee34aba45d81648fbc8a61 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:15 -0800 Subject: bpf: introduce BPF token object Add a new kind of BPF kernel object, the BPF token. The BPF token is meant to allow delegating privileged BPF functionality, like loading a BPF program or creating a BPF map, from a privileged process to a *trusted* unprivileged process, all while having a good amount of control over which privileged operations can be performed using the provided BPF token. This is achieved by mounting a BPF FS instance with extra delegation mount options, which determine what operations are delegatable, and also by constraining the token to the owning user namespace (as mentioned in the previous patch). A BPF token itself is just a derivative of BPF FS and can be created through a new bpf() syscall command, BPF_TOKEN_CREATE, which accepts a BPF FS FD; that FD can be obtained through the open() API by opening a BPF FS mount point. Currently, a BPF token "inherits" the delegated command, map type, prog type, and attach type bit sets from BPF FS as is. In the future, with the BPF token being a separate object with its own FD, we can further restrict a token's allowable set of operations, either at creation time or after the fact, allowing a process to guard itself further from unintentionally trying to load undesired kinds of BPF programs. But for now we keep things simple and just copy the bit sets as is.
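As a rough usage illustration of the command described above (a sketch under assumptions, not code from this series), an unprivileged process in the BPF FS owning user namespace could derive a token FD like this, assuming UAPI headers that already carry BPF_TOKEN_CREATE and the token_create fields, and a hypothetical /mnt/bpffs mount point:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	/* Hypothetical helper: derive a BPF token FD from a BPF FS mount point. */
	static int bpf_token_create_fd(const char *bpffs_path)
	{
		union bpf_attr attr;
		int bpffs_fd, token_fd;

		bpffs_fd = open(bpffs_path, O_RDONLY); /* FD of the BPF FS root */
		if (bpffs_fd < 0)
			return -1;

		memset(&attr, 0, sizeof(attr));
		attr.token_create.bpffs_fd = bpffs_fd;
		attr.token_create.flags = 0; /* no flags are supported yet */

		/* Requires ns_capable(CAP_BPF) in the bpffs owning userns. */
		token_fd = syscall(__NR_bpf, BPF_TOKEN_CREATE, &attr, sizeof(attr));
		close(bpffs_fd);
		return token_fd;
	}

The returned FD can then be passed to bpf() commands that accept a token, e.g. as map_token_fd in the BPF_MAP_CREATE support added later in this series.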
When a BPF token is created from a BPF FS mount, we take a reference to the BPF super block's owning user namespace, and then use that namespace for checking all the {CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN} capabilities that are normally only checked against the init userns (using capable()), but are now checked using ns_capable() instead (if a BPF token is provided). See bpf_token_capable() for details. Such a setup means that a BPF token by itself is not sufficient to grant BPF functionality. A user-namespaced process has to *also* have the necessary combination of capabilities inside that user namespace. So while previously CAP_BPF was useless when granted within a user namespace, now it gains meaning and allows container managers and sysadmins flexible control over which processes can and need to use BPF functionality within the user namespace (i.e., a container in practice). And BPF FS delegation mount options and derived BPF tokens serve as a per-container "flag" to grant the overall ability to use bpf() (plus further restrict which parts of the bpf() syscall are treated as namespaced). Note also that the BPF_TOKEN_CREATE command itself requires ns_capable(CAP_BPF) within the BPF FS owning user namespace, rounding out the ns_capable() story of the BPF token. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 41 ++++++++ include/uapi/linux/bpf.h | 37 +++++++ kernel/bpf/Makefile | 2 +- kernel/bpf/inode.c | 12 ++- kernel/bpf/syscall.c | 17 ++++ kernel/bpf/token.c | 214 +++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 37 +++++++ 7 files changed, 354 insertions(+), 6 deletions(-) create mode 100644 kernel/bpf/token.c (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d3c9acc593ea..aa9cf8e5fab1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -51,6 +51,10 @@ struct module; struct bpf_func_state; struct ftrace_ops; struct cgroup; +struct bpf_token; +struct user_namespace; +struct super_block; +struct inode; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1591,6 +1595,13 @@ struct bpf_mount_opts { u64 delegate_attachs; }; +struct bpf_token { + struct work_struct work; + atomic64_t refcnt; + struct user_namespace *userns; + u64 allowed_cmds; +}; + struct bpf_struct_ops_value; struct btf_member; @@ -2048,6 +2059,7 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } +extern const struct super_operations bpf_super_ops; extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; @@ -2182,6 +2194,8 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map) extern int sysctl_unprivileged_bpf_disabled; +bool bpf_token_capable(const struct bpf_token *token, int cap); + static inline bool bpf_allow_ptr_leaks(void) { return perfmon_capable(); @@ -2216,8 +2230,17 @@ int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); +void bpf_token_inc(struct bpf_token *token); +void bpf_token_put(struct bpf_token *token); +int bpf_token_create(union bpf_attr *attr); +struct bpf_token *bpf_token_get_from_fd(u32 ufd); + +bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); + int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user
*pathname, int flags); +struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, + umode_t mode); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) \ @@ -2580,6 +2603,24 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } +static inline bool bpf_token_capable(const struct bpf_token *token, int cap) +{ + return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); +} + +static inline void bpf_token_inc(struct bpf_token *token) +{ +} + +static inline void bpf_token_put(struct bpf_token *token) +{ +} + +static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline void __dev_flush(void) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e88746ba7d21..d4a567e5bc3c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -847,6 +847,36 @@ union bpf_iter_link_info { * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * BPF_TOKEN_CREATE + * Description + * Create BPF token with embedded information about what + * BPF-related functionality it allows: + * - a set of allowed bpf() syscall commands; + * - a set of allowed BPF map types to be created with + * BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed; + * - a set of allowed BPF program types and BPF program attach + * types to be loaded with BPF_PROG_LOAD command, if + * BPF_PROG_LOAD itself is allowed. + * + * BPF token is created (derived) from an instance of BPF FS, + * assuming it has necessary delegation mount options specified. + * This BPF token can be passed as an extra parameter to various + * bpf() syscall commands to grant BPF subsystem functionality to + * unprivileged processes. + * + * When created, BPF token is "associated" with the owning + * user namespace of BPF FS instance (super block) that it was + * derived from, and subsequent BPF operations performed with + * BPF token would be performing capabilities checks (i.e., + * CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within + * that user namespace. Without BPF token, such capabilities + * have to be granted in init user namespace, making bpf() + * syscall incompatible with user namespace, for the most part. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. 
* @@ -901,6 +931,8 @@ enum bpf_cmd { BPF_ITER_CREATE, BPF_LINK_DETACH, BPF_PROG_BIND_MAP, + BPF_TOKEN_CREATE, + __MAX_BPF_CMD, }; enum bpf_map_type { @@ -1712,6 +1744,11 @@ union bpf_attr { __u32 flags; /* extra flags */ } prog_bind_map; + struct { /* struct used by BPF_TOKEN_CREATE command */ + __u32 flags; + __u32 bpffs_fd; + } token_create; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f526b7573e97..4ce95acfcaa7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 220fe0f99095..6ce3f9696e72 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -99,9 +99,9 @@ static const struct inode_operations bpf_prog_iops = { }; static const struct inode_operations bpf_map_iops = { }; static const struct inode_operations bpf_link_iops = { }; -static struct inode *bpf_get_inode(struct super_block *sb, - const struct inode *dir, - umode_t mode) +struct inode *bpf_get_inode(struct super_block *sb, + const struct inode *dir, + umode_t mode) { struct inode *inode; @@ -602,11 +602,13 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) { struct bpf_mount_opts *opts = root->d_sb->s_fs_info; umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; + u64 mask; if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); - if (opts->delegate_cmds == ~0ULL) + mask = (1ULL << __MAX_BPF_CMD) - 1; + if ((opts->delegate_cmds & mask) == mask) seq_printf(m, ",delegate_cmds=any"); else if (opts->delegate_cmds) seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds); @@ -639,7 +641,7 @@ static void bpf_free_inode(struct inode *inode) free_inode_nonrcu(inode); } -static const struct super_operations bpf_super_ops = { +const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ee33a52abf18..a156d549b356 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5377,6 +5377,20 @@ out_prog_put: return ret; } +#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd + +static int token_create(union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_TOKEN_CREATE)) + return -EINVAL; + + /* no flags are supported yet */ + if (attr->token_create.flags) + return -EINVAL; + + return bpf_token_create(attr); +} + static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -5510,6 +5524,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) case BPF_PROG_BIND_MAP: err = bpf_prog_bind_map(&attr); break; + case BPF_TOKEN_CREATE: + err = token_create(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c new file mode 100644 index 000000000000..e18aaecc67e9 --- /dev/null +++ 
b/kernel/bpf/token.c @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +bool bpf_token_capable(const struct bpf_token *token, int cap) +{ + /* BPF token allows ns_capable() level of capabilities, but only if + * token's userns is *exactly* the same as current user's userns + */ + if (token && current_user_ns() == token->userns) { + if (ns_capable(token->userns, cap)) + return true; + if (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN)) + return true; + } + /* otherwise fallback to capable() checks */ + return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); +} + +void bpf_token_inc(struct bpf_token *token) +{ + atomic64_inc(&token->refcnt); +} + +static void bpf_token_free(struct bpf_token *token) +{ + put_user_ns(token->userns); + kvfree(token); +} + +static void bpf_token_put_deferred(struct work_struct *work) +{ + struct bpf_token *token = container_of(work, struct bpf_token, work); + + bpf_token_free(token); +} + +void bpf_token_put(struct bpf_token *token) +{ + if (!token) + return; + + if (!atomic64_dec_and_test(&token->refcnt)) + return; + + INIT_WORK(&token->work, bpf_token_put_deferred); + schedule_work(&token->work); +} + +static int bpf_token_release(struct inode *inode, struct file *filp) +{ + struct bpf_token *token = filp->private_data; + + bpf_token_put(token); + return 0; +} + +static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) +{ + struct bpf_token *token = filp->private_data; + u64 mask; + + BUILD_BUG_ON(__MAX_BPF_CMD >= 64); + mask = (1ULL << __MAX_BPF_CMD) - 1; + if ((token->allowed_cmds & mask) == mask) + seq_printf(m, "allowed_cmds:\tany\n"); + else + seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds); +} + +#define BPF_TOKEN_INODE_NAME "bpf-token" + +static const struct inode_operations bpf_token_iops = { }; + +static const struct file_operations bpf_token_fops = { + .release = bpf_token_release, + .show_fdinfo = bpf_token_show_fdinfo, +}; + +int bpf_token_create(union bpf_attr *attr) +{ + struct bpf_mount_opts *mnt_opts; + struct bpf_token *token = NULL; + struct user_namespace *userns; + struct inode *inode; + struct file *file; + struct path path; + struct fd f; + umode_t mode; + int err, fd; + + f = fdget(attr->token_create.bpffs_fd); + if (!f.file) + return -EBADF; + + path = f.file->f_path; + path_get(&path); + fdput(f); + + if (path.dentry != path.mnt->mnt_sb->s_root) { + err = -EINVAL; + goto out_path; + } + if (path.mnt->mnt_sb->s_op != &bpf_super_ops) { + err = -EINVAL; + goto out_path; + } + err = path_permission(&path, MAY_ACCESS); + if (err) + goto out_path; + + userns = path.dentry->d_sb->s_user_ns; + /* + * Enforce that creators of BPF tokens are in the same user + * namespace as the BPF FS instance. This makes reasoning about + * permissions a lot easier and we can always relax this later. 
+ */ + if (current_user_ns() != userns) { + err = -EPERM; + goto out_path; + } + if (!ns_capable(userns, CAP_BPF)) { + err = -EPERM; + goto out_path; + } + + mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); + inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_path; + } + + inode->i_op = &bpf_token_iops; + inode->i_fop = &bpf_token_fops; + clear_nlink(inode); /* make sure it is unlinked */ + + file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); + if (IS_ERR(file)) { + iput(inode); + err = PTR_ERR(file); + goto out_path; + } + + token = kvzalloc(sizeof(*token), GFP_USER); + if (!token) { + err = -ENOMEM; + goto out_file; + } + + atomic64_set(&token->refcnt, 1); + + /* remember bpffs owning userns for future ns_capable() checks */ + token->userns = get_user_ns(userns); + + mnt_opts = path.dentry->d_sb->s_fs_info; + token->allowed_cmds = mnt_opts->delegate_cmds; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + err = fd; + goto out_token; + } + + file->private_data = token; + fd_install(fd, file); + + path_put(&path); + return fd; + +out_token: + bpf_token_free(token); +out_file: + fput(file); +out_path: + path_put(&path); + return err; +} + +struct bpf_token *bpf_token_get_from_fd(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_token *token; + + if (!f.file) + return ERR_PTR(-EBADF); + if (f.file->f_op != &bpf_token_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + token = f.file->private_data; + bpf_token_inc(token); + fdput(f); + + return token; +} + +bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) +{ + /* BPF token can be used only within exactly the same userns in which + * it was created + */ + if (!token || current_user_ns() != token->userns) + return false; + + return token->allowed_cmds & (1ULL << cmd); +} diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e88746ba7d21..d4a567e5bc3c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -847,6 +847,36 @@ union bpf_iter_link_info { * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * BPF_TOKEN_CREATE + * Description + * Create BPF token with embedded information about what + * BPF-related functionality it allows: + * - a set of allowed bpf() syscall commands; + * - a set of allowed BPF map types to be created with + * BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed; + * - a set of allowed BPF program types and BPF program attach + * types to be loaded with BPF_PROG_LOAD command, if + * BPF_PROG_LOAD itself is allowed. + * + * BPF token is created (derived) from an instance of BPF FS, + * assuming it has necessary delegation mount options specified. + * This BPF token can be passed as an extra parameter to various + * bpf() syscall commands to grant BPF subsystem functionality to + * unprivileged processes. + * + * When created, BPF token is "associated" with the owning + * user namespace of BPF FS instance (super block) that it was + * derived from, and subsequent BPF operations performed with + * BPF token would be performing capabilities checks (i.e., + * CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within + * that user namespace. Without BPF token, such capabilities + * have to be granted in init user namespace, making bpf() + * syscall incompatible with user namespace, for the most part. 
+ * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -901,6 +931,8 @@ enum bpf_cmd { BPF_ITER_CREATE, BPF_LINK_DETACH, BPF_PROG_BIND_MAP, + BPF_TOKEN_CREATE, + __MAX_BPF_CMD, }; enum bpf_map_type { @@ -1712,6 +1744,11 @@ union bpf_attr { __u32 flags; /* extra flags */ } prog_bind_map; + struct { /* struct used by BPF_TOKEN_CREATE command */ + __u32 flags; + __u32 bpffs_fd; + } token_create; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit From 688b7270b3cb75e8ac78123d719967db40336e5b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:16 -0800 Subject: bpf: add BPF token support to BPF_MAP_CREATE command Allow providing token_fd for the BPF_MAP_CREATE command to allow controlled BPF map creation from an unprivileged process through a delegated BPF token. Wire through a set of allowed BPF map types to the BPF token, derived from BPF FS at BPF token creation time. This, in combination with allowed_cmds, allows creating a narrowly focused BPF token (controlled by a privileged agent) with a restrictive set of BPF maps that an application can attempt to create. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 + include/uapi/linux/bpf.h | 2 + kernel/bpf/inode.c | 3 +- kernel/bpf/syscall.c | 52 ++++++++++++++++------ kernel/bpf/token.c | 16 +++++++ tools/include/uapi/linux/bpf.h | 2 + .../selftests/bpf/prog_tests/libbpf_probes.c | 2 + .../testing/selftests/bpf/prog_tests/libbpf_str.c | 3 ++ 8 files changed, 67 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aa9cf8e5fab1..e08e8436df38 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1600,6 +1600,7 @@ struct bpf_token { atomic64_t refcnt; struct user_namespace *userns; u64 allowed_cmds; + u64 allowed_maps; }; struct bpf_struct_ops_value; @@ -2236,6 +2237,7 @@ int bpf_token_create(union bpf_attr *attr); struct bpf_token *bpf_token_get_from_fd(u32 ufd); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); +bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d4a567e5bc3c..0bba3392b17a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -983,6 +983,7 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, + __MAX_BPF_MAP_TYPE }; /* Note that tracing related programs such as @@ -1433,6 +1434,7 @@ union bpf_attr { * to using 5 hash functions).
*/ __u64 map_extra; + __u32 map_token_fd; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 6ce3f9696e72..9c7865d1c53d 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -613,7 +613,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) else if (opts->delegate_cmds) seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds); - if (opts->delegate_maps == ~0ULL) + mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; + if ((opts->delegate_maps & mask) == mask) seq_printf(m, ",delegate_maps=any"); else if (opts->delegate_maps) seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a156d549b356..22e14124cd61 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1009,8 +1009,8 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_check_btf(struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) +static int map_check_btf(struct bpf_map *map, struct bpf_token *token, + const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { const struct btf_type *key_type, *value_type; u32 key_size, value_size; @@ -1038,7 +1038,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (!IS_ERR_OR_NULL(map->record)) { int i; - if (!bpf_capable()) { + if (!bpf_token_capable(token, CAP_BPF)) { ret = -EPERM; goto free_map_tab; } @@ -1126,11 +1126,12 @@ static bool bpf_net_capable(void) return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); } -#define BPF_MAP_CREATE_LAST_FIELD map_extra +#define BPF_MAP_CREATE_LAST_FIELD map_token_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { const struct bpf_map_ops *ops; + struct bpf_token *token = NULL; int numa_node = bpf_map_attr_numa_node(attr); u32 map_type = attr->map_type; struct bpf_map *map; @@ -1181,14 +1182,32 @@ static int map_create(union bpf_attr *attr) if (!ops->map_mem_usage) return -EINVAL; + if (attr->map_token_fd) { + token = bpf_token_get_from_fd(attr->map_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + + /* if current token doesn't grant map creation permissions, + * then we can't use this token, so ignore it and rely on + * system-wide capabilities checks + */ + if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) || + !bpf_token_allow_map_type(token, attr->map_type)) { + bpf_token_put(token); + token = NULL; + } + } + + err = -EPERM; + /* Intent here is for unprivileged_bpf_disabled to block BPF map * creation for unprivileged users; other actions depend * on fd availability and access to bpffs, so are dependent on * object creation success. Even with unprivileged BPF disabled, * capability checks are still carried out. 
*/ - if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) - return -EPERM; + if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF)) + goto put_token; /* check privileged map type permissions */ switch (map_type) { @@ -1221,25 +1240,27 @@ static int map_create(union bpf_attr *attr) case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: - if (!bpf_capable()) - return -EPERM; + if (!bpf_token_capable(token, CAP_BPF)) + goto put_token; break; case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: case BPF_MAP_TYPE_XSKMAP: - if (!bpf_net_capable()) - return -EPERM; + if (!bpf_token_capable(token, CAP_NET_ADMIN)) + goto put_token; break; default: WARN(1, "unsupported map type %d", map_type); - return -EPERM; + goto put_token; } map = ops->map_alloc(attr); - if (IS_ERR(map)) - return PTR_ERR(map); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto put_token; + } map->ops = ops; map->map_type = map_type; @@ -1276,7 +1297,7 @@ static int map_create(union bpf_attr *attr) map->btf = btf; if (attr->btf_value_type_id) { - err = map_check_btf(map, btf, attr->btf_key_type_id, + err = map_check_btf(map, token, btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) goto free_map; @@ -1297,6 +1318,7 @@ static int map_create(union bpf_attr *attr) goto free_map_sec; bpf_map_save_memcg(map); + bpf_token_put(token); err = bpf_map_new_fd(map, f_flags); if (err < 0) { @@ -1317,6 +1339,8 @@ free_map_sec: free_map: btf_put(map->btf); map->ops->map_free(map); +put_token: + bpf_token_put(token); return err; } diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index e18aaecc67e9..06c34dae658e 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -72,6 +72,13 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) seq_printf(m, "allowed_cmds:\tany\n"); else seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds); + + BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64); + mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; + if ((token->allowed_maps & mask) == mask) + seq_printf(m, "allowed_maps:\tany\n"); + else + seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps); } #define BPF_TOKEN_INODE_NAME "bpf-token" @@ -161,6 +168,7 @@ int bpf_token_create(union bpf_attr *attr) mnt_opts = path.dentry->d_sb->s_fs_info; token->allowed_cmds = mnt_opts->delegate_cmds; + token->allowed_maps = mnt_opts->delegate_maps; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { @@ -212,3 +220,11 @@ bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) return token->allowed_cmds & (1ULL << cmd); } + +bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type) +{ + if (!token || type >= __MAX_BPF_MAP_TYPE) + return false; + + return token->allowed_maps & (1ULL << type); +} diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d4a567e5bc3c..0bba3392b17a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -983,6 +983,7 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, + __MAX_BPF_MAP_TYPE }; /* Note that tracing related programs such as @@ -1433,6 +1434,7 @@ union bpf_attr { * to using 5 hash functions). 
 */
	__u64	map_extra;
+	__u32	map_token_fd;
 };

 struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
index 9f766ddd946a..573249a2814d 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
@@ -68,6 +68,8 @@ void test_libbpf_probe_map_types(void)
 		if (map_type == BPF_MAP_TYPE_UNSPEC)
 			continue;

+		if (strcmp(map_type_name, "__MAX_BPF_MAP_TYPE") == 0)
+			continue;
 		if (!test__start_subtest(map_type_name))
 			continue;

diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
index c440ea3311ed..2a0633f43c73 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
@@ -132,6 +132,9 @@ static void test_libbpf_bpf_map_type_str(void)
 		const char *map_type_str;
 		char buf[256];

+		if (map_type == __MAX_BPF_MAP_TYPE)
+			continue;
+
 		map_type_name = btf__str_by_offset(btf, e->name_off);
 		map_type_str = libbpf_bpf_map_type_str(map_type);
 		ASSERT_OK_PTR(map_type_str, map_type_name);
-- cgit

From ee54b1a910e4d49c9a104f31ae3f5b979131adf8 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 30 Nov 2023 10:52:17 -0800
Subject: bpf: add BPF token support to BPF_BTF_LOAD command

Accept a BPF token FD in the BPF_BTF_LOAD command to allow BTF data
loading through a delegated BPF token. BTF loading is a pretty
straightforward operation, so as long as the BPF token is created with
allowed_cmds granting the BPF_BTF_LOAD command, the kernel proceeds to
parsing BTF data and creating a BTF object.

Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20231130185229.2688956-6-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           | 20 ++++++++++++++++++--
 tools/include/uapi/linux/bpf.h |  1 +
 3 files changed, 20 insertions(+), 2 deletions(-)
(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0bba3392b17a..9f9989e0d062 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1616,6 +1616,7 @@ union bpf_attr {
 	 * truncated), or smaller (if log buffer wasn't filled completely).
*/ __u32 btf_log_true_size; + __u32 btf_token_fd; }; struct { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 22e14124cd61..d87c5c27cde3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4777,15 +4777,31 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return err; } -#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size +#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { + struct bpf_token *token = NULL; + if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!bpf_capable()) + if (attr->btf_token_fd) { + token = bpf_token_get_from_fd(attr->btf_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) { + bpf_token_put(token); + token = NULL; + } + } + + if (!bpf_token_capable(token, CAP_BPF)) { + bpf_token_put(token); return -EPERM; + } + + bpf_token_put(token); return btf_new_fd(attr, uattr, uattr_size); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0bba3392b17a..9f9989e0d062 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1616,6 +1616,7 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). */ __u32 btf_log_true_size; + __u32 btf_token_fd; }; struct { -- cgit From e1cef620f598853a90f17701fcb1057a6768f7b8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:18 -0800 Subject: bpf: add BPF token support to BPF_PROG_LOAD command Add basic support of BPF token to BPF_PROG_LOAD. Wire through a set of allowed BPF program types and attach types, derived from BPF FS at BPF token creation time. Then make sure we perform bpf_token_capable() checks everywhere where it's relevant. 
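To illustrate the resulting flow end to end, a minimal userspace sketch
could look like the code below. It assumes a privileged agent has already
mounted a BPF FS instance at /mnt/bpffs (an illustrative path) with
delegate_cmds/delegate_progs/delegate_attachs mount options covering what
is used here; the snippet is illustrative only and not part of this patch:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr)
	{
		return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
	}

	static int load_prog_with_token(const struct bpf_insn *insns, __u32 insn_cnt)
	{
		union bpf_attr attr;
		int bpffs_fd, token_fd, prog_fd;

		/* bpffs mount point prepared by a privileged agent (assumed path) */
		bpffs_fd = open("/mnt/bpffs", O_RDONLY);
		if (bpffs_fd < 0)
			return -1;

		memset(&attr, 0, sizeof(attr));
		attr.token_create.bpffs_fd = bpffs_fd;
		token_fd = sys_bpf(BPF_TOKEN_CREATE, &attr);
		close(bpffs_fd);
		if (token_fd < 0)
			return -1;

		memset(&attr, 0, sizeof(attr));
		attr.prog_type = BPF_PROG_TYPE_XDP;	/* must be in delegate_progs */
		attr.insns = (__u64)(unsigned long)insns;
		attr.insn_cnt = insn_cnt;
		attr.license = (__u64)(unsigned long)"GPL";
		attr.prog_token_fd = token_fd;		/* new field from this patch */
		prog_fd = sys_bpf(BPF_PROG_LOAD, &attr);
		close(token_fd);	/* prog keeps its own token reference */
		return prog_fd;
	}

If the token does not actually delegate the requested functionality, the
kernel ignores it and falls back to the usual system-wide capability
checks, so passing a too-narrow token degrades gracefully.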
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++ include/uapi/linux/bpf.h | 2 + kernel/bpf/core.c | 1 + kernel/bpf/inode.c | 6 +- kernel/bpf/syscall.c | 87 ++++++++++++++++------ kernel/bpf/token.c | 27 +++++++ tools/include/uapi/linux/bpf.h | 2 + .../selftests/bpf/prog_tests/libbpf_probes.c | 2 + .../testing/selftests/bpf/prog_tests/libbpf_str.c | 3 + 9 files changed, 110 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e08e8436df38..20af87b59d70 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1461,6 +1461,7 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif + struct bpf_token *token; struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; @@ -1601,6 +1602,8 @@ struct bpf_token { struct user_namespace *userns; u64 allowed_cmds; u64 allowed_maps; + u64 allowed_progs; + u64 allowed_attachs; }; struct bpf_struct_ops_value; @@ -2238,6 +2241,9 @@ struct bpf_token *bpf_token_get_from_fd(u32 ufd); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); +bool bpf_token_allow_prog_type(const struct bpf_token *token, + enum bpf_prog_type prog_type, + enum bpf_attach_type attach_type); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9f9989e0d062..4df2d025c784 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1028,6 +1028,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ BPF_PROG_TYPE_NETFILTER, + __MAX_BPF_PROG_TYPE }; enum bpf_attach_type { @@ -1504,6 +1505,7 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). 
*/ __u32 log_true_size; + __u32 prog_token_fd; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4b813da8d6c0..47085839af8d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2751,6 +2751,7 @@ void bpf_prog_free(struct bpf_prog *fp) if (aux->dst_prog) bpf_prog_put(aux->dst_prog); + bpf_token_put(aux->token); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9c7865d1c53d..5359a0929c35 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -619,12 +619,14 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) else if (opts->delegate_maps) seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps); - if (opts->delegate_progs == ~0ULL) + mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; + if ((opts->delegate_progs & mask) == mask) seq_printf(m, ",delegate_progs=any"); else if (opts->delegate_progs) seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs); - if (opts->delegate_attachs == ~0ULL) + mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; + if ((opts->delegate_attachs & mask) == mask) seq_printf(m, ",delegate_attachs=any"); else if (opts->delegate_attachs) seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d87c5c27cde3..2c8393c21b8c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2608,13 +2608,15 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD log_true_size +#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; struct btf *attach_btf = NULL; + struct bpf_token *token = NULL; + bool bpf_cap; int err; char license[128]; @@ -2631,10 +2633,31 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) BPF_F_TEST_REG_INVARIANTS)) return -EINVAL; + bpf_prog_load_fixup_attach_type(attr); + + if (attr->prog_token_fd) { + token = bpf_token_get_from_fd(attr->prog_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + /* if current token doesn't grant prog loading permissions, + * then we can't use this token, so ignore it and rely on + * system-wide capabilities checks + */ + if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) || + !bpf_token_allow_prog_type(token, attr->prog_type, + attr->expected_attach_type)) { + bpf_token_put(token); + token = NULL; + } + } + + bpf_cap = bpf_token_capable(token, CAP_BPF); + err = -EPERM; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !bpf_capable()) - return -EPERM; + !bpf_cap) + goto put_token; /* Intent here is for unprivileged_bpf_disabled to block BPF program * creation for unprivileged users; other actions depend @@ -2643,21 +2666,23 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) * capability checks are still carried out for these * and other operations. */ - if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) - return -EPERM; + if (sysctl_unprivileged_bpf_disabled && !bpf_cap) + goto put_token; if (attr->insn_cnt == 0 || - attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) - return -E2BIG; + attr->insn_cnt > (bpf_cap ? 
BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
+		err = -E2BIG;
+		goto put_token;
+	}
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
-	    !bpf_capable())
-		return -EPERM;
+	    !bpf_cap)
+		goto put_token;

-	if (is_net_admin_prog_type(type) && !bpf_net_capable())
-		return -EPERM;
-	if (is_perfmon_prog_type(type) && !perfmon_capable())
-		return -EPERM;
+	if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
+		goto put_token;
+	if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
+		goto put_token;

 	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
 	 * or btf, we need to check which one it is
@@ -2667,27 +2692,33 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 		if (IS_ERR(dst_prog)) {
 			dst_prog = NULL;
 			attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
-			if (IS_ERR(attach_btf))
-				return -EINVAL;
+			if (IS_ERR(attach_btf)) {
+				err = -EINVAL;
+				goto put_token;
+			}
 			if (!btf_is_kernel(attach_btf)) {
 				/* attaching through specifying bpf_prog's BTF
 				 * objects directly might be supported eventually
 				 */
 				btf_put(attach_btf);
-				return -ENOTSUPP;
+				err = -ENOTSUPP;
+				goto put_token;
 			}
 		}
 	} else if (attr->attach_btf_id) {
 		/* fall back to vmlinux BTF, if BTF type ID is specified */
 		attach_btf = bpf_get_btf_vmlinux();
-		if (IS_ERR(attach_btf))
-			return PTR_ERR(attach_btf);
-		if (!attach_btf)
-			return -EINVAL;
+		if (IS_ERR(attach_btf)) {
+			err = PTR_ERR(attach_btf);
+			goto put_token;
+		}
+		if (!attach_btf) {
+			err = -EINVAL;
+			goto put_token;
+		}
 		btf_get(attach_btf);
 	}

-	bpf_prog_load_fixup_attach_type(attr);
 	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
 				       attach_btf, attr->attach_btf_id,
 				       dst_prog)) {
@@ -2695,7 +2726,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 		if (dst_prog)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
 			btf_put(attach_btf);
-		return -EINVAL;
+		err = -EINVAL;
+		goto put_token;
 	}

 	/* plain bpf_prog allocation */
@@ -2705,7 +2737,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
 			btf_put(attach_btf);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto put_token;
 	}

 	prog->expected_attach_type = attr->expected_attach_type;
@@ -2716,6 +2749,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
 	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;

+	/* move token into prog->aux, reuse taken refcnt */
+	prog->aux->token = token;
+	token = NULL;
+
 	err = security_bpf_prog_alloc(prog->aux);
 	if (err)
 		goto free_prog;
@@ -2817,6 +2854,8 @@ free_prog:
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 	bpf_prog_free(prog);
+put_token:
+	bpf_token_put(token);
 	return err;
 }

@@ -3806,7 +3845,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
 	case BPF_PROG_TYPE_CGROUP_SKB:
-		if (!bpf_net_capable())
+		if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
 			/* cg-skb progs can be loaded by unpriv user.
 			 * check permissions at attach time.
*/ diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 06c34dae658e..5a51e6b8f6bf 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -79,6 +79,20 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) seq_printf(m, "allowed_maps:\tany\n"); else seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps); + + BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64); + mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; + if ((token->allowed_progs & mask) == mask) + seq_printf(m, "allowed_progs:\tany\n"); + else + seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs); + + BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64); + mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; + if ((token->allowed_attachs & mask) == mask) + seq_printf(m, "allowed_attachs:\tany\n"); + else + seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs); } #define BPF_TOKEN_INODE_NAME "bpf-token" @@ -169,6 +183,8 @@ int bpf_token_create(union bpf_attr *attr) mnt_opts = path.dentry->d_sb->s_fs_info; token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; + token->allowed_progs = mnt_opts->delegate_progs; + token->allowed_attachs = mnt_opts->delegate_attachs; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { @@ -228,3 +244,14 @@ bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type t return token->allowed_maps & (1ULL << type); } + +bool bpf_token_allow_prog_type(const struct bpf_token *token, + enum bpf_prog_type prog_type, + enum bpf_attach_type attach_type) +{ + if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE) + return false; + + return (token->allowed_progs & (1ULL << prog_type)) && + (token->allowed_attachs & (1ULL << attach_type)); +} diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9f9989e0d062..4df2d025c784 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1028,6 +1028,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ BPF_PROG_TYPE_NETFILTER, + __MAX_BPF_PROG_TYPE }; enum bpf_attach_type { @@ -1504,6 +1505,7 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). 
*/ __u32 log_true_size; + __u32 prog_token_fd; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c index 573249a2814d..4ed46ed58a7b 100644 --- a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c +++ b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c @@ -30,6 +30,8 @@ void test_libbpf_probe_prog_types(void) if (prog_type == BPF_PROG_TYPE_UNSPEC) continue; + if (strcmp(prog_type_name, "__MAX_BPF_PROG_TYPE") == 0) + continue; if (!test__start_subtest(prog_type_name)) continue; diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c index 2a0633f43c73..384bc1f7a65e 100644 --- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c +++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c @@ -189,6 +189,9 @@ static void test_libbpf_bpf_prog_type_str(void) const char *prog_type_str; char buf[256]; + if (prog_type == __MAX_BPF_PROG_TYPE) + continue; + prog_type_name = btf__str_by_offset(btf, e->name_off); prog_type_str = libbpf_bpf_prog_type_str(prog_type); ASSERT_OK_PTR(prog_type_str, prog_type_name); -- cgit From 4cbb270e115bc197ff2046aeb54cc951666b16ec Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:19 -0800 Subject: bpf: take into account BPF token when fetching helper protos Instead of performing unconditional system-wide bpf_capable() and perfmon_capable() calls inside bpf_base_func_proto() function (and other similar ones) to determine eligibility of a given BPF helper for a given program, use previously recorded BPF token during BPF_PROG_LOAD command handling to inform the decision. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- drivers/media/rc/bpf-lirc.c | 2 +- include/linux/bpf.h | 5 +++-- kernel/bpf/cgroup.c | 6 +++--- kernel/bpf/helpers.c | 6 +++--- kernel/bpf/syscall.c | 5 +++-- kernel/trace/bpf_trace.c | 2 +- net/core/filter.c | 32 ++++++++++++++++---------------- net/ipv4/bpf_tcp_ca.c | 2 +- net/netfilter/nf_bpf_link.c | 2 +- 9 files changed, 32 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index fe17c7f98e81..6d07693c6b9f 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_trace_printk: - if (perfmon_capable()) + if (bpf_token_capable(prog->aux->token, CAP_PERFMON)) return bpf_get_trace_printk_proto(); fallthrough; default: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 20af87b59d70..2a3ab4f3dd8c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2492,7 +2492,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); -const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog); void bpf_task_storage_free(struct task_struct *task); void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); @@ -2752,7 +2753,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log, } 
static inline const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) +bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 491d20038cbe..98e0e3835b28 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1630,7 +1630,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -2191,7 +2191,7 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -2348,7 +2348,7 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ee9bdf29246a..b3be5742d6f1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1679,7 +1679,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) +bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -1730,7 +1730,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return NULL; switch (func_id) { @@ -1788,7 +1788,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!perfmon_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; switch (func_id) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c8393c21b8c..1cc03f08c9cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5712,7 +5712,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = { const struct bpf_func_proto * __weak tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } BPF_CALL_1(bpf_sys_close, u32, fd) @@ -5762,7 +5762,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sys_bpf: - return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto; + return !bpf_token_capable(prog->aux->token, CAP_PERFMON) + ? 
NULL : &bpf_sys_bpf_proto; case BPF_FUNC_btf_find_by_name_kind: return &bpf_btf_find_by_name_kind_proto; case BPF_FUNC_sys_close: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1648bde28f01..774cf476a892 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1626,7 +1626,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } diff --git a/net/core/filter.c b/net/core/filter.c index 0adaa4afa35f..0bf2a03d8203 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,7 +87,7 @@ #include "dev.h" static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id); +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { @@ -7841,7 +7841,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -7934,7 +7934,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -7953,7 +7953,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8140,7 +8140,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8199,7 +8199,7 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) @@ -8260,7 +8260,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8302,7 +8302,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cgroup_classid_curr_proto; #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8346,7 +8346,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_lookup_tcp_proto; #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8357,7 +8357,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8384,7 +8384,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -11215,7 +11215,7 @@ 
sk_reuseport_func_proto(enum bpf_func_id func_id, case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -11397,7 +11397,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -11731,7 +11731,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = { }; static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id) +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func; @@ -11760,10 +11760,10 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } - if (!perfmon_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; return func; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 39dcccf0f174..c7bbd8f3c708 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -191,7 +191,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index e502ec00b2fe..1969facac91c 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -314,7 +314,7 @@ static bool nf_is_valid_access(int off, int size, enum bpf_access_type type, static const struct bpf_func_proto * bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } const struct bpf_verifier_ops netfilter_verifier_ops = { -- cgit From 8062fb12de99b2da33754c6a3be1bfc30d9a35f4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:20 -0800 Subject: bpf: consistently use BPF token throughout BPF verifier logic Remove remaining direct queries to perfmon_capable() and bpf_capable() in BPF verifier logic and instead use BPF token (if available) to make decisions about privileges. 
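For reference, the privilege decision all these call sites now funnel
through is bpf_token_capable(); condensed into a sketch (not the literal
implementation, and omitting the LSM veto added later in this series),
its logic is roughly:

	static bool token_capable_sketch(const struct bpf_token *token, int cap)
	{
		/* a token only speaks for the userns it was created in */
		if (token && current_user_ns() == token->userns)
			return ns_capable(token->userns, cap) ||
			       (cap != CAP_SYS_ADMIN &&
				ns_capable(token->userns, CAP_SYS_ADMIN));

		/* no (usable) token: fall back to system-wide checks */
		return capable(cap) ||
		       (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
	}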
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 ++++++++-------- include/linux/filter.h | 2 +- kernel/bpf/arraymap.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/verifier.c | 13 ++++++------- net/core/filter.c | 4 ++-- 6 files changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2a3ab4f3dd8c..435abad3cc61 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2200,24 +2200,24 @@ extern int sysctl_unprivileged_bpf_disabled; bool bpf_token_capable(const struct bpf_token *token, int cap); -static inline bool bpf_allow_ptr_leaks(void) +static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token) { - return perfmon_capable(); + return bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_allow_uninit_stack(void) +static inline bool bpf_allow_uninit_stack(const struct bpf_token *token) { - return perfmon_capable(); + return bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_bypass_spec_v1(void) +static inline bool bpf_bypass_spec_v1(const struct bpf_token *token) { - return cpu_mitigations_off() || perfmon_capable(); + return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_bypass_spec_v4(void) +static inline bool bpf_bypass_spec_v4(const struct bpf_token *token) { - return cpu_mitigations_off() || perfmon_capable(); + return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } int bpf_map_new_fd(struct bpf_map *map, int flags); diff --git a/include/linux/filter.h b/include/linux/filter.h index a4953fafc8cb..14354605ad26 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && bpf_capable()) + if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF)) return false; return true; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 4a4a67956e21..8d365bda9a8b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -82,7 +82,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool bypass_spec_v1 = bpf_bypass_spec_v1(); + bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL); u64 array_size, mask64; struct bpf_array *array; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 47085839af8d..ced511f44174 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -675,7 +675,7 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !bpf_capable()) + !bpf_token_capable(fp->aux->token, CAP_BPF)) return; bpf_prog_ksym_set_addr(fp); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e5ce530641ba..45e85fb76d82 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20597,7 +20597,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel); - is_priv = bpf_capable(); + + env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token); + env->allow_uninit_stack = 
bpf_allow_uninit_stack(env->prog->aux->token); + env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token); + env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token); + env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF); bpf_get_btf_vmlinux(); @@ -20629,12 +20634,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = bpf_allow_ptr_leaks(); - env->allow_uninit_stack = bpf_allow_uninit_stack(); - env->bypass_spec_v1 = bpf_bypass_spec_v1(); - env->bypass_spec_v4 = bpf_bypass_spec_v4(); - env->bpf_capable = bpf_capable(); - if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; diff --git a/net/core/filter.c b/net/core/filter.c index 0bf2a03d8203..adcfc2c25754 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8559,7 +8559,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; } @@ -8571,7 +8571,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; default: -- cgit From c3dd6e94df7193f33f45d33303f5e85afb2a72dc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:21 -0800 Subject: bpf,lsm: refactor bpf_prog_alloc/bpf_prog_free LSM hooks Based on upstream discussion ([0]), rework existing bpf_prog_alloc_security LSM hook. Rename it to bpf_prog_load and instead of passing bpf_prog_aux, pass proper bpf_prog pointer for a full BPF program struct. Also, we pass bpf_attr union with all the user-provided arguments for BPF_PROG_LOAD command. This will give LSMs as much information as we can basically provide. The hook is also BPF token-aware now, and optional bpf_token struct is passed as a third argument. bpf_prog_load LSM hook is called after a bunch of sanity checks were performed, bpf_prog and bpf_prog_aux were allocated and filled out, but right before performing full-fledged BPF verification step. bpf_prog_free LSM hook is now accepting struct bpf_prog argument, for consistency. SELinux code is adjusted to all new names, types, and signatures. Note, given that bpf_prog_load (previously bpf_prog_alloc) hook can be used by some LSMs to allocate extra security blob, but also by other LSMs to reject BPF program loading, we need to make sure that bpf_prog_free LSM hook is called after bpf_prog_load/bpf_prog_alloc one *even* if the hook itself returned error. If we don't do that, we run the risk of leaking memory. This seems to be possible today when combining SELinux and BPF LSM, as one example, depending on their relative ordering. Also, for BPF LSM setup, add bpf_prog_load and bpf_prog_free to sleepable LSM hooks list, as they are both executed in sleepable context. Also drop bpf_prog_load hook from untrusted, as there is no issue with refcount or anything else anymore, that originally forced us to add it to untrusted list in c0c852dd1876 ("bpf: Do not mark certain LSM hook arguments as trusted"). We now trigger this hook much later and it should not be an issue anymore. 
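To make the required pairing concrete, the error handling in
bpf_prog_load() after this patch is shaped roughly like this (a sketch of
the flow, not the literal diff):

	err = security_bpf_prog_load(prog, attr, token);
	if (err)
		goto free_prog_sec;

	/* ... verifier and the rest of program setup ... */

free_prog_sec:
	/* reached even when the load hook itself failed, so an LSM that
	 * already allocated its blob gets a chance to free it */
	security_bpf_prog_free(prog);
free_prog:
	free_uid(prog->aux->user);
	/* ... */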
[0] https://lore.kernel.org/bpf/9fe88aef7deabbe87d3fc38c4aea3c69.paul@paul-moore.com/ Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/lsm_hook_defs.h | 5 +++-- include/linux/security.h | 12 +++++++----- kernel/bpf/bpf_lsm.c | 5 +++-- kernel/bpf/syscall.c | 25 +++++++++++++------------ security/security.c | 25 +++++++++++++++---------- security/selinux/hooks.c | 15 ++++++++------- 6 files changed, 49 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ff217a5ce552..41ec4a7c070e 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -400,8 +400,9 @@ LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) -LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux) -LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux) +LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token) +LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h b/include/linux/security.h index 1d1df326c881..65467eef6678 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2020,15 +2020,16 @@ static inline void securityfs_remove(struct dentry *dentry) union bpf_attr; struct bpf_map; struct bpf_prog; -struct bpf_prog_aux; +struct bpf_token; #ifdef CONFIG_SECURITY extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); extern int security_bpf_map_alloc(struct bpf_map *map); extern void security_bpf_map_free(struct bpf_map *map); -extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux); -extern void security_bpf_prog_free(struct bpf_prog_aux *aux); +extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token); +extern void security_bpf_prog_free(struct bpf_prog *prog); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2054,12 +2055,13 @@ static inline int security_bpf_map_alloc(struct bpf_map *map) static inline void security_bpf_map_free(struct bpf_map *map) { } -static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux) +static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token) { return 0; } -static inline void security_bpf_prog_free(struct bpf_prog_aux *aux) +static inline void security_bpf_prog_free(struct bpf_prog *prog) { } #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index e14c822f8911..3e956f6302f3 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -263,6 +263,8 @@ BTF_ID(func, bpf_lsm_bpf_map) BTF_ID(func, bpf_lsm_bpf_map_alloc_security) BTF_ID(func, bpf_lsm_bpf_map_free_security) BTF_ID(func, bpf_lsm_bpf_prog) +BTF_ID(func, bpf_lsm_bpf_prog_load) +BTF_ID(func, bpf_lsm_bpf_prog_free) BTF_ID(func, bpf_lsm_bprm_check_security) BTF_ID(func, bpf_lsm_bprm_committed_creds) BTF_ID(func, 
bpf_lsm_bprm_committing_creds) @@ -346,8 +348,7 @@ BTF_SET_END(sleepable_lsm_hooks) BTF_SET_START(untrusted_lsm_hooks) BTF_ID(func, bpf_lsm_bpf_map_free_security) -BTF_ID(func, bpf_lsm_bpf_prog_alloc_security) -BTF_ID(func, bpf_lsm_bpf_prog_free_security) +BTF_ID(func, bpf_lsm_bpf_prog_free) BTF_ID(func, bpf_lsm_file_alloc_security) BTF_ID(func, bpf_lsm_file_free_security) #ifdef CONFIG_SECURITY_NETWORK diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1cc03f08c9cd..7717c7c7b95d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2162,7 +2162,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); free_uid(aux->user); - security_bpf_prog_free(aux); + security_bpf_prog_free(aux->prog); bpf_prog_free(aux->prog); } @@ -2753,10 +2753,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) prog->aux->token = token; token = NULL; - err = security_bpf_prog_alloc(prog->aux); - if (err) - goto free_prog; - prog->aux->user = get_current_user(); prog->len = attr->insn_cnt; @@ -2764,12 +2760,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (copy_from_bpfptr(prog->insns, make_bpfptr(attr->insns, uattr.is_kernel), bpf_prog_insn_size(prog)) != 0) - goto free_prog_sec; + goto free_prog; /* copy eBPF program license from user space */ if (strncpy_from_bpfptr(license, make_bpfptr(attr->license, uattr.is_kernel), sizeof(license) - 1) < 0) - goto free_prog_sec; + goto free_prog; license[sizeof(license) - 1] = 0; /* eBPF programs must be GPL compatible to use GPL-ed functions */ @@ -2783,25 +2779,29 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_dev_bound_init(prog, attr); if (err) - goto free_prog_sec; + goto free_prog; } if (type == BPF_PROG_TYPE_EXT && dst_prog && bpf_prog_is_dev_bound(dst_prog->aux)) { err = bpf_prog_dev_bound_inherit(prog, dst_prog); if (err) - goto free_prog_sec; + goto free_prog; } /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) - goto free_prog_sec; + goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) + goto free_prog; + + err = security_bpf_prog_load(prog, attr, token); + if (err) goto free_prog_sec; /* run eBPF verifier */ @@ -2847,10 +2847,11 @@ free_used_maps: */ __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; + free_prog_sec: - free_uid(prog->aux->user); - security_bpf_prog_free(prog->aux); + security_bpf_prog_free(prog); free_prog: + free_uid(prog->aux->user); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); bpf_prog_free(prog); diff --git a/security/security.c b/security/security.c index dcb3e7014f9b..c8a1c66cfaad 100644 --- a/security/security.c +++ b/security/security.c @@ -5180,16 +5180,21 @@ int security_bpf_map_alloc(struct bpf_map *map) } /** - * security_bpf_prog_alloc() - Allocate a bpf program LSM blob - * @aux: bpf program aux info struct + * security_bpf_prog_load() - Check if loading of BPF program is allowed + * @prog: BPF program object + * @attr: BPF syscall attributes used to create BPF program + * @token: BPF token used to grant user access to BPF subsystem * - * Initialize the security field inside bpf program. + * Perform an access control check when the kernel loads a BPF program and + * allocates associated BPF program object. 
This hook is also responsible for + * allocating any required LSM state for the BPF program. * * Return: Returns 0 on success, error on failure. */ -int security_bpf_prog_alloc(struct bpf_prog_aux *aux) +int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token) { - return call_int_hook(bpf_prog_alloc_security, 0, aux); + return call_int_hook(bpf_prog_load, 0, prog, attr, token); } /** @@ -5204,14 +5209,14 @@ void security_bpf_map_free(struct bpf_map *map) } /** - * security_bpf_prog_free() - Free a bpf program's LSM blob - * @aux: bpf program aux info struct + * security_bpf_prog_free() - Free a BPF program's LSM blob + * @prog: BPF program struct * - * Clean up the security information stored inside bpf prog. + * Clean up the security information stored inside BPF program. */ -void security_bpf_prog_free(struct bpf_prog_aux *aux) +void security_bpf_prog_free(struct bpf_prog *prog) { - call_void_hook(bpf_prog_free_security, aux); + call_void_hook(bpf_prog_free, prog); } #endif /* CONFIG_BPF_SYSCALL */ diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index feda711c6b7b..eabee39e983c 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6805,7 +6805,8 @@ static void selinux_bpf_map_free(struct bpf_map *map) kfree(bpfsec); } -static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux) +static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token) { struct bpf_security_struct *bpfsec; @@ -6814,16 +6815,16 @@ static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux) return -ENOMEM; bpfsec->sid = current_sid(); - aux->security = bpfsec; + prog->aux->security = bpfsec; return 0; } -static void selinux_bpf_prog_free(struct bpf_prog_aux *aux) +static void selinux_bpf_prog_free(struct bpf_prog *prog) { - struct bpf_security_struct *bpfsec = aux->security; + struct bpf_security_struct *bpfsec = prog->aux->security; - aux->security = NULL; + prog->aux->security = NULL; kfree(bpfsec); } #endif @@ -7180,7 +7181,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(bpf_map, selinux_bpf_map), LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog), LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free), - LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free), + LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free), #endif #ifdef CONFIG_PERF_EVENTS @@ -7238,7 +7239,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { #endif #ifdef CONFIG_BPF_SYSCALL LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc), - LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc), + LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load), #endif #ifdef CONFIG_PERF_EVENTS LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc), -- cgit From 66d636d70a79c1d37e3eea67ab50969e6aaef983 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:22 -0800 Subject: bpf,lsm: refactor bpf_map_alloc/bpf_map_free LSM hooks Similarly to bpf_prog_alloc LSM hook, rename and extend bpf_map_alloc hook into bpf_map_create, taking not just struct bpf_map, but also bpf_attr and bpf_token, to give a fuller context to LSMs. Unlike bpf_prog_alloc, there is no need to move the hook around, as it currently is firing right before allocating BPF map ID and FD, which seems to be a sweet spot. 
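In other words, after this patch the hook fires at roughly this point in
map_create() (sketch):

	err = security_bpf_map_create(map, attr, token);
	if (err)
		goto free_map_sec;	/* map_free hook still runs */

	err = bpf_map_alloc_id(map);	/* ID, then FD, are set up after the hook */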
But like bpf_prog_alloc/bpf_prog_free combo, make sure that bpf_map_free LSM hook is called even if bpf_map_create hook returned error, as if few LSMs are combined together it could be that one LSM successfully allocated security blob for its needs, while subsequent LSM rejected BPF map creation. The former LSM would still need to free up LSM blob, so we need to ensure security_bpf_map_free() is called regardless of the outcome. Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-11-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/lsm_hook_defs.h | 5 +++-- include/linux/security.h | 6 ++++-- kernel/bpf/bpf_lsm.c | 6 +++--- kernel/bpf/syscall.c | 4 ++-- security/security.c | 16 ++++++++++------ security/selinux/hooks.c | 7 ++++--- 6 files changed, 26 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 41ec4a7c070e..adb25cc63ce3 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -398,8 +398,9 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) -LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) -LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) +LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token) +LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) diff --git a/include/linux/security.h b/include/linux/security.h index 65467eef6678..08fd777cbe94 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2025,7 +2025,8 @@ struct bpf_token; extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); -extern int security_bpf_map_alloc(struct bpf_map *map); +extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token); extern void security_bpf_map_free(struct bpf_map *map); extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token); @@ -2047,7 +2048,8 @@ static inline int security_bpf_prog(struct bpf_prog *prog) return 0; } -static inline int security_bpf_map_alloc(struct bpf_map *map) +static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token) { return 0; } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 3e956f6302f3..9e4e615f11eb 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -260,8 +260,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) BTF_SET_START(sleepable_lsm_hooks) BTF_ID(func, bpf_lsm_bpf) BTF_ID(func, bpf_lsm_bpf_map) -BTF_ID(func, bpf_lsm_bpf_map_alloc_security) -BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_map_create) +BTF_ID(func, bpf_lsm_bpf_map_free) BTF_ID(func, bpf_lsm_bpf_prog) BTF_ID(func, bpf_lsm_bpf_prog_load) BTF_ID(func, bpf_lsm_bpf_prog_free) @@ -347,7 +347,7 @@ BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) BTF_SET_START(untrusted_lsm_hooks) 
-BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_map_free) BTF_ID(func, bpf_lsm_bpf_prog_free) BTF_ID(func, bpf_lsm_file_alloc_security) BTF_ID(func, bpf_lsm_file_free_security) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7717c7c7b95d..aff045eed375 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1309,9 +1309,9 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_alloc(map); + err = security_bpf_map_create(map, attr, token); if (err) - goto free_map; + goto free_map_sec; err = bpf_map_alloc_id(map); if (err) diff --git a/security/security.c b/security/security.c index c8a1c66cfaad..ad24cf36da94 100644 --- a/security/security.c +++ b/security/security.c @@ -5167,16 +5167,20 @@ int security_bpf_prog(struct bpf_prog *prog) } /** - * security_bpf_map_alloc() - Allocate a bpf map LSM blob - * @map: bpf map + * security_bpf_map_create() - Check if BPF map creation is allowed + * @map: BPF map object + * @attr: BPF syscall attributes used to create BPF map + * @token: BPF token used to grant user access * - * Initialize the security field inside bpf map. + * Do a check when the kernel creates a new BPF map. This is also the + * point where LSM blob is allocated for LSMs that need them. * * Return: Returns 0 on success, error on failure. */ -int security_bpf_map_alloc(struct bpf_map *map) +int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token) { - return call_int_hook(bpf_map_alloc_security, 0, map); + return call_int_hook(bpf_map_create, 0, map, attr, token); } /** @@ -5205,7 +5209,7 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, */ void security_bpf_map_free(struct bpf_map *map) { - call_void_hook(bpf_map_free_security, map); + call_void_hook(bpf_map_free, map); } /** diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index eabee39e983c..002351ab67b7 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6783,7 +6783,8 @@ static int selinux_bpf_prog(struct bpf_prog *prog) BPF__PROG_RUN, NULL); } -static int selinux_bpf_map_alloc(struct bpf_map *map) +static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token) { struct bpf_security_struct *bpfsec; @@ -7180,7 +7181,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(bpf, selinux_bpf), LSM_HOOK_INIT(bpf_map, selinux_bpf_map), LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog), - LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free), + LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free), LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free), #endif @@ -7238,7 +7239,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init), #endif #ifdef CONFIG_BPF_SYSCALL - LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc), + LSM_HOOK_INIT(bpf_map_create, selinux_bpf_map_create), LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load), #endif #ifdef CONFIG_PERF_EVENTS -- cgit From d734ca7b33dbf60eb15dcf7c44f3da7073356777 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:23 -0800 Subject: bpf,lsm: add BPF token LSM hooks Wire up bpf_token_create and bpf_token_free LSM hooks, which allow to allocate LSM security blob (we add `void *security` field to struct bpf_token for that), but also control who can instantiate BPF token. This follows existing pattern for BPF map and BPF prog. 
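As an illustration (hypothetical LSM, not part of this patch), the two
lifecycle hooks can be wired up the same way SELinux handles maps and
programs, using the new token->security field:

	struct example_token_sec {
		u32 sid;	/* whatever per-token state the LSM tracks */
	};

	static int example_bpf_token_create(struct bpf_token *token,
					    union bpf_attr *attr,
					    struct path *path)
	{
		struct example_token_sec *sec;

		sec = kzalloc(sizeof(*sec), GFP_KERNEL);
		if (!sec)
			return -ENOMEM;
		token->security = sec;
		return 0;
	}

	static void example_bpf_token_free(struct bpf_token *token)
	{
		kfree(token->security);
		token->security = NULL;
	}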
Also add security_bpf_token_cmd() and security_bpf_token_capable() LSM hooks that allow an LSM implementation to control and, if necessary, deny a BPF token's delegation of a specific bpf_cmd and capability, respectively. Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-12-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/linux/lsm_hook_defs.h | 5 ++++ include/linux/security.h | 25 ++++++++++++++++++ kernel/bpf/bpf_lsm.c | 4 +++ kernel/bpf/token.c | 18 ++++++++----- security/security.c | 60 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 109 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 435abad3cc61..7a483f6b6d5f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1604,6 +1604,9 @@ struct bpf_token { u64 allowed_maps; u64 allowed_progs; u64 allowed_attachs; +#ifdef CONFIG_SECURITY + void *security; +#endif }; struct bpf_struct_ops_value; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index adb25cc63ce3..3fdd00b452ac 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -404,6 +404,11 @@ LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) +LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr, + struct path *path) +LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token) +LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd) +LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h b/include/linux/security.h index 08fd777cbe94..00809d2d5c38 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,6 +32,7 @@ #include #include #include +#include struct linux_binprm; struct cred; @@ -2031,6 +2032,11 @@ extern void security_bpf_map_free(struct bpf_map *map); extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token); extern void security_bpf_prog_free(struct bpf_prog *prog); +extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, + struct path *path); +extern void security_bpf_token_free(struct bpf_token *token); +extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd); +extern int security_bpf_token_capable(const struct bpf_token *token, int cap); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2065,6 +2071,25 @@ static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr * static inline void security_bpf_prog_free(struct bpf_prog *prog) { } + +static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, + struct path *path) +{ + return 0; +} + +static inline void security_bpf_token_free(struct bpf_token *token) +{ } + +static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) +{ + return 0; +} + +static inline int security_bpf_token_capable(const struct bpf_token *token, int cap) +{ + return 0; +} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index
9e4e615f11eb..7d2f96413a57 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -265,6 +265,10 @@ BTF_ID(func, bpf_lsm_bpf_map_free) BTF_ID(func, bpf_lsm_bpf_prog) BTF_ID(func, bpf_lsm_bpf_prog_load) BTF_ID(func, bpf_lsm_bpf_prog_free) +BTF_ID(func, bpf_lsm_bpf_token_create) +BTF_ID(func, bpf_lsm_bpf_token_free) +BTF_ID(func, bpf_lsm_bpf_token_cmd) +BTF_ID(func, bpf_lsm_bpf_token_capable) BTF_ID(func, bpf_lsm_bprm_check_security) BTF_ID(func, bpf_lsm_bprm_committed_creds) BTF_ID(func, bpf_lsm_bprm_committing_creds) diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 5a51e6b8f6bf..17212efcde60 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -7,6 +7,7 @@ #include #include #include +#include bool bpf_token_capable(const struct bpf_token *token, int cap) { @@ -14,10 +15,9 @@ bool bpf_token_capable(const struct bpf_token *token, int cap) * token's userns is *exactly* the same as current user's userns */ if (token && current_user_ns() == token->userns) { - if (ns_capable(token->userns, cap)) - return true; - if (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN)) - return true; + if (ns_capable(token->userns, cap) || + (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN))) + return security_bpf_token_capable(token, cap) == 0; } /* otherwise fallback to capable() checks */ return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); @@ -30,6 +30,7 @@ void bpf_token_inc(struct bpf_token *token) static void bpf_token_free(struct bpf_token *token) { + security_bpf_token_free(token); put_user_ns(token->userns); kvfree(token); } @@ -186,6 +187,10 @@ int bpf_token_create(union bpf_attr *attr) token->allowed_progs = mnt_opts->delegate_progs; token->allowed_attachs = mnt_opts->delegate_attachs; + err = security_bpf_token_create(token, attr, &path); + if (err) + goto out_token; + fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { err = fd; @@ -233,8 +238,9 @@ bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) */ if (!token || current_user_ns() != token->userns) return false; - - return token->allowed_cmds & (1ULL << cmd); + if (!(token->allowed_cmds & (1ULL << cmd))) + return false; + return security_bpf_token_cmd(token, cmd) == 0; } bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type) diff --git a/security/security.c b/security/security.c index ad24cf36da94..088a79c35c26 100644 --- a/security/security.c +++ b/security/security.c @@ -5201,6 +5201,55 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, return call_int_hook(bpf_prog_load, 0, prog, attr, token); } +/** + * security_bpf_token_create() - Check if creating of BPF token is allowed + * @token: BPF token object + * @attr: BPF syscall attributes used to create BPF token + * @path: path pointing to BPF FS mount point from which BPF token is created + * + * Do a check when the kernel instantiates a new BPF token object from BPF FS + * instance. This is also the point where LSM blob can be allocated for LSMs. + * + * Return: Returns 0 on success, error on failure. 
+ */ +int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, + struct path *path) +{ + return call_int_hook(bpf_token_create, 0, token, attr, path); +} + +/** + * security_bpf_token_cmd() - Check if BPF token is allowed to delegate + * requested BPF syscall command + * @token: BPF token object + * @cmd: BPF syscall command requested to be delegated by BPF token + * + * Do a check when the kernel decides whether provided BPF token should allow + * delegation of requested BPF syscall command. + * + * Return: Returns 0 on success, error on failure. + */ +int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) +{ + return call_int_hook(bpf_token_cmd, 0, token, cmd); +} + +/** + * security_bpf_token_capable() - Check if BPF token is allowed to delegate + * requested BPF-related capability + * @token: BPF token object + * @cap: capabilities requested to be delegated by BPF token + * + * Do a check when the kernel decides whether provided BPF token should allow + * delegation of requested BPF-related capabilities. + * + * Return: Returns 0 on success, error on failure. + */ +int security_bpf_token_capable(const struct bpf_token *token, int cap) +{ + return call_int_hook(bpf_token_capable, 0, token, cap); +} + /** * security_bpf_map_free() - Free a bpf map's LSM blob * @map: bpf map @@ -5222,6 +5271,17 @@ void security_bpf_prog_free(struct bpf_prog *prog) { call_void_hook(bpf_prog_free, prog); } + +/** + * security_bpf_token_free() - Free a BPF token's LSM blob + * @token: BPF token struct + * + * Clean up the security information stored inside BPF token. + */ +void security_bpf_token_free(struct bpf_token *token) +{ + call_void_hook(bpf_token_free, token); +} #endif /* CONFIG_BPF_SYSCALL */ /** -- cgit From 3232e7aad11e541da86bbb1fa5ea5737b30bd006 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 5 Dec 2023 17:21:14 -0500 Subject: cgroup/cpuset: Include isolated cpuset CPUs in cpu_is_isolated() check Currently, the cpu_is_isolated() function checks only the statically isolated CPUs specified via the "isolcpus" and "nohz_full" kernel command line options. This function is used by vmstat and memcg to reduce interference with isolated CPUs by not doing stat flushing or scheduling works on those CPUs. Workloads running on isolated CPUs within isolated cpuset partitions should receive the same treatment to reduce unnecessary interference. This patch introduces a new cpuset_cpu_is_isolated() function to be called by cpu_is_isolated() so that the set of dynamically created cpuset isolated CPUs will be included in the check. Assuming that testing a bit in a cpumask is atomic, no synchronization primitive is currently used to synchronize access to the cpuset's isolated_cpus mask. 
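As a sketch of the intended effect (hypothetical caller: flush_work and the function name are invented for illustration), a periodic stat-flushing path can now skip CPUs in isolated cpuset partitions the same way it skips statically isolated ones:

        static DEFINE_PER_CPU(struct work_struct, flush_work);

        static void flush_stats_on_housekeeping_cpus(void)
        {
                int cpu;

                for_each_online_cpu(cpu) {
                        /* covers isolcpus=, nohz_full= and isolated cpuset partitions */
                        if (cpu_is_isolated(cpu))
                                continue;
                        schedule_work_on(cpu, per_cpu_ptr(&flush_work, cpu));
                }
        }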
Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 6 ++++++ include/linux/sched/isolation.h | 4 +++- kernel/cgroup/cpuset.c | 11 +++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index d629094fac6e..875d12598bd2 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -77,6 +77,7 @@ extern void cpuset_lock(void); extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); +extern bool cpuset_cpu_is_isolated(int cpu); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -207,6 +208,11 @@ static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) return false; } +static inline bool cpuset_cpu_is_isolated(int cpu) +{ + return false; +} + static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index fe1a46f30d24..2b461129d1fa 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -2,6 +2,7 @@ #define _LINUX_SCHED_ISOLATION_H #include +#include #include #include @@ -67,7 +68,8 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type) static inline bool cpu_is_isolated(int cpu) { return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) || - !housekeeping_test_cpu(cpu, HK_TYPE_TICK); + !housekeeping_test_cpu(cpu, HK_TYPE_TICK) || + cpuset_cpu_is_isolated(cpu); } #endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2a16df86c55c..dfbb16aca9f4 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1518,6 +1518,17 @@ static void update_unbound_workqueue_cpumask(bool isolcpus_updated) WARN_ON_ONCE(ret < 0); } +/** + * cpuset_cpu_is_isolated - Check if the given CPU is isolated + * @cpu: the CPU number to be checked + * Return: true if CPU is used in an isolated partition, false otherwise + */ +bool cpuset_cpu_is_isolated(int cpu) +{ + return cpumask_test_cpu(cpu, isolated_cpus); +} +EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); + /* * compute_effective_exclusive_cpumask - compute effective exclusive CPUs * @cs: cpuset -- cgit From b2dd797543cfa6580eac8408dd67fa02164d9e56 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 6 Dec 2023 10:02:44 -0500 Subject: ring-buffer: Force absolute timestamp on discard of event There's a race where if an event is discarded from the ring buffer and an interrupt were to happen at that time and insert an event, the time stamp is still used from the discarded event as an offset. This can screw up the timings. If the event is going to be discarded, set the "before_stamp" to zero. When a new event comes in, it compares the "before_stamp" with the "write_stamp" and if they are not equal, it will insert an absolute timestamp. This will prevent the timings from getting out of sync due to the discarded event. 
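Condensed sketch of the writer-side logic this fix leans on (simplified, not the exact ring-buffer code): when before_stamp and write_stamp disagree, the next event is forced to carry its own full timestamp instead of a delta:

        u64 before, write;

        rb_time_read(&cpu_buffer->before_stamp, &before);
        rb_time_read(&cpu_buffer->write_stamp, &write);
        /*
         * The discarded event zeroed before_stamp, so before != write
         * here and the stale write_stamp is never used as a delta base.
         */
        if (before != write)
                info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;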
Link: https://lore.kernel.org/linux-trace-kernel/20231206100244.5130f9b3@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Fixes: 6f6be606e763f ("ring-buffer: Force before_stamp and write_stamp to be different on discard") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 43cc47d7faaf..a6da2d765c78 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3030,22 +3030,19 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); + /* + * For the before_stamp to be different than the write_stamp + * to make sure that the next event adds an absolute + * value and does not rely on the saved write stamp, which + * is now going to be bogus. + */ + rb_time_set(&cpu_buffer->before_stamp, 0); + /* Something came in, can't discard */ if (!rb_time_cmpxchg(&cpu_buffer->write_stamp, write_stamp, write_stamp - delta)) return false; - /* - * It's possible that the event time delta is zero - * (has the same time stamp as the previous event) - * in which case write_stamp and before_stamp could - * be the same. In such a case, force before_stamp - * to be different than write_stamp. It doesn't - * matter what it is, as long as its different. - */ - if (!delta) - rb_time_set(&cpu_buffer->before_stamp, 0); - /* * If an event were to come in now, it would see that the * write_stamp and the before_stamp are different, and assume -- cgit From f458a1453424e03462b5bb539673c9a3cddda480 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 6 Dec 2023 10:00:50 -0500 Subject: ring-buffer: Test last update in 32bit version of __rb_time_read() Since 64 bit cmpxchg() is very expensive on 32bit architectures, the timestamp used by the ring buffer does some interesting tricks to be able to still have an atomic 64 bit number. It originally just used 60 bits and broke it up into two 32 bit words where the extra 2 bits were used for synchronization. But this was not enough for all use cases, and all 64 bits were required. The 32bit version of the ring buffer timestamp was then broken up into 3 32bit words using the same counter trick. But one update was not done. The check to see if the read operation was done without interruption only checked the first two words and not last one (like it had before this update). Fix it by making sure all three updates happen without interruption by comparing the initial counter with the last updated counter. 
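Condensed shape of the fixed 32-bit read path (simplified from the diff below): the counter taken from the first word must match the counter in msb, the last word a writer updates; a mismatch means the read raced with a write:

        unsigned long top, bottom, msb;

        top    = local_read(&t->top);
        bottom = local_read(&t->bottom);
        msb    = local_read(&t->msb);

        *cnt = rb_time_cnt(top);
        /* msb is written last; a different count means a writer got in between */
        if (*cnt != rb_time_cnt(msb))
                return false;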
Link: https://lore.kernel.org/linux-trace-kernel/20231206100050.3100b7bb@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Fixes: f03f2abce4f39 ("ring-buffer: Have 32 bit time stamps use all 64 bits") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a6da2d765c78..8d2a4f00eca9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -644,8 +644,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) *cnt = rb_time_cnt(top); - /* If top and bottom counts don't match, this interrupted a write */ - if (*cnt != rb_time_cnt(bottom)) + /* If top and msb counts don't match, this interrupted a write */ + if (*cnt != rb_time_cnt(msb)) return false; /* The shift to msb will lose its cnt bits */ -- cgit From 4b7de801606e504e69689df71475d27e35336fb3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 6 Dec 2023 09:30:40 +0100 Subject: bpf: Fix prog_array_map_poke_run map poke update Lee pointed out an issue found by syzkaller [0]: hitting the BUG in the prog array map poke update (prog_array_map_poke_run) due to an error value returned from the bpf_arch_text_poke function. There's a race window where bpf_arch_text_poke can fail due to missing bpf program kallsym symbols, which is accounted for by the check for -EINVAL in that BUG_ON call. The problem is that in such a case we won't update the tail call jump, causing an imbalance for the next tail call update check, which will then fail with -EBUSY in bpf_arch_text_poke. I'm hitting the following race during the program load:

   CPU 0                             CPU 1

   bpf_prog_load
     bpf_check
       do_misc_fixups
         prog_array_map_poke_track

                                     map_update_elem
                                       bpf_fd_array_map_update_elem
                                         prog_array_map_poke_run

                                           bpf_arch_text_poke returns -EINVAL

     bpf_prog_kallsyms_add

After bpf_arch_text_poke (CPU 1) fails to update the tail call jump, the next poke update fails on the expected jump instruction check in bpf_arch_text_poke with -EBUSY and triggers the BUG_ON in prog_array_map_poke_run. A similar race exists on program unload. Fix this by moving the update to a new bpf_arch_poke_desc_update function, which makes sure we call __bpf_arch_text_poke, skipping the bpf address check. Each architecture has a slightly different approach to looking up the bpf address in bpf_arch_text_poke, so instead of splitting the function or adding a new 'checkip' argument (as in a previous version), it seems best to move the whole map_poke_run update into arch-specific code.
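For context, the check that turns the missed update into -EBUSY on the next poke has roughly this shape in the x86 text-poke path (condensed, not verbatim):

        /* condensed from the expectation check in x86 __bpf_arch_text_poke() */
        ret = -EBUSY;
        if (memcmp(ip, old_insn, X86_PATCH_SIZE))
                goto out;       /* the previous update never landed */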
[0] https://syzkaller.appspot.com/bug?extid=97a4fe20470e9bc30810 Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Reported-by: syzbot+97a4fe20470e9bc30810@syzkaller.appspotmail.com Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Cc: Lee Jones Cc: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20231206083041.1306660-2-jolsa@kernel.org --- arch/x86/net/bpf_jit_comp.c | 46 +++++++++++++++++++++++++++++++++++ include/linux/bpf.h | 3 +++ kernel/bpf/arraymap.c | 58 ++++++++------------------------------------- 3 files changed, 59 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8c10d9abc239..e89e415aa743 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3025,3 +3025,49 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp #endif WARN(1, "verification of programs using bpf_throw should have failed\n"); } + +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + u8 *old_addr, *new_addr, *old_bypass_addr; + int ret; + + old_bypass_addr = old ? NULL : poke->bypass_addr; + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; + new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; + + /* + * On program loading or teardown, the program's kallsym entry + * might not be in place, so we use __bpf_arch_text_poke to skip + * the kallsyms check. + */ + if (new) { + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, new_addr); + BUG_ON(ret < 0); + if (!old) { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + poke->bypass_addr, + NULL); + BUG_ON(ret < 0); + } + } else { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + old_bypass_addr, + poke->bypass_addr); + BUG_ON(ret < 0); + /* let other CPUs finish the execution of program + * so that it will not possible to expose them + * to invalid nop, stack unwind, nop state + */ + if (!ret) + synchronize_rcu(); + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, NULL); + BUG_ON(ret < 0); + } +} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6762dac3ef76..cff5bb08820e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3175,6 +3175,9 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old); + void *bpf_arch_text_copy(void *dst, void *src, size_t len); int bpf_arch_text_invalidate(void *dst, size_t len); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2058e89b5ddd..c85ff9162a5c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1012,11 +1012,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map, mutex_unlock(&aux->poke_mutex); } +void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + WARN_ON_ONCE(1); +} + static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { - u8 *old_addr, *new_addr, *old_bypass_addr; struct prog_poke_elem *elem; struct bpf_array_aux *aux; @@ -1025,7 +1030,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, list_for_each_entry(elem, &aux->poke_progs, list) { struct 
bpf_jit_poke_descriptor *poke; - int i, ret; + int i; for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; @@ -1044,21 +1049,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * activated, so tail call updates can arrive from here * while JIT is still finishing its final fixup for * non-activated poke entries. - * 3) On program teardown, the program's kallsym entry gets - * removed out of RCU callback, but we can only untrack - * from sleepable context, therefore bpf_arch_text_poke() - * might not see that this is in BPF text section and - * bails out with -EINVAL. As these are unreachable since - * RCU grace period already passed, we simply skip them. - * 4) Also programs reaching refcount of zero while patching + * 3) Also programs reaching refcount of zero while patching * is in progress is okay since we're protected under * poke_mutex and untrack the programs before the JIT - * buffer is freed. When we're still in the middle of - * patching and suddenly kallsyms entry of the program - * gets evicted, we just skip the rest which is fine due - * to point 3). - * 5) Any other error happening below from bpf_arch_text_poke() - * is a unexpected bug. + * buffer is freed. */ if (!READ_ONCE(poke->tailcall_target_stable)) continue; @@ -1068,39 +1062,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - old_bypass_addr = old ? NULL : poke->bypass_addr; - old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; - new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; - - if (new) { - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, new_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - if (!old) { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - poke->bypass_addr, - NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } - } else { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - old_bypass_addr, - poke->bypass_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - /* let other CPUs finish the execution of program - * so that it will not possible to expose them - * to invalid nop, stack unwind, nop state - */ - if (!ret) - synchronize_rcu(); - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } + bpf_arch_poke_desc_update(poke, new, old); } } } -- cgit From dccf78d39f1069a5ddf4328bf0c97aa5f2f4296e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 28 Nov 2023 13:44:57 +0800 Subject: kernel/Kconfig.kexec: drop select of KEXEC for CRASH_DUMP Ignat Korchagin complained that a potential config regression was introduced by commit 89cde455915f ("kexec: consolidate kexec and crash options into kernel/Kconfig.kexec"). Before the commit, CONFIG_CRASH_DUMP has no dependency on CONFIG_KEXEC. After the commit, CRASH_DUMP selects KEXEC. That enforces system to have CONFIG_KEXEC=y as long as CONFIG_CRASH_DUMP=Y which people may not want. In Ignat's case, he sets CONFIG_CRASH_DUMP=y, CONFIG_KEXEC_FILE=y and CONFIG_KEXEC=n because kexec_load interface could have security issue if kernel/initrd has no chance to be signed and verified. CRASH_DUMP has select of KEXEC because Eric, author of above commit, met a LKP report of build failure when posting patch of earlier version. 
Please see the link below for details of the LKP report: https://lore.kernel.org/all/3e8eecd1-a277-2cfb-690e-5de2eb7b988e@oracle.com/T/#u In fact, that LKP report is triggered because arm's <asm/kexec.h> is wrapped in a CONFIG_KEXEC ifdeffery scope. That is wrong: CONFIG_KEXEC controls the enabling/disabling of the kexec_load interface, not the kexec feature itself. Removing the wrongly added CONFIG_KEXEC ifdeffery scope in <asm/kexec.h> of arm allows us to drop the select of KEXEC for CRASH_DUMP. Meanwhile, change arch/arm/kernel/Makefile to let machine_kexec.o and relocate_kernel.o depend on KEXEC_CORE. Link: https://lkml.kernel.org/r/20231128054457.659452-1-bhe@redhat.com Fixes: 89cde455915f ("kexec: consolidate kexec and crash options into kernel/Kconfig.kexec") Signed-off-by: Baoquan He Reported-by: Ignat Korchagin Tested-by: Ignat Korchagin [compile-time only] Tested-by: Alexander Gordeev Reviewed-by: Eric DeVolder Tested-by: Eric DeVolder Signed-off-by: Andrew Morton --- arch/arm/include/asm/kexec.h | 4 ---- arch/arm/kernel/Makefile | 2 +- kernel/Kconfig.kexec | 1 - 3 files changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/kexec.h b/arch/arm/include/asm/kexec.h index e62832dcba76..a8287e7ab9d4 100644 --- a/arch/arm/include/asm/kexec.h +++ b/arch/arm/include/asm/kexec.h @@ -2,8 +2,6 @@ #ifndef _ARM_KEXEC_H #define _ARM_KEXEC_H -#ifdef CONFIG_KEXEC - /* Maximum physical address we can use pages from */ #define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) /* Maximum address we can reach in physical address mode */ @@ -82,6 +80,4 @@ static inline struct page *boot_pfn_to_page(unsigned long boot_pfn) #endif /* __ASSEMBLY__ */ -#endif /* CONFIG_KEXEC */ - #endif /* _ARM_KEXEC_H */ diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index d53f56d6f840..771264d4726a 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -59,7 +59,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += entry-ftrace.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o insn.o patch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o insn.o patch.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o insn.o patch.o -obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o # Main staffs in KPROBES are in arch/arm/probes/ . obj-$(CONFIG_KPROBES) += patch.o insn.o obj-$(CONFIG_OABI_COMPAT) += sys_oabi-compat.o diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 7aff28ded2f4..1cc3b1c595d7 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -97,7 +97,6 @@ config CRASH_DUMP depends on ARCH_SUPPORTS_KEXEC select CRASH_CORE select KEXEC_CORE - select KEXEC help Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels -- cgit From f08a1c658257c73697a819c4ded3a84b6f0ead74 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:48 -0800 Subject: bpf: Let bpf_prog_pack_free handle any pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, bpf_prog_pack_free can only free a pointer to a struct bpf_binary_header, which is not flexible. Add a size argument to bpf_prog_pack_free so that it can handle any pointer.
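A hypothetical caller, to illustrate the new contract (jit_fill_hole stands in for an arch fill callback; the surrounding code is invented): the caller tracks the size itself and passes it back on free, so any pointer handed out by the pack allocator can be freed:

        void *ro_image;

        ro_image = bpf_prog_pack_alloc(size, jit_fill_hole);
        if (!ro_image)
                return -ENOMEM;
        /* ... copy JITed code into the pack ... */
        bpf_prog_pack_free(ro_image, size);     /* any pointer plus its size */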
Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Reviewed-by: Björn Töpel Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- kernel/bpf/core.c | 21 ++++++++++----------- kernel/bpf/dispatcher.c | 5 +---- 3 files changed, 12 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 14354605ad26..12d907f17d36 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1067,7 +1067,7 @@ struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); -void bpf_prog_pack_free(struct bpf_binary_header *hdr); +void bpf_prog_pack_free(void *ptr, u32 size); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ced511f44174..c34513d645c4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -928,20 +928,20 @@ out: return ptr; } -void bpf_prog_pack_free(struct bpf_binary_header *hdr) +void bpf_prog_pack_free(void *ptr, u32 size) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; unsigned long pos; mutex_lock(&pack_mutex); - if (hdr->size > BPF_PROG_PACK_SIZE) { - bpf_jit_free_exec(hdr); + if (size > BPF_PROG_PACK_SIZE) { + bpf_jit_free_exec(ptr); goto out; } list_for_each_entry(tmp, &pack_list, list) { - if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) { + if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) { pack = tmp; break; } @@ -950,10 +950,10 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr) if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) goto out; - nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); - pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; + nbits = BPF_PROG_SIZE_TO_NBITS(size); + pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; - WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size), + WARN_ONCE(bpf_arch_text_invalidate(ptr, size), "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); bitmap_clear(pack->bitmap, pos, nbits); @@ -1100,8 +1100,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, *rw_header = kvmalloc(size, GFP_KERNEL); if (!*rw_header) { - bpf_arch_text_copy(&ro_header->size, &size, sizeof(size)); - bpf_prog_pack_free(ro_header); + bpf_prog_pack_free(ro_header, size); bpf_jit_uncharge_modmem(size); return NULL; } @@ -1132,7 +1131,7 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, kvfree(rw_header); if (IS_ERR(ptr)) { - bpf_prog_pack_free(ro_header); + bpf_prog_pack_free(ro_header, ro_header->size); return PTR_ERR(ptr); } return 0; @@ -1153,7 +1152,7 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, { u32 size = ro_header->size; - bpf_prog_pack_free(ro_header); + bpf_prog_pack_free(ro_header, size); kvfree(rw_header); bpf_jit_uncharge_modmem(size); } diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index fa3e9225aedc..56760fc10e78 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -150,10 +150,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, goto out; d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE); if (!d->rw_image) { - u32 size = PAGE_SIZE; - - bpf_arch_text_copy(d->image, &size, sizeof(size)); - bpf_prog_pack_free((struct bpf_binary_header *)d->image); + 
bpf_prog_pack_free(d->image, PAGE_SIZE); d->image = NULL; goto out; } -- cgit From 7a3d9a159b178e87306a6e989071ed9a114a1a31 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:49 -0800 Subject: bpf: Adjust argument names of arch_prepare_bpf_trampoline() We are using "im" for "struct bpf_tramp_image" and "tr" for "struct bpf_trampoline" in most of the code base. The only exception is the prototype and fallback version of arch_prepare_bpf_trampoline(). Update them to match the rest of the code base. We mix "orig_call" and "func_addr" for the argument in different versions of arch_prepare_bpf_trampoline(). s/orig_call/func_addr/g so they match. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-3-song@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 10 +++++----- include/linux/bpf.h | 4 ++-- kernel/bpf/trampoline.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 7d4af64e3982..d81b886ea4df 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1828,7 +1828,7 @@ static void restore_args(struct jit_ctx *ctx, int args_off, int nregs) * */ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, - struct bpf_tramp_links *tlinks, void *orig_call, + struct bpf_tramp_links *tlinks, void *func_addr, int nregs, u32 flags) { int i; @@ -1926,7 +1926,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, if (flags & BPF_TRAMP_F_IP_ARG) { /* save ip address of the traced function */ - emit_addr_mov_i64(A64_R(10), (const u64)orig_call, ctx); + emit_addr_mov_i64(A64_R(10), (const u64)func_addr, ctx); emit(A64_STR64I(A64_R(10), A64_SP, ip_off), ctx); } @@ -2029,7 +2029,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, - void *orig_call) + void *func_addr) { int i, ret; int nregs = m->nr_args; @@ -2050,7 +2050,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, if (nregs > 8) return -ENOTSUPP; - ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nregs, flags); + ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags); if (ret < 0) return ret; @@ -2061,7 +2061,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, ctx.idx = 0; jit_fill_hole(image, (unsigned int)(image_end - image)); - ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nregs, flags); + ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags); if (ret > 0 && validate_code(&ctx) < 0) ret = -EINVAL; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7a483f6b6d5f..17eb6d905204 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1098,10 +1098,10 @@ struct bpf_tramp_run_ctx; * fexit = a set of program to run after original function */ struct bpf_tramp_image; -int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, - void *orig_call); + void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog 
*prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index e97aeda3a86b..e114a1c7961e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1032,10 +1032,10 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) } int __weak -arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, +arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, - void *orig_call) + void *func_addr) { return -ENOTSUPP; } -- cgit From 82583daa2efc2e336962b231a46bad03a280b3e0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:50 -0800 Subject: bpf: Add helpers for trampoline image management As BPF trampoline of different archs moves from bpf_jit_[alloc|free]_exec() to bpf_prog_pack_[alloc|free](), we need to use different _alloc, _free for different archs during the transition. Add the following helpers for this transition: void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); void arch_protect_bpf_trampoline(void *image, unsigned int size); void arch_unprotect_bpf_trampoline(void *image, unsigned int size); The fallback version of these helpers require size <= PAGE_SIZE, but they are only called with size == PAGE_SIZE. They will be called with size < PAGE_SIZE when arch_bpf_trampoline_size() helper is introduced later. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-4-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ kernel/bpf/bpf_struct_ops.c | 12 +++++------ kernel/bpf/trampoline.c | 46 +++++++++++++++++++++++++++++++++++------- net/bpf/bpf_dummy_struct_ops.c | 7 +++---- 4 files changed, 52 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 17eb6d905204..b7fca151cf1b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1102,6 +1102,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr); +void *arch_alloc_bpf_trampoline(unsigned int size); +void arch_free_bpf_trampoline(void *image, unsigned int size); +void arch_protect_bpf_trampoline(void *image, unsigned int size); +void arch_unprotect_bpf_trampoline(void *image, unsigned int size); + u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index db6176fb64dc..e9e95879bce2 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -515,7 +515,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) goto reset_unlock; } - set_memory_rox((long)st_map->image, 1); + arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); /* Let bpf_link handle registration & unregistration. * * Pair with smp_load_acquire() during lookup_elem(). 
@@ -524,7 +524,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - set_memory_rox((long)st_map->image, 1); + arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); err = st_ops->reg(kdata); if (likely(!err)) { /* This refcnt increment on the map here after @@ -547,8 +547,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ - set_memory_nx((long)st_map->image, 1); - set_memory_rw((long)st_map->image, 1); + arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE); reset_unlock: bpf_struct_ops_map_put_progs(st_map); @@ -616,7 +615,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) bpf_struct_ops_map_put_progs(st_map); bpf_map_area_free(st_map->links); if (st_map->image) { - bpf_jit_free_exec(st_map->image); + arch_free_bpf_trampoline(st_map->image, PAGE_SIZE); bpf_jit_uncharge_modmem(PAGE_SIZE); } bpf_map_area_free(st_map->uvalue); @@ -691,7 +690,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) return ERR_PTR(ret); } - st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); + st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE); if (!st_map->image) { /* __bpf_struct_ops_map_free() uses st_map->image as flag * for "charged or not". In this case, we need to unchange @@ -711,7 +710,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) } mutex_init(&st_map->lock); - set_vm_flush_reset_perms(st_map->image); bpf_map_init_from_attr(map, attr); return map; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index e114a1c7961e..affbcbf7e76e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -254,7 +254,7 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a static void bpf_tramp_image_free(struct bpf_tramp_image *im) { bpf_image_ksym_del(&im->ksym); - bpf_jit_free_exec(im->image); + arch_free_bpf_trampoline(im->image, PAGE_SIZE); bpf_jit_uncharge_modmem(PAGE_SIZE); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); @@ -365,10 +365,9 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) goto out_free_im; err = -ENOMEM; - im->image = image = bpf_jit_alloc_exec(PAGE_SIZE); + im->image = image = arch_alloc_bpf_trampoline(PAGE_SIZE); if (!image) goto out_uncharge; - set_vm_flush_reset_perms(image); err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); if (err) @@ -381,7 +380,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) return im; out_free_image: - bpf_jit_free_exec(im->image); + arch_free_bpf_trampoline(im->image, PAGE_SIZE); out_uncharge: bpf_jit_uncharge_modmem(PAGE_SIZE); out_free_im: @@ -444,7 +443,7 @@ again: if (err < 0) goto out_free; - set_memory_rox((long)im->image, 1); + arch_protect_bpf_trampoline(im->image, PAGE_SIZE); WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) @@ -465,8 +464,7 @@ again: tr->fops->trampoline = 0; /* reset im->image memory attr for arch_prepare_bpf_trampoline */ - set_memory_nx((long)im->image, 1); - set_memory_rw((long)im->image, 1); + arch_unprotect_bpf_trampoline(im->image, PAGE_SIZE); goto again; } #endif @@ -1040,6 +1038,40 @@ arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image return -ENOTSUPP; } +void * __weak arch_alloc_bpf_trampoline(unsigned int size) +{ + void *image; + + if (WARN_ON_ONCE(size > PAGE_SIZE)) + return NULL; + image = bpf_jit_alloc_exec(PAGE_SIZE); + if 
(image) + set_vm_flush_reset_perms(image); + return image; +} + +void __weak arch_free_bpf_trampoline(void *image, unsigned int size) +{ + WARN_ON_ONCE(size > PAGE_SIZE); + /* bpf_jit_free_exec doesn't need "size", but + * bpf_prog_pack_free() needs it. + */ + bpf_jit_free_exec(image); +} + +void __weak arch_protect_bpf_trampoline(void *image, unsigned int size) +{ + WARN_ON_ONCE(size > PAGE_SIZE); + set_memory_rox((long)image, 1); +} + +void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size) +{ + WARN_ON_ONCE(size > PAGE_SIZE); + set_memory_nx((long)image, 1); + set_memory_rw((long)image, 1); +} + static int __init init_trampolines(void) { int i; diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 5918d1b32e19..2748f9d77b18 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -101,12 +101,11 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, goto out; } - image = bpf_jit_alloc_exec(PAGE_SIZE); + image = arch_alloc_bpf_trampoline(PAGE_SIZE); if (!image) { err = -ENOMEM; goto out; } - set_vm_flush_reset_perms(image); link = kzalloc(sizeof(*link), GFP_USER); if (!link) { @@ -124,7 +123,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (err < 0) goto out; - set_memory_rox((long)image, 1); + arch_protect_bpf_trampoline(image, PAGE_SIZE); prog_ret = dummy_ops_call_op(image, args); err = dummy_ops_copy_args(args); @@ -134,7 +133,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, err = -EFAULT; out: kfree(args); - bpf_jit_free_exec(image); + arch_free_bpf_trampoline(image, PAGE_SIZE); if (link) bpf_link_put(&link->link); kfree(tlinks); -- cgit From 96d1b7c081c0c96cbe8901045f4ff15a2e9974a2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:52 -0800 Subject: bpf: Add arch_bpf_trampoline_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper will be used to calculate the size of the trampoline before allocating the memory. arch_prepare_bpf_trampoline() for arm64 and riscv64 can use arch_bpf_trampoline_size() to check the trampoline fits in the image. OTOH, arch_prepare_bpf_trampoline() for s390 has to call the JIT process twice, so it cannot use arch_bpf_trampoline_size(). 
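Sketch of the expected calling pattern (it mirrors how the follow-up patch below ends up using the helper): compute the size first, then allocate and JIT into a buffer of exactly that size:

        size = arch_bpf_trampoline_size(m, flags, tlinks, func_addr);
        if (size < 0)
                return size;
        im = bpf_tramp_image_alloc(key, size);  /* charges modmem for 'size' bytes */
        if (IS_ERR(im))
                return PTR_ERR(im);
        err = arch_prepare_bpf_trampoline(im, im->image, im->image + size,
                                          m, flags, tlinks, func_addr);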
Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Acked-by: Björn Töpel Tested-by: Björn Töpel # on riscv Link: https://lore.kernel.org/r/20231206224054.492250-6-song@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 56 ++++++++++++++++++++++++++++++----------- arch/riscv/net/bpf_jit_comp64.c | 22 ++++++++++++---- arch/s390/net/bpf_jit_comp.c | 56 +++++++++++++++++++++++++---------------- arch/x86/net/bpf_jit_comp.c | 40 ++++++++++++++++++++++++++--- include/linux/bpf.h | 2 ++ kernel/bpf/trampoline.c | 6 +++++ 6 files changed, 136 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index d81b886ea4df..a6671253b7ed 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2026,18 +2026,10 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, return ctx->idx; } -int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, - void *image_end, const struct btf_func_model *m, - u32 flags, struct bpf_tramp_links *tlinks, - void *func_addr) +static int btf_func_model_nregs(const struct btf_func_model *m) { - int i, ret; int nregs = m->nr_args; - int max_insns = ((long)image_end - (long)image) / AARCH64_INSN_SIZE; - struct jit_ctx ctx = { - .image = NULL, - .idx = 0, - }; + int i; /* extra registers needed for struct argument */ for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) { @@ -2046,19 +2038,53 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, nregs += (m->arg_size[i] + 7) / 8 - 1; } + return nregs; +} + +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *func_addr) +{ + struct jit_ctx ctx = { + .image = NULL, + .idx = 0, + }; + struct bpf_tramp_image im; + int nregs, ret; + + nregs = btf_func_model_nregs(m); /* the first 8 registers are used for arguments */ if (nregs > 8) return -ENOTSUPP; - ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags); + ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, nregs, flags); if (ret < 0) return ret; - if (ret > max_insns) - return -EFBIG; + return ret < 0 ? 
ret : ret * AARCH64_INSN_SIZE; +} - ctx.image = image; - ctx.idx = 0; +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, + void *image_end, const struct btf_func_model *m, + u32 flags, struct bpf_tramp_links *tlinks, + void *func_addr) +{ + int ret, nregs; + struct jit_ctx ctx = { + .image = image, + .idx = 0, + }; + + nregs = btf_func_model_nregs(m); + /* the first 8 registers are used for arguments */ + if (nregs > 8) + return -ENOTSUPP; + + ret = arch_bpf_trampoline_size(m, flags, tlinks, func_addr); + if (ret < 0) + return ret; + + if (ret > ((long)image_end - (long)image)) + return -EFBIG; jit_fill_hole(image, (unsigned int)(image_end - image)); ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags); diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 8581693e62d3..35747fafde57 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1029,6 +1029,21 @@ out: return ret; } +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *func_addr) +{ + struct bpf_tramp_image im; + struct rv_jit_context ctx; + int ret; + + ctx.ninsns = 0; + ctx.insns = NULL; + ctx.ro_insns = NULL; + ret = __arch_prepare_bpf_trampoline(&im, m, tlinks, func_addr, flags, &ctx); + + return ret < 0 ? ret : ninsns_rvoff(ctx.ninsns); +} + int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, @@ -1037,14 +1052,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, int ret; struct rv_jit_context ctx; - ctx.ninsns = 0; - ctx.insns = NULL; - ctx.ro_insns = NULL; - ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx); + ret = arch_bpf_trampoline_size(im, m, flags, tlinks, func_addr); if (ret < 0) return ret; - if (ninsns_rvoff(ret) > (long)image_end - (long)image) + if (ret > (long)image_end - (long)image) return -EFBIG; ctx.ninsns = 0; diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index bf06b7283f0c..cc129617480a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2637,6 +2637,21 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, return 0; } +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *orig_call) +{ + struct bpf_tramp_image im; + struct bpf_tramp_jit tjit; + int ret; + + memset(&tjit, 0, sizeof(tjit)); + + ret = __arch_prepare_bpf_trampoline(&im, &tjit, m, flags, + tlinks, orig_call); + + return ret < 0 ? ret : tjit.common.prg; +} + int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, @@ -2644,30 +2659,27 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, { struct bpf_tramp_jit tjit; int ret; - int i; - for (i = 0; i < 2; i++) { - if (i == 0) { - /* Compute offsets, check whether the code fits. */ - memset(&tjit, 0, sizeof(tjit)); - } else { - /* Generate the code. */ - tjit.common.prg = 0; - tjit.common.prg_buf = image; - } - ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, - tlinks, func_addr); - if (ret < 0) - return ret; - if (tjit.common.prg > (char *)image_end - (char *)image) - /* - * Use the same error code as for exceeding - * BPF_MAX_TRAMP_LINKS. - */ - return -E2BIG; - } + /* Compute offsets, check whether the code fits. 
*/ + memset(&tjit, 0, sizeof(tjit)); + ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, + tlinks, func_addr); + + if (ret < 0) + return ret; + if (tjit.common.prg > (char *)image_end - (char *)image) + /* + * Use the same error code as for exceeding + * BPF_MAX_TRAMP_LINKS. + */ + return -E2BIG; + + tjit.common.prg = 0; + tjit.common.prg_buf = image; + ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, + tlinks, func_addr); - return tjit.common.prg; + return ret < 0 ? ret : tjit.common.prg; } bool bpf_jit_supports_subprog_tailcalls(void) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5f7528cac344..5d75069fdcc2 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2422,10 +2422,10 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, - const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, - void *func_addr) +static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, + const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, + void *func_addr) { int i, ret, nr_regs = m->nr_args, stack_size = 0; int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off; @@ -2678,6 +2678,38 @@ cleanup: return ret; } +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, + const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, + void *func_addr) +{ + return __arch_prepare_bpf_trampoline(im, image, image_end, m, flags, tlinks, func_addr); +} + +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *func_addr) +{ + struct bpf_tramp_image im; + void *image; + int ret; + + /* Allocate a temporary buffer for __arch_prepare_bpf_trampoline(). + * This will NOT cause fragmentation in direct map, as we do not + * call set_memory_*() on this buffer. + * + * We cannot use kvmalloc here, because we need image to be in + * module memory range. 
+ */ + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!image) + return -ENOMEM; + + ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, m, flags, + tlinks, func_addr); + bpf_jit_free_exec(image); + return ret; +} + static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf) { u8 *jg_reloc, *prog = *pprog; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b7fca151cf1b..2332ddeb396b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1106,6 +1106,8 @@ void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); void arch_protect_bpf_trampoline(void *image, unsigned int size); void arch_unprotect_bpf_trampoline(void *image, unsigned int size); +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index affbcbf7e76e..b553cbd89e55 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1072,6 +1072,12 @@ void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size) set_memory_rw((long)image, 1); } +int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *func_addr) +{ + return -ENOTSUPP; +} + static int __init init_trampolines(void) { int i; -- cgit From 26ef208c209a0e6eed8942a5d191b39dccfa6e38 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:53 -0800 Subject: bpf: Use arch_bpf_trampoline_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of blindly allocating PAGE_SIZE for each trampoline, check the size of the trampoline with arch_bpf_trampoline_size(). This size is saved in bpf_tramp_image->size, and used for modmem charge/uncharge. The fallback arch_alloc_bpf_trampoline() still allocates a whole page because we need to use set_memory_* to protect the memory. struct_ops trampoline still uses a whole page for multiple trampolines. With this size check at caller (regular trampoline and struct_ops trampoline), remove arch_bpf_trampoline_size() from arch_prepare_bpf_trampoline() in archs. Also, update bpf_image_ksym_add() to handle symbol of different sizes. 
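Condensed from the diff below: the ksym registration now covers the real trampoline size instead of assuming a whole page:

        void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym)
        {
                ksym->start = (unsigned long)data;
                ksym->end = ksym->start + size; /* was: start + PAGE_SIZE */
                bpf_ksym_add(ksym);
                perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
                                   PAGE_SIZE, false, ksym->name);
        }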
Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Acked-by: Björn Töpel Tested-by: Björn Töpel # on riscv Link: https://lore.kernel.org/r/20231206224054.492250-7-song@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 7 ------ arch/riscv/net/bpf_jit_comp64.c | 7 ------ include/linux/bpf.h | 3 ++- kernel/bpf/bpf_struct_ops.c | 7 ++++++ kernel/bpf/dispatcher.c | 2 +- kernel/bpf/trampoline.c | 55 +++++++++++++++++++++++++---------------- 6 files changed, 44 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index a6671253b7ed..8955da5c47cf 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2079,13 +2079,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, if (nregs > 8) return -ENOTSUPP; - ret = arch_bpf_trampoline_size(m, flags, tlinks, func_addr); - if (ret < 0) - return ret; - - if (ret > ((long)image_end - (long)image)) - return -EFBIG; - jit_fill_hole(image, (unsigned int)(image_end - image)); ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags); diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 35747fafde57..58dc64dd94a8 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1052,13 +1052,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, int ret; struct rv_jit_context ctx; - ret = arch_bpf_trampoline_size(im, m, flags, tlinks, func_addr); - if (ret < 0) - return ret; - - if (ret > (long)image_end - (long)image) - return -EFBIG; - ctx.ninsns = 0; /* * The bpf_int_jit_compile() uses a RW buffer (ctx.insns) to write the diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2332ddeb396b..c1a06263a4f3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1141,6 +1141,7 @@ enum bpf_tramp_prog_type { struct bpf_tramp_image { void *image; + int size; struct bpf_ksym ksym; struct percpu_ref pcref; void *ip_after_call; @@ -1325,7 +1326,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. */ -void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); +void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index e9e95879bce2..4d53c53fc5aa 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -355,6 +355,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, void *image, void *image_end) { u32 flags; + int size; tlinks[BPF_TRAMP_FENTRY].links[0] = link; tlinks[BPF_TRAMP_FENTRY].nr_links = 1; @@ -362,6 +363,12 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, * and it must be used alone. */ flags = model->ret_size > 0 ? 
BPF_TRAMP_F_RET_FENTRY_RET : 0; + + size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); + if (size < 0) + return size; + if (size > (unsigned long)image_end - (unsigned long)image) + return -E2BIG; return arch_prepare_bpf_trampoline(NULL, image, image_end, model, flags, tlinks, NULL); } diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 56760fc10e78..70fb82bf1637 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -154,7 +154,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, d->image = NULL; goto out; } - bpf_image_ksym_add(d->image, &d->ksym); + bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym); } prev_num_progs = d->num_progs; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index b553cbd89e55..d382f5ebe06c 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -115,10 +115,10 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } -void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) +void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; - ksym->end = ksym->start + PAGE_SIZE; + ksym->end = ksym->start + size; bpf_ksym_add(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, PAGE_SIZE, false, ksym->name); @@ -254,8 +254,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a static void bpf_tramp_image_free(struct bpf_tramp_image *im) { bpf_image_ksym_del(&im->ksym); - arch_free_bpf_trampoline(im->image, PAGE_SIZE); - bpf_jit_uncharge_modmem(PAGE_SIZE); + arch_free_bpf_trampoline(im->image, im->size); + bpf_jit_uncharge_modmem(im->size); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } @@ -349,7 +349,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im) call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); } -static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) +static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size) { struct bpf_tramp_image *im; struct bpf_ksym *ksym; @@ -360,12 +360,13 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) if (!im) goto out; - err = bpf_jit_charge_modmem(PAGE_SIZE); + err = bpf_jit_charge_modmem(size); if (err) goto out_free_im; + im->size = size; err = -ENOMEM; - im->image = image = arch_alloc_bpf_trampoline(PAGE_SIZE); + im->image = image = arch_alloc_bpf_trampoline(size); if (!image) goto out_uncharge; @@ -376,13 +377,13 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) ksym = &im->ksym; INIT_LIST_HEAD_RCU(&ksym->lnode); snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key); - bpf_image_ksym_add(image, ksym); + bpf_image_ksym_add(image, size, ksym); return im; out_free_image: - arch_free_bpf_trampoline(im->image, PAGE_SIZE); + arch_free_bpf_trampoline(im->image, im->size); out_uncharge: - bpf_jit_uncharge_modmem(PAGE_SIZE); + bpf_jit_uncharge_modmem(size); out_free_im: kfree(im); out: @@ -395,7 +396,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut struct bpf_tramp_links *tlinks; u32 orig_flags = tr->flags; bool ip_arg = false; - int err, total; + int err, total, size; tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); if (IS_ERR(tlinks)) @@ -408,12 +409,6 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut goto out; } - im = bpf_tramp_image_alloc(tr->key); - if (IS_ERR(im)) { - err = PTR_ERR(im); - goto out; - } - 
/* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
	tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
@@ -437,13 +432,31 @@ again:
		tr->flags |= BPF_TRAMP_F_ORIG_STACK;
 #endif
-	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
+	size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
+					tlinks, tr->func.addr);
+	if (size < 0) {
+		err = size;
+		goto out;
+	}
+
+	if (size > PAGE_SIZE) {
+		err = -E2BIG;
+		goto out;
+	}
+
+	im = bpf_tramp_image_alloc(tr->key, size);
+	if (IS_ERR(im)) {
+		err = PTR_ERR(im);
+		goto out;
+	}
+
+	err = arch_prepare_bpf_trampoline(im, im->image, im->image + size,
					  &tr->func.model, tr->flags,
					  tlinks, tr->func.addr);
	if (err < 0)
		goto out_free;
-	arch_protect_bpf_trampoline(im->image, PAGE_SIZE);
+	arch_protect_bpf_trampoline(im->image, im->size);
 
	WARN_ON(tr->cur_image && total == 0);
	if (tr->cur_image)
@@ -463,8 +476,8 @@ again:
		tr->fops->func = NULL;
		tr->fops->trampoline = 0;
 
-		/* reset im->image memory attr for arch_prepare_bpf_trampoline */
-		arch_unprotect_bpf_trampoline(im->image, PAGE_SIZE);
+		/* free im memory and reallocate later */
+		bpf_tramp_image_free(im);
		goto again;
	}
 #endif
-- cgit

From a833a17aeac73b33f79433d7cee68d5cafd71e4f Mon Sep 17 00:00:00 2001
From: Andrei Matei
Date: Wed, 6 Dec 2023 23:11:48 -0500
Subject: bpf: Fix verification of indirect var-off stack access

This patch fixes a bug around the verification of possibly-zero-sized stack accesses. When the access was done through a var-offset stack pointer, check_stack_access_within_bounds() was incorrectly computing the maximum offset of a zero-sized read to be the same as the register's min offset. Instead, we have to take into account the register's maximum possible value. The patch also simplifies how the max offset is checked; the check is now simpler than for the min offset.

The bug was allowing accesses to erroneously pass the check_stack_access_within_bounds() checks, only to later crash in check_stack_range_initialized() when all the possibly-affected stack slots are iterated (this time with a correct max offset). check_stack_range_initialized() is relying on check_stack_access_within_bounds() for its accesses to the stack-tracking vector to be within bounds; in the case of zero-sized accesses, we were essentially only verifying that the lowest possible slot was within bounds. We would crash when the max offset of the stack pointer was >= 0 (which shouldn't pass verification, and hopefully is not something anyone's code attempts to do in practice).

Thanks Hao for reporting!
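In other words, the max offset is now computed as an exclusive upper bound (one
past the last accessed byte) rather than via a zero-size special case. A rough
sketch of the variable-offset branch after the change (see the diff below):

        min_off = reg->smin_value + off;
        max_off = reg->smax_value + off + access_size;  /* exclusive */

        err = check_stack_slot_within_bounds(min_off, state, type);
        if (!err && max_off > 0)
                err = -EINVAL;  /* access would reach non-negative offsets */

With an exclusive max_off, a zero-sized access yields max_off equal to the true
maximum offset instead of collapsing onto the register's minimum.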
Fixes: 01f810ace9ed3 ("bpf: Allow variable-offset stack access")
Reported-by: Hao Sun
Signed-off-by: Andrei Matei
Signed-off-by: Andrii Nakryiko
Acked-by: Eduard Zingerman
Acked-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20231207041150.229139-2-andreimatei1@gmail.com
Closes: https://lore.kernel.org/bpf/CACkBjsZGEUaRCHsmaX=h-efVogsRfK1FPxmkgb0Os_frnHiNdw@mail.gmail.com/
---
 kernel/bpf/verifier.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 45e85fb76d82..85e4ab61084f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6620,10 +6620,7 @@ static int check_stack_access_within_bounds(
	if (tnum_is_const(reg->var_off)) {
		min_off = reg->var_off.value + off;
-		if (access_size > 0)
-			max_off = min_off + access_size - 1;
-		else
-			max_off = min_off;
+		max_off = min_off + access_size;
	} else {
		if (reg->smax_value >= BPF_MAX_VAR_OFF ||
		    reg->smin_value <= -BPF_MAX_VAR_OFF) {
@@ -6632,15 +6629,12 @@ static int check_stack_access_within_bounds(
			return -EACCES;
		}
		min_off = reg->smin_value + off;
-		if (access_size > 0)
-			max_off = reg->smax_value + off + access_size - 1;
-		else
-			max_off = min_off;
+		max_off = reg->smax_value + off + access_size;
	}
 
	err = check_stack_slot_within_bounds(min_off, state, type);
-	if (!err)
-		err = check_stack_slot_within_bounds(max_off, state, type);
+	if (!err && max_off > 0)
+		err = -EINVAL; /* out of stack access into non-negative offsets */
 
	if (err) {
		if (tnum_is_const(reg->var_off)) {
-- cgit

From 1d38a9ee81570c4bd61f557832dead4d6f816760 Mon Sep 17 00:00:00 2001
From: Andrei Matei
Date: Wed, 6 Dec 2023 23:11:50 -0500
Subject: bpf: Guard stack limits against 32bit overflow

This patch promotes the arithmetic around checking stack bounds to be done in the 64-bit domain, instead of the current 32-bit. The arithmetic implies adding together a 64-bit register with an int offset. The register was checked to be below 1<<29 when it was variable, but not when it was fixed. The offset either comes from an instruction (in which case it is 16 bit), from another register (in which case the caller checked it to be below 1<<29 [1]), or from the size of an argument to a kfunc (in which case it can be a u32 [2]). Between the register being inconsistently checked to be below 1<<29, and the offset being up to a u32, it appears that we were open to overflowing the `int`s that were used for the arithmetic.

[1] https://github.com/torvalds/linux/blob/815fb87b753055df2d9e50f6cd80eb10235fe3e9/kernel/bpf/verifier.c#L7494-L7498
[2] https://github.com/torvalds/linux/blob/815fb87b753055df2d9e50f6cd80eb10235fe3e9/kernel/bpf/verifier.c#L11904

Reported-by: Andrii Nakryiko
Signed-off-by: Andrei Matei
Signed-off-by: Andrii Nakryiko
Acked-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20231207041150.229139-4-andreimatei1@gmail.com
---
 kernel/bpf/verifier.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 85e4ab61084f..0e77bb52542d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6577,7 +6577,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
  * The minimum valid offset is -MAX_BPF_STACK for writes, and
  * -state->allocated_stack for reads.
  */
-static int check_stack_slot_within_bounds(int off,
+static int check_stack_slot_within_bounds(s64 off,
					  struct bpf_func_state *state,
					  enum bpf_access_type t)
 {
@@ -6606,7 +6606,7 @@ static int check_stack_access_within_bounds(
	struct bpf_reg_state *regs = cur_regs(env);
	struct bpf_reg_state *reg = regs + regno;
	struct bpf_func_state *state = func(env, reg);
-	int min_off, max_off;
+	s64 min_off, max_off;
	int err;
	char *err_extra;
 
@@ -6619,7 +6619,7 @@ static int check_stack_access_within_bounds(
		err_extra = " write to";
 
	if (tnum_is_const(reg->var_off)) {
-		min_off = reg->var_off.value + off;
+		min_off = (s64)reg->var_off.value + off;
		max_off = min_off + access_size;
	} else {
		if (reg->smax_value >= BPF_MAX_VAR_OFF ||
-- cgit

From 6b4a64bafd107e521c01eec3453ce94a3fb38529 Mon Sep 17 00:00:00 2001
From: Andrei Matei
Date: Thu, 7 Dec 2023 22:25:18 -0500
Subject: bpf: Fix accesses to uninit stack slots

Privileged programs are supposed to be able to read uninitialized stack memory (ever since 6715df8d5) but, before this patch, these accesses were permitted inconsistently. In particular, accesses were permitted above state->allocated_stack, but not below it. In other words, if the stack was already "large enough", the access was permitted, but otherwise the access was rejected instead of being allowed to "grow the stack". This undesired rejection was happening in two places:
- in check_stack_slot_within_bounds()
- in check_stack_range_initialized()

This patch arranges for these accesses to be permitted. A bunch of tests that were relying on the old rejection had to change; all of them were changed to also run unprivileged, in which case the old behavior persists. One test couldn't be updated - global_func16 - because it can't run unprivileged for other reasons.

This patch also fixes the tracking of the stack size for variable-offset reads. This second fix is bundled in the same commit as the first one because they're inter-related. Before this patch, writes to the stack using registers containing a variable offset (as opposed to registers with fixed, known values) were not properly contributing to the function's needed stack size. As a result, it was possible for a program to verify, but then to attempt to read out-of-bounds data at runtime because too small a stack had been allocated for it.

Each function tracks the size of the stack it needs in bpf_subprog_info.stack_depth, which is maintained by update_stack_depth(). For regular memory accesses, check_mem_access() was calling update_stack_depth() but it was passing in only the fixed part of the offset register, ignoring the variable offset. This was incorrect; the minimum possible value of that register should be used instead.

This tracking is now fixed by centralizing the tracking of stack size in grow_stack_state(), and by lifting the calls to grow_stack_state() to check_stack_access_within_bounds() as suggested by Andrii. The code is now simpler and more convincingly tracks the correct maximum stack size. check_stack_range_initialized() can now rely on enough stack having been allocated for the access; this helps with the fix for the first issue.

A few tests were changed to also check the stack depth computation. The one that fails without this patch is verifier_var_off:stack_write_priv_vs_unpriv.
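With the growth centralized, the tail of check_stack_access_within_bounds()
ends up looking roughly like this, so every bounds-checked access also grows
the tracked stack state (sketch of the change in the diff below):

        err = check_stack_slot_within_bounds(env, min_off, state, type);
        if (!err && max_off > 0)
                err = -EINVAL; /* out of stack access into non-negative offsets */
        if (err)
                return err;

        /* -min_off is the deepest (most negative) offset the access can hit */
        return grow_stack_state(env, state, round_up(-min_off, BPF_REG_SIZE));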
Fixes: 01f810ace9ed3 ("bpf: Allow variable-offset stack access") Reported-by: Hao Sun Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231208032519.260451-3-andreimatei1@gmail.com Closes: https://lore.kernel.org/bpf/CABWLsev9g8UP_c3a=1qbuZUi20tGoUXoU07FPf-5FLvhOKOY+Q@mail.gmail.com/ --- kernel/bpf/verifier.c | 65 +++++++++------------- tools/testing/selftests/bpf/progs/iters.c | 2 +- .../selftests/bpf/progs/test_global_func16.c | 2 +- .../selftests/bpf/progs/verifier_basic_stack.c | 8 +-- .../testing/selftests/bpf/progs/verifier_int_ptr.c | 5 +- .../selftests/bpf/progs/verifier_raw_stack.c | 5 +- .../testing/selftests/bpf/progs/verifier_var_off.c | 62 +++++++++++++++++---- .../selftests/bpf/verifier/atomic_cmpxchg.c | 11 ---- tools/testing/selftests/bpf/verifier/calls.c | 4 +- 9 files changed, 92 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e77bb52542d..de1e29fa467e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1259,7 +1259,10 @@ static int resize_reference_state(struct bpf_func_state *state, size_t n) return 0; } -static int grow_stack_state(struct bpf_func_state *state, int size) +/* Possibly update state->allocated_stack to be at least size bytes. Also + * possibly update the function's high-water mark in its bpf_subprog_info. + */ +static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size) { size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE; @@ -1271,6 +1274,11 @@ static int grow_stack_state(struct bpf_func_state *state, int size) return -ENOMEM; state->allocated_stack = size; + + /* update known max for given subprogram */ + if (env->subprog_info[state->subprogno].stack_depth < size) + env->subprog_info[state->subprogno].stack_depth = size; + return 0; } @@ -4440,9 +4448,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg = NULL; int insn_flags = insn_stack_access_flags(state->frameno, spi); - err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE)); - if (err) - return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ @@ -4595,10 +4600,6 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0)) writing_zero = true; - err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE)); - if (err) - return err; - for (i = min_off; i < max_off; i++) { int spi; @@ -5774,20 +5775,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } -static int update_stack_depth(struct bpf_verifier_env *env, - const struct bpf_func_state *func, - int off) -{ - u16 stack = env->subprog_info[func->subprogno].stack_depth; - - if (stack >= -off) - return 0; - - /* update known max for given subprogram */ - env->subprog_info[func->subprogno].stack_depth = -off; - return 0; -} - /* starting from main bpf function walk all instructions of the function * and recursively walk all callees that given function can call. * Ignore jump and exit insns. @@ -6577,13 +6564,14 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, * The minimum valid offset is -MAX_BPF_STACK for writes, and * -state->allocated_stack for reads. 
*/ -static int check_stack_slot_within_bounds(s64 off, - struct bpf_func_state *state, - enum bpf_access_type t) +static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, + s64 off, + struct bpf_func_state *state, + enum bpf_access_type t) { int min_valid_off; - if (t == BPF_WRITE) + if (t == BPF_WRITE || env->allow_uninit_stack) min_valid_off = -MAX_BPF_STACK; else min_valid_off = -state->allocated_stack; @@ -6632,7 +6620,7 @@ static int check_stack_access_within_bounds( max_off = reg->smax_value + off + access_size; } - err = check_stack_slot_within_bounds(min_off, state, type); + err = check_stack_slot_within_bounds(env, min_off, state, type); if (!err && max_off > 0) err = -EINVAL; /* out of stack access into non-negative offsets */ @@ -6647,8 +6635,10 @@ static int check_stack_access_within_bounds( verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n", err_extra, regno, tn_buf, off, access_size); } + return err; } - return err; + + return grow_stack_state(env, state, round_up(-min_off, BPF_REG_SIZE)); } /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -6663,7 +6653,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; - struct bpf_func_state *state; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -6806,11 +6795,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; - state = func(env, reg); - err = update_stack_depth(env, state, off); - if (err) - return err; - if (t == BPF_READ) err = check_stack_read(env, regno, off, size, value_regno); @@ -7004,7 +6988,8 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i /* When register 'regno' is used to read the stack (either directly or through * a helper function) make sure that it's within stack boundary and, depending - * on the access type, that all elements of the stack are initialized. + * on the access type and privileges, that all elements of the stack are + * initialized. * * 'off' includes 'regno->off', but not its dynamic part (if any). * @@ -7112,8 +7097,11 @@ static int check_stack_range_initialized( slot = -i - 1; spi = slot / BPF_REG_SIZE; - if (state->allocated_stack <= slot) - goto err; + if (state->allocated_stack <= slot) { + verbose(env, "verifier bug: allocated_stack too small"); + return -EFAULT; + } + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; if (*stype == STACK_MISC) goto mark; @@ -7137,7 +7125,6 @@ static int check_stack_range_initialized( goto mark; } -err: if (tnum_is_const(reg->var_off)) { verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n", err_extra, regno, min_off, i - min_off, access_size); @@ -7162,7 +7149,7 @@ mark: * helper may write to the entire memory range. 
*/ } - return update_stack_depth(env, state, min_off); + return 0; } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index b2181f850d3e..3aca3dc145b5 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -846,7 +846,7 @@ __naked int delayed_precision_mark(void) "call %[bpf_iter_num_next];" "if r0 == 0 goto 2f;" "if r6 != 42 goto 3f;" - "r7 = -32;" + "r7 = -33;" "call %[bpf_get_prandom_u32];" "r6 = r0;" "goto 1b;\n" diff --git a/tools/testing/selftests/bpf/progs/test_global_func16.c b/tools/testing/selftests/bpf/progs/test_global_func16.c index e7206304632e..e3e64bc472cd 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func16.c +++ b/tools/testing/selftests/bpf/progs/test_global_func16.c @@ -13,7 +13,7 @@ __noinline int foo(int (*arr)[10]) } SEC("cgroup_skb/ingress") -__failure __msg("invalid indirect read from stack") +__success int global_func16(struct __sk_buff *skb) { int array[10]; diff --git a/tools/testing/selftests/bpf/progs/verifier_basic_stack.c b/tools/testing/selftests/bpf/progs/verifier_basic_stack.c index 359df865a8f3..8d77cc5323d3 100644 --- a/tools/testing/selftests/bpf/progs/verifier_basic_stack.c +++ b/tools/testing/selftests/bpf/progs/verifier_basic_stack.c @@ -27,8 +27,8 @@ __naked void stack_out_of_bounds(void) SEC("socket") __description("uninitialized stack1") -__failure __msg("invalid indirect read from stack") -__failure_unpriv +__success __log_level(4) __msg("stack depth 8") +__failure_unpriv __msg_unpriv("invalid indirect read from stack") __naked void uninitialized_stack1(void) { asm volatile (" \ @@ -45,8 +45,8 @@ __naked void uninitialized_stack1(void) SEC("socket") __description("uninitialized stack2") -__failure __msg("invalid read from stack") -__failure_unpriv +__success __log_level(4) __msg("stack depth 8") +__failure_unpriv __msg_unpriv("invalid read from stack") __naked void uninitialized_stack2(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_int_ptr.c b/tools/testing/selftests/bpf/progs/verifier_int_ptr.c index 74d9cad469d9..9fc3fae5cd83 100644 --- a/tools/testing/selftests/bpf/progs/verifier_int_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_int_ptr.c @@ -5,9 +5,10 @@ #include #include "bpf_misc.h" -SEC("cgroup/sysctl") +SEC("socket") __description("ARG_PTR_TO_LONG uninitialized") -__failure __msg("invalid indirect read from stack R4 off -16+0 size 8") +__success +__failure_unpriv __msg_unpriv("invalid indirect read from stack R4 off -16+0 size 8") __naked void arg_ptr_to_long_uninitialized(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_raw_stack.c b/tools/testing/selftests/bpf/progs/verifier_raw_stack.c index efbfc3a4ad6a..f67390224a9c 100644 --- a/tools/testing/selftests/bpf/progs/verifier_raw_stack.c +++ b/tools/testing/selftests/bpf/progs/verifier_raw_stack.c @@ -5,9 +5,10 @@ #include #include "bpf_misc.h" -SEC("tc") +SEC("socket") __description("raw_stack: no skb_load_bytes") -__failure __msg("invalid read from stack R6 off=-8 size=8") +__success +__failure_unpriv __msg_unpriv("invalid read from stack R6 off=-8 size=8") __naked void stack_no_skb_load_bytes(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_var_off.c b/tools/testing/selftests/bpf/progs/verifier_var_off.c index b7bdd7db3a35..c810f4f6f479 100644 --- 
a/tools/testing/selftests/bpf/progs/verifier_var_off.c +++ b/tools/testing/selftests/bpf/progs/verifier_var_off.c @@ -59,9 +59,10 @@ __naked void stack_read_priv_vs_unpriv(void) " ::: __clobber_all); } -SEC("lwt_in") +SEC("cgroup/skb") __description("variable-offset stack read, uninitialized") -__failure __msg("invalid variable-offset read from stack R2") +__success +__failure_unpriv __msg_unpriv("R2 variable stack access prohibited for !root") __naked void variable_offset_stack_read_uninitialized(void) { asm volatile (" \ @@ -83,12 +84,55 @@ __naked void variable_offset_stack_read_uninitialized(void) SEC("socket") __description("variable-offset stack write, priv vs unpriv") -__success __failure_unpriv +__success +/* Check that the maximum stack depth is correctly maintained according to the + * maximum possible variable offset. + */ +__log_level(4) __msg("stack depth 16") +__failure_unpriv /* Variable stack access is rejected for unprivileged. */ __msg_unpriv("R2 variable stack access prohibited for !root") __retval(0) __naked void stack_write_priv_vs_unpriv(void) +{ + asm volatile (" \ + /* Get an unknown value */ \ + r2 = *(u32*)(r1 + 0); \ + /* Make it small and 8-byte aligned */ \ + r2 &= 8; \ + r2 -= 16; \ + /* Add it to fp. We now have either fp-8 or \ + * fp-16, but we don't know which \ + */ \ + r2 += r10; \ + /* Dereference it for a stack write */ \ + r0 = 0; \ + *(u64*)(r2 + 0) = r0; \ + exit; \ +" ::: __clobber_all); +} + +/* Similar to the previous test, but this time also perform a read from the + * address written to with a variable offset. The read is allowed, showing that, + * after a variable-offset write, a priviledged program can read the slots that + * were in the range of that write (even if the verifier doesn't actually know if + * the slot being read was really written to or not. + * + * Despite this test being mostly a superset, the previous test is also kept for + * the sake of it checking the stack depth in the case where there is no read. + */ +SEC("socket") +__description("variable-offset stack write followed by read") +__success +/* Check that the maximum stack depth is correctly maintained according to the + * maximum possible variable offset. + */ +__log_level(4) __msg("stack depth 16") +__failure_unpriv +__msg_unpriv("R2 variable stack access prohibited for !root") +__retval(0) +__naked void stack_write_followed_by_read(void) { asm volatile (" \ /* Get an unknown value */ \ @@ -103,12 +147,7 @@ __naked void stack_write_priv_vs_unpriv(void) /* Dereference it for a stack write */ \ r0 = 0; \ *(u64*)(r2 + 0) = r0; \ - /* Now read from the address we just wrote. This shows\ - * that, after a variable-offset write, a priviledged\ - * program can read the slots that were in the range of\ - * that write (even if the verifier doesn't actually know\ - * if the slot being read was really written to or not.\ - */ \ + /* Now read from the address we just wrote. 
*/ \ r3 = *(u64*)(r2 + 0); \ r0 = 0; \ exit; \ @@ -282,9 +321,10 @@ __naked void access_min_out_of_bound(void) : __clobber_all); } -SEC("lwt_in") +SEC("cgroup/skb") __description("indirect variable-offset stack access, min_off < min_initialized") -__failure __msg("invalid indirect read from stack R2 var_off") +__success +__failure_unpriv __msg_unpriv("R2 variable stack access prohibited for !root") __naked void access_min_off_min_initialized(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c index 319337bdcfc8..9a7b1106fda8 100644 --- a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c +++ b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c @@ -83,17 +83,6 @@ .result = REJECT, .errstr = "!read_ok", }, -{ - "Can't use cmpxchg on uninit memory", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 3), - BPF_MOV64_IMM(BPF_REG_2, 4), - BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_10, BPF_REG_2, -8), - BPF_EXIT_INSN(), - }, - .result = REJECT, - .errstr = "invalid read from stack", -}, { "BPF_W cmpxchg should zero top 32 bits", .insns = { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 3d5cd51071f0..ab25a81fd3a1 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -1505,7 +1505,9 @@ .prog_type = BPF_PROG_TYPE_XDP, .fixup_map_hash_8b = { 23 }, .result = REJECT, - .errstr = "invalid read from stack R7 off=-16 size=8", + .errstr = "R0 invalid mem access 'scalar'", + .result_unpriv = REJECT, + .errstr_unpriv = "invalid read from stack R7 off=-16 size=8", }, { "calls: two calls that receive map_value via arg=ptr_stack_of_caller. test1", -- cgit From 2929bfac006d8f8e22b307d04e0d71bcb84db698 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Thu, 7 Dec 2023 22:25:19 -0500 Subject: bpf: Minor cleanup around stack bounds Push the rounding up of stack offsets into the function responsible for growing the stack, rather than relying on all the callers to do it. Uncertainty about whether the callers did it or not tripped up people in a previous review. Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20231208032519.260451-4-andreimatei1@gmail.com --- kernel/bpf/verifier.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index de1e29fa467e..fb690539d5f6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1264,7 +1264,11 @@ static int resize_reference_state(struct bpf_func_state *state, size_t n) */ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size) { - size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE; + size_t old_n = state->allocated_stack / BPF_REG_SIZE, n; + + /* The stack size is always a multiple of BPF_REG_SIZE. */ + size = round_up(size, BPF_REG_SIZE); + n = size / BPF_REG_SIZE; if (old_n >= n) return 0; @@ -6638,7 +6642,10 @@ static int check_stack_access_within_bounds( return err; } - return grow_stack_state(env, state, round_up(-min_off, BPF_REG_SIZE)); + /* Note that there is no stack access with offset zero, so the needed stack + * size is -min_off, not -min_off+1. 
+ */ + return grow_stack_state(env, state, -min_off /* size */); } /* check whether memory at (regno + off) is accessible for t = (read | write) -- cgit From 73d9eb340d2b95e0e86a656a7f3157c137f10129 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 6 Dec 2023 11:53:24 +0000 Subject: bpf: Enable bpf_cgrp_storage for cgroup1 non-attach case In the current cgroup1 environment, associating operations between cgroups and applications in a BPF program requires storing a mapping of cgroup_id to application either in a hash map or maintaining it in userspace. However, by enabling bpf_cgrp_storage for cgroup1, it becomes possible to conveniently store application-specific information in cgroup-local storage and utilize it within BPF programs. Furthermore, enabling this feature for cgroup1 involves minor modifications for the non-attach case, streamlining the process. However, when it comes to enabling this functionality for the cgroup1 attach case, it presents challenges. Therefore, the decision is to focus on enabling it solely for the cgroup1 non-attach case at present. If attempting to attach to a cgroup1 fd, the operation will simply fail with the error code -EBADF. Signed-off-by: Yafang Shao Acked-by: Tejun Heo Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231206115326.4295-2-laoar.shao@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/bpf_cgrp_storage.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index d44fe8dd9732..28efd0a3f220 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -82,7 +82,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) int fd; fd = *(int *)key; - cgroup = cgroup_get_from_fd(fd); + cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return ERR_CAST(cgroup); @@ -101,7 +101,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, int fd; fd = *(int *)key; - cgroup = cgroup_get_from_fd(fd); + cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); @@ -131,7 +131,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) int err, fd; fd = *(int *)key; - cgroup = cgroup_get_from_fd(fd); + cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); -- cgit From c26f2a8901393c9f81909da0a4324587092bd3a3 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 8 Dec 2023 18:23:49 +0800 Subject: bpf: Remove unnecessary wait from bpf_map_copy_value() Both map_lookup_elem() and generic_map_lookup_batch() use bpf_map_copy_value() to lookup and copy the value, and there is no update operation in bpf_map_copy_value(), so just remove the invocation of maybe_wait_bpf_programs() from it. 
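The change is mechanical; the tail of bpf_map_copy_value() now simply
re-enables instrumentation and returns the lookup result (sketch):

        bpf_enable_instrumentation();
        return err;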
Fixes: 15c14a3dca42 ("bpf: Add bpf_map_{value_size, update_value, map_copy_value} functions")
Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20231208102355.2628918-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/syscall.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index aff045eed375..9ad3f527ab37 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -264,7 +264,6 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
	}
	bpf_enable_instrumentation();
-	maybe_wait_bpf_programs(map);
 
	return err;
 }
-- cgit

From 37ba5b59d6adfa08926acd3a833608487a18c2ef Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 8 Dec 2023 18:23:50 +0800
Subject: bpf: Call maybe_wait_bpf_programs() only once for generic_map_update_batch()

Just like commit 9087c6ff8dfe ("bpf: Call maybe_wait_bpf_programs() only once from generic_map_delete_batch()"), there is also no need to call maybe_wait_bpf_programs() for each update in batched update, so only call it once in generic_map_update_batch().

Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20231208102355.2628918-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/syscall.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9ad3f527ab37..07e671431987 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -203,7 +203,6 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
		rcu_read_unlock();
	}
	bpf_enable_instrumentation();
-	maybe_wait_bpf_programs(map);
 
	return err;
 }
@@ -1577,6 +1576,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
	}
 
	err = bpf_map_update_value(map, f.file, key, value, attr->flags);
+	maybe_wait_bpf_programs(map);
 
	kvfree(value);
 free_key:
@@ -1816,6 +1816,8 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
 
	kvfree(value);
	kvfree(key);
+
+	maybe_wait_bpf_programs(map);
	return err;
 }
-- cgit

From 012772581d040607ac1f981f47f6afd2336b4580 Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 8 Dec 2023 18:23:51 +0800
Subject: bpf: Add missed maybe_wait_bpf_programs() for htab of maps

When doing batched lookup and deletion operations on an htab of maps, maybe_wait_bpf_programs() is needed to ensure all programs don't use the inner map after the bpf syscall returns. Instead of adding the wait in __htab_map_lookup_and_delete_batch(), add the wait in bpf_map_do_batch() and also remove the calls to maybe_wait_bpf_programs() from generic_map_{delete,update}_batch().
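After this change the wait lives in exactly one place, on the write path of
bpf_map_do_batch(); roughly (sketch of the resulting error path, see the diff
below):

        err_put:
                if (has_write) {
                        maybe_wait_bpf_programs(map);
                        bpf_map_write_active_dec(map);
                }
                fdput(f);
                return err;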
Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20231208102355.2628918-4-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/syscall.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 07e671431987..2e6ef361da1c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1758,7 +1758,6 @@ int generic_map_delete_batch(struct bpf_map *map,
 
	kvfree(key);
-
-	maybe_wait_bpf_programs(map);
	return err;
 }
@@ -1817,7 +1816,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
	kvfree(value);
	kvfree(key);
-	maybe_wait_bpf_programs(map);
	return err;
 }
@@ -5031,8 +5029,10 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
	else
		BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
 err_put:
-	if (has_write)
+	if (has_write) {
+		maybe_wait_bpf_programs(map);
		bpf_map_write_active_dec(map);
+	}
	fdput(f);
	return err;
 }
-- cgit

From 67ad2c73ff29b32bd09135ec07c26e59490dbb3b Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 8 Dec 2023 18:23:52 +0800
Subject: bpf: Only call maybe_wait_bpf_programs() when map operation succeeds

There is no need to call maybe_wait_bpf_programs() if the update or deletion operation fails. So only call maybe_wait_bpf_programs() if the update or deletion operation succeeds.

Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20231208102355.2628918-5-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/syscall.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2e6ef361da1c..dd641475b65f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1576,7 +1576,8 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
	}
 
	err = bpf_map_update_value(map, f.file, key, value, attr->flags);
-	maybe_wait_bpf_programs(map);
+	if (!err)
+		maybe_wait_bpf_programs(map);
 
	kvfree(value);
 free_key:
@@ -1632,7 +1633,8 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
	err = map->ops->map_delete_elem(map, key);
	rcu_read_unlock();
	bpf_enable_instrumentation();
-	maybe_wait_bpf_programs(map);
+	if (!err)
+		maybe_wait_bpf_programs(map);
 out:
	kvfree(key);
 err_put:
-- cgit

From 06e5c999f10269a532304e89a6adb2fbfeb0593c Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 8 Dec 2023 18:23:53 +0800
Subject: bpf: Set uattr->batch.count as zero before batched update or deletion

generic_map_{delete,update}_batch() don't set uattr->batch.count to zero before they try to allocate memory for the key. If the memory allocation fails, the value of uattr->batch.count will be incorrect. Fix it by setting uattr->batch.count to zero before the batched update or deletion.
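Both generic batch helpers now zero the user-visible count before any
allocation can fail; roughly (sketch of the resulting code, see the diff
below):

        if (!max_count)
                return 0;

        if (put_user(0, &uattr->batch.count))
                return -EFAULT;

        key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
        if (!key)
                return -ENOMEM;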
Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231208102355.2628918-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dd641475b65f..06320d9abf33 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1731,6 +1731,9 @@ int generic_map_delete_batch(struct bpf_map *map, if (!max_count) return 0; + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; @@ -1787,6 +1790,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, if (!max_count) return 0; + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; -- cgit From 482d548d40b0af9af730e4869903d4433e44f014 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 8 Dec 2023 17:09:57 -0800 Subject: bpf: handle fake register spill to stack with BPF_ST_MEM instruction When verifier validates BPF_ST_MEM instruction that stores known constant to stack (e.g., *(u64 *)(r10 - 8) = 123), it effectively spills a fake register with a constant (but initially imprecise) value to a stack slot. Because read-side logic treats it as a proper register fill from stack slot, we need to mark such stack slot initialization as INSN_F_STACK_ACCESS instruction to stop precision backtracking from missing it. Fixes: 41f6f64e6999 ("bpf: support non-r10 register spill/fill to/from stack in precision tracking") Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231209010958.66758-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fb690539d5f6..727a59e4a647 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4498,7 +4498,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, __mark_reg_known(&fake_reg, insn->imm); fake_reg.type = SCALAR_VALUE; save_register_state(env, state, spi, &fake_reg, size); - insn_flags = 0; /* not a register spill */ } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { -- cgit From a6de18f310a511278c1ff16b96eb2d500eada725 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Thu, 7 Dec 2023 15:08:42 -0600 Subject: bpf: Add bpf_cpumask_weight() kfunc It can be useful to query how many bits are set in a cpumask. For example, if you want to perform special logic for the last remaining core that's set in a mask. Let's therefore add a new bpf_cpumask_weight() kfunc which checks how many bits are set in a mask. Signed-off-by: David Vernet Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231207210843.168466-2-void@manifault.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/cpumasks.rst | 2 +- kernel/bpf/cpumask.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/bpf/cpumasks.rst b/Documentation/bpf/cpumasks.rst index a22b6ad105fb..b5d47a04da5d 100644 --- a/Documentation/bpf/cpumasks.rst +++ b/Documentation/bpf/cpumasks.rst @@ -352,7 +352,7 @@ can be used to query the contents of cpumasks. .. 
kernel-doc:: kernel/bpf/cpumask.c :identifiers: bpf_cpumask_first bpf_cpumask_first_zero bpf_cpumask_first_and - bpf_cpumask_test_cpu + bpf_cpumask_test_cpu bpf_cpumask_weight .. kernel-doc:: kernel/bpf/cpumask.c :identifiers: bpf_cpumask_equal bpf_cpumask_intersects bpf_cpumask_subset diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index e01c741e54e7..7499b7d8c06f 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -405,6 +405,17 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, return cpumask_any_and_distribute(src1, src2); } +/** + * bpf_cpumask_weight() - Return the number of bits in @cpumask. + * @cpumask: The cpumask being queried. + * + * Count the number of set bits in the given cpumask. + */ +__bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) +{ + return cpumask_weight(cpumask); +} + __bpf_kfunc_end_defs(); BTF_SET8_START(cpumask_kfunc_btf_ids) @@ -432,6 +443,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) BTF_SET8_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { -- cgit From d2406291483775ecddaee929231a39c70c08fda2 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:45 +0800 Subject: fork: use __mt_dup() to duplicate maple tree in dup_mmap() In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then directly replacing the entries of VMAs in the new maple tree can result in better performance. __mt_dup() uses DFS pre-order to duplicate the maple tree, so it is efficient. The average time complexity of __mt_dup() is O(n), where n is the number of VMAs. The proof of the time complexity is provided in the commit log that introduces __mt_dup(). After duplicating the maple tree, each element is traversed and replaced (ignoring the cases of deletion, which are rare). Since it is only a replacement operation for each element, this process is also O(n). Analyzing the exact time complexity of the previous algorithm is challenging because each insertion can involve appending to a node, pushing data to adjacent nodes, or even splitting nodes. The frequency of each action is difficult to calculate. The worst-case scenario for a single insertion is when the tree undergoes splitting at every level. If we consider each insertion as the worst-case scenario, we can determine that the upper bound of the time complexity is O(n*log(n)), although this is a loose upper bound. However, based on the test data, it appears that the actual time complexity is likely to be O(n). As the entire maple tree is duplicated using __mt_dup(), if dup_mmap() fails, there will be a portion of VMAs that have not been duplicated in the maple tree. To handle this, we mark the failure point with XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, stop releasing VMAs that have not been duplicated after this point. There is a "spawn" in byte-unixbench[1], which can be used to test the performance of fork(). I modified it slightly to make it work with different number of VMAs. Below are the test results. The first row shows the number of VMAs. The second and third rows show the number of fork() calls per ten seconds, corresponding to next-20231006 and the this patchset, respectively. 
The test results were obtained with CPU binding to avoid scheduler load balancing that could cause unstable results. There are still some fluctuations in the test results, but at least they are better than the original performance.

VMAs            21      121     221     421     821     1621    3221    6421    12821   25621   51221
next-20231006   112100  76261   54227   34035   20195   11112   6017    3161    1606    802     393
this patchset   114558  83067   65008   45824   28751   16072   8922    4747    2436    1233    599
improvement     2.19%   8.92%   19.88%  34.64%  42.37%  44.64%  48.28%  50.17%  51.68%  53.74%  52.42%

[1] https://github.com/kdlucas/byte-unixbench/tree/master

Link: https://lkml.kernel.org/r/20231027033845.90608-11-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang
Suggested-by: Liam R. Howlett
Reviewed-by: Liam R. Howlett
Cc: Christian Brauner
Cc: Jonathan Corbet
Cc: Mateusz Guzik
Cc: Mathieu Desnoyers
Cc: Matthew Wilcox
Cc: Michael S. Tsirkin
Cc: Mike Christie
Cc: Nicholas Piggin
Cc: Peter Zijlstra
Cc: Suren Baghdasaryan
Signed-off-by: Andrew Morton
---
 include/linux/mm.h | 11 +++++++++++
 kernel/fork.c      | 40 +++++++++++++++++++++++++++++-----------
 mm/internal.h      | 11 -----------
 mm/memory.c        |  7 ++++++-
 mm/mmap.c          |  9 ++++++---
 5 files changed, 52 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 418d26608ece..64cd1ee4aacc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -994,6 +994,17 @@ static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
	return mas_expected_entries(&vmi->mas, count);
 }
 
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+			unsigned long start, unsigned long end, gfp_t gfp)
+{
+	__mas_set_range(&vmi->mas, start, end - 1);
+	mas_store_gfp(&vmi->mas, NULL, gfp);
+	if (unlikely(mas_is_err(&vmi->mas)))
+		return -ENOMEM;
+
+	return 0;
+}
+
 /* Free any unused preallocations */
 static inline void vma_iter_free(struct vma_iterator *vmi)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..93924392a5c3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -650,7 +650,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
	int retval;
	unsigned long charge = 0;
	LIST_HEAD(uf);
-	VMA_ITERATOR(old_vmi, oldmm, 0);
	VMA_ITERATOR(vmi, mm, 0);
 
	uprobe_start_dup_mmap();
@@ -678,16 +677,22 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
		goto out;
	khugepaged_fork(mm, oldmm);
 
-	retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
-	if (retval)
+	/* Use __mt_dup() to efficiently build an identical maple tree. */
+	retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+	if (unlikely(retval))
		goto out;
 
	mt_clear_in_rcu(vmi.mas.tree);
-	for_each_vma(old_vmi, mpnt) {
+	for_each_vma(vmi, mpnt) {
		struct file *file;
 
		vma_start_write(mpnt);
		if (mpnt->vm_flags & VM_DONTCOPY) {
+			retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+						    mpnt->vm_end, GFP_KERNEL);
+			if (retval)
+				goto loop_out;
+
			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
			continue;
		}
@@ -749,9 +754,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
		if (is_vm_hugetlb_page(tmp))
			hugetlb_dup_vma_private(tmp);
 
-		/* Link the vma into the MT */
-		if (vma_iter_bulk_store(&vmi, tmp))
-			goto fail_nomem_vmi_store;
+		/*
+		 * Link the vma into the MT. After using __mt_dup(), memory
+		 * allocation is not necessary here, so it cannot fail.
+ */ + vma_iter_bulk_store(&vmi, tmp); mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -760,15 +767,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); - if (retval) + if (retval) { + mpnt = vma_next(&vmi); goto loop_out; + } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); - if (!retval) + if (!retval) { mt_set_in_rcu(vmi.mas.tree); + } else if (mpnt) { + /* + * The entire maple tree has already been duplicated. If the + * mmap duplication fails, mark the failure point with + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, + * stop releasing VMAs that have not been duplicated after this + * point. + */ + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store(&vmi.mas, XA_ZERO_ENTRY); + } out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -778,8 +798,6 @@ fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -fail_nomem_vmi_store: - unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: diff --git a/mm/internal.h b/mm/internal.h index b61034bd50f5..89a5a794d68f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1154,17 +1154,6 @@ static inline void vma_iter_clear(struct vma_iterator *vmi) mas_store_prealloc(&vmi->mas, NULL); } -static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, - unsigned long start, unsigned long end, gfp_t gfp) -{ - __mas_set_range(&vmi->mas, start, end - 1); - mas_store_gfp(&vmi->mas, NULL, gfp); - if (unlikely(mas_is_err(&vmi->mas))) - return -ENOMEM; - - return 0; -} - static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) { return mas_walk(&vmi->mas); diff --git a/mm/memory.c b/mm/memory.c index 5c757fba8858..a7025ed5c65b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -374,6 +374,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, * be 0. This will underflow and is okay. 
 */
	next = mas_find(mas, ceiling - 1);
+	if (unlikely(xa_is_zero(next)))
+		next = NULL;
 
	/*
	 * Hide vma from rmap and truncate_pagecache before freeing
@@ -395,6 +397,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
		       && !is_vm_hugetlb_page(next)) {
			vma = next;
			next = mas_find(mas, ceiling - 1);
+			if (unlikely(xa_is_zero(next)))
+				next = NULL;
			if (mm_wr_locked)
				vma_start_write(vma);
			unlink_anon_vmas(vma);
@@ -1744,7 +1748,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
		unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked);
		hugetlb_zap_end(vma, &details);
-	} while ((vma = mas_find(mas, tree_end - 1)) != NULL);
+		vma = mas_find(mas, tree_end - 1);
+	} while (vma && likely(!xa_is_zero(vma)));
	mmu_notifier_invalidate_range_end(&range);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 1971bfffcc03..4f1cb814586d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3294,10 +3294,11 @@ void exit_mmap(struct mm_struct *mm)
	arch_exit_mmap(mm);
 
	vma = mas_find(&mas, ULONG_MAX);
-	if (!vma) {
+	if (!vma || unlikely(xa_is_zero(vma))) {
		/* Can happen if dup_mmap() received an OOM */
		mmap_read_unlock(mm);
-		return;
+		mmap_write_lock(mm);
+		goto destroy;
	}
 
	lru_add_drain();
@@ -3332,11 +3333,13 @@ void exit_mmap(struct mm_struct *mm)
		remove_vma(vma, true);
		count++;
		cond_resched();
-	} while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
+		vma = mas_find(&mas, ULONG_MAX);
+	} while (vma && likely(!xa_is_zero(vma)));
 
	BUG_ON(count != mm->map_count);
 
	trace_exit_mmap(mm);
+destroy:
	__mt_destroy(&mm->mm_mt);
	mmap_write_unlock(mm);
	vm_unacct_memory(nr_accounted);
-- cgit

From a9a1d6ad668f2ea0b5a77d0c4c7a41446d0801a8 Mon Sep 17 00:00:00 2001
From: Dongmin Lee
Date: Sat, 4 Nov 2023 20:33:20 +0900
Subject: kernel/reboot: explicitly notify if halt occurred instead of power off

When kernel_can_power_off() returns false and reboot was called with LINUX_REBOOT_CMD_POWER_OFF, kernel_halt() will be initiated instead of the actual power off function. However, in this situation, the kernel never explicitly notifies the user that the system halted instead of the requested power off.

Since halt and power off behave differently, and the user initiated the reboot call with the power off command, not halt, this could be unintended behavior for the user, like this:

~ # poweroff -f
[ 3.581482] reboot: System halted

Therefore, this explicitly notifies the user that poweroff is not available and that halting has occurred as an alternative behavior instead:

~ # poweroff -f
[ 4.123668] reboot: Power off not available: System halted

[akpm@linux-foundation.org: tweak comment text]
Link: https://lkml.kernel.org/r/20231104113320.72440-1-ldmldm05@gmail.com
Signed-off-by: Dongmin Lee
Signed-off-by: Andrew Morton
---
 kernel/reboot.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 395a0ea3c7a8..c3a3b82c4f64 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -58,6 +58,14 @@ struct sys_off_handler {
	struct device *dev;
 };
 
+/*
+ * This variable is used to indicate if a halt was initiated instead of a
+ * reboot when the reboot call was invoked with LINUX_REBOOT_CMD_POWER_OFF, but
+ * the system cannot be powered off. This allows kernel_halt() to notify users
+ * of that.
+ */
+static bool poweroff_fallback_to_halt;
+
 /*
  * Temporary stub that prevents linkage failure while we're in process
 * of removing all uses of legacy pm_power_off() around the kernel.
@@ -297,7 +305,10 @@ void kernel_halt(void) kernel_shutdown_prepare(SYSTEM_HALT); migrate_to_reboot_cpu(); syscore_shutdown(); - pr_emerg("System halted\n"); + if (poweroff_fallback_to_halt) + pr_emerg("Power off not available: System halted instead\n"); + else + pr_emerg("System halted\n"); kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_halt(); } @@ -732,8 +743,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) { + poweroff_fallback_to_halt = true; cmd = LINUX_REBOOT_CMD_HALT; + } mutex_lock(&system_transition_mutex); switch (cmd) { -- cgit From 61a7a5e25fe79b6c43f1c49705a0294be113c4a5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 30 Oct 2023 16:57:10 +0100 Subject: introduce for_other_threads(p, t) Cosmetic, but imho it makes the usage look more clear and simple, the new helper doesn't require to initialize "t". After this change while_each_thread() has only 3 users, and it is only used in the do/while loops. Link: https://lkml.kernel.org/r/20231030155710.GA9095@redhat.com Signed-off-by: Oleg Nesterov Reviewed-by: Christian Brauner Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- fs/exec.c | 3 +-- include/linux/sched/signal.h | 3 +++ kernel/signal.c | 11 ++++------- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 4aa19b24f281..ee43597cb453 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1578,11 +1578,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... 
*/ - t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); - while_each_thread(p, t) { + for_other_threads(p, t) { if (t->fs == p->fs) n_fs++; } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3499c1a8b929..41d6759d6a4a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -646,6 +646,9 @@ extern bool current_is_single_threaded(void); #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +#define for_other_threads(p, t) \ + for (t = p; (t = next_thread(t)) != p; ) + #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ lockdep_is_held(&tasklist_lock)) diff --git a/kernel/signal.c b/kernel/signal.c index 47a7602dfe8d..5aa216e841a2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1376,12 +1376,12 @@ int force_sig_info(struct kernel_siginfo *info) */ int zap_other_threads(struct task_struct *p) { - struct task_struct *t = p; + struct task_struct *t; int count = 0; p->signal->group_stop_count = 0; - while_each_thread(p, t) { + for_other_threads(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); /* Don't require de_thread to wait for the vhost_worker */ if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER) @@ -2465,12 +2465,10 @@ static bool do_signal_stop(int signr) sig->group_exit_code = signr; sig->group_stop_count = 0; - if (task_set_jobctl_pending(current, signr | gstop)) sig->group_stop_count++; - t = current; - while_each_thread(current, t) { + for_other_threads(current, t) { /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, @@ -2966,8 +2964,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) if (sigisemptyset(&retarget)) return; - t = tsk; - while_each_thread(tsk, t) { + for_other_threads(tsk, t) { if (t->flags & PF_EXITING) continue; -- cgit From f72709ab69430d986dfc5a08c9a86f625e3fed33 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Nov 2023 14:36:36 +0100 Subject: arch: remove ARCH_THREAD_STACK_ALLOCATOR Patch series "Remove unused code after IA-64 removal". While looking into something different I noticed that there are a couple of Kconfig options which were only selected by IA-64 and which are now unused. So remove them and simplify the code a bit. This patch (of 3): IA-64 was the only architecture which selected ARCH_THREAD_STACK_ALLOCATOR. IA-64 was removed with commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"). Therefore remove support for ARCH_THREAD_STACK_ALLOCATOR as well. Link: https://lkml.kernel.org/r/20231116133638.1636277-1-hca@linux.ibm.com Link: https://lkml.kernel.org/r/20231116133638.1636277-2-hca@linux.ibm.com Signed-off-by: Heiko Carstens Reviewed-by: Arnd Bergmann Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/Kconfig | 4 ---- kernel/fork.c | 20 -------------------- 2 files changed, 24 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index f4b210ab0612..310162b41a1c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -320,10 +320,6 @@ config HAVE_ARCH_THREAD_STRUCT_WHITELIST should be implemented. Without this, the entire thread_struct field in task_struct will be left whitelisted. 
-# Select if arch has its private alloc_thread_stack() function -config ARCH_THREAD_STACK_ALLOCATOR - bool - # Select if arch wants to size task_struct dynamically via arch_task_struct_size: config ARCH_WANTS_DYNAMIC_TASK_STRUCT bool diff --git a/kernel/fork.c b/kernel/fork.c index 10917c3e1f03..d071809866e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -179,8 +179,6 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR - /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. @@ -412,24 +410,6 @@ void thread_stack_cache_init(void) } # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ -#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ - -static int alloc_thread_stack_node(struct task_struct *tsk, int node) -{ - unsigned long *stack; - - stack = arch_alloc_thread_stack_node(tsk, node); - tsk->stack = stack; - return stack ? 0 : -ENOMEM; -} - -static void free_thread_stack(struct task_struct *tsk) -{ - arch_free_thread_stack(tsk); - tsk->stack = NULL; -} - -#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; -- cgit From 3888750e21ccb909051c810cc79fcc0650a740f8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Nov 2023 14:36:37 +0100 Subject: arch: remove ARCH_TASK_STRUCT_ALLOCATOR IA-64 was the only architecture which selected ARCH_TASK_STRUCT_ALLOCATOR. IA-64 was removed with commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"). Therefore remove support for ARCH_TASK_STRUCT_ALLOCATOR as well. Link: https://lkml.kernel.org/r/20231116133638.1636277-3-hca@linux.ibm.com Signed-off-by: Heiko Carstens Reviewed-by: Arnd Bergmann Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/Kconfig | 5 ----- kernel/fork.c | 6 ------ 2 files changed, 11 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 310162b41a1c..c2f87ef9f0ae 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -305,13 +305,8 @@ config ARCH_HAS_CPU_FINALIZE_INIT config ARCH_TASK_STRUCT_ON_STACK bool -# Select if arch has its private alloc_task_struct() function -config ARCH_TASK_STRUCT_ALLOCATOR - bool - config HAVE_ARCH_THREAD_STRUCT_WHITELIST bool - depends on !ARCH_TASK_STRUCT_ALLOCATOR help An architecture should select this to provide hardened usercopy knowledge about what region of the thread_struct should be diff --git a/kernel/fork.c b/kernel/fork.c index d071809866e0..ce8a4b8c04e2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -165,7 +165,6 @@ void __weak arch_release_task_struct(struct task_struct *tsk) { } -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static struct kmem_cache *task_struct_cachep; static inline struct task_struct *alloc_task_struct_node(int node) @@ -177,7 +176,6 @@ static inline void free_task_struct(struct task_struct *tsk) { kmem_cache_free(task_struct_cachep, tsk); } -#endif /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a @@ -1001,7 +999,6 @@ static void set_max_threads(unsigned int max_threads_suggested) int arch_task_struct_size __read_mostly; #endif -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static void task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture.
*/ @@ -1016,12 +1013,10 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size) else *offset += offsetof(struct task_struct, thread); } -#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */ void __init fork_init(void) { int i; -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN 0 #endif @@ -1034,7 +1029,6 @@ void __init fork_init(void) arch_task_struct_size, align, SLAB_PANIC|SLAB_ACCOUNT, useroffset, usersize, NULL); -#endif /* do the arch specific task caches init */ arch_task_cache_init(); -- cgit From b454ec29225cda9ae85ed0a154f4228f1922c872 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Nov 2023 16:16:49 +0100 Subject: kernel/signal.c: simplify force_sig_info_to_task(), kill recalc_sigpending_and_wake() The purpose of recalc_sigpending_and_wake() is not clear, it looks "obviously unneeded" because we are going to send the signal which can't be blocked or ignored. Add the comment to explain why we can't rely on send_signal_locked() and make this logic more simple/explicit. recalc_sigpending_and_wake() has no other users, it can die. In fact I think we don't even need signal_wake_up(), the target task must be either current or a TASK_TRACED child, otherwise the usage of siglock is not safe. But this needs another change. Link: https://lkml.kernel.org/r/20231120151649.GA15995@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric Biederman Signed-off-by: Andrew Morton --- include/linux/sched/signal.h | 1 - kernel/signal.c | 17 ++++------------- 2 files changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 41d6759d6a4a..015c0e3a3e1d 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -432,7 +432,6 @@ static inline bool fault_signal_pending(vm_fault_t fault_flags, * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ -extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); extern void calculate_sigpending(void); diff --git a/kernel/signal.c b/kernel/signal.c index 5aa216e841a2..c9c57d053ce4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -171,16 +171,6 @@ static bool recalc_sigpending_tsk(struct task_struct *t) return false; } -/* - * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up. - * This is superfluous when called on current, the wakeup is a harmless no-op. 
- */ -void recalc_sigpending_and_wake(struct task_struct *t) -{ - if (recalc_sigpending_tsk(t)) - signal_wake_up(t, 0); -} - void recalc_sigpending(void) { if (!recalc_sigpending_tsk(current) && !freezing(current)) @@ -1348,10 +1338,8 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, action->sa.sa_handler = SIG_DFL; if (handler == HANDLER_EXIT) action->sa.sa_flags |= SA_IMMUTABLE; - if (blocked) { + if (blocked) sigdelset(&t->blocked, sig); - recalc_sigpending_and_wake(t); - } } /* * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect @@ -1361,6 +1349,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = send_signal_locked(sig, info, t, PIDTYPE_PID); + /* This can happen if the signal was already pending and blocked */ + if (!task_sigpending(t)) + signal_wake_up(t, 0); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; -- cgit From 27bbb2a0fddf70e185e800cd78f0142d45330c6c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Nov 2023 17:26:50 +0100 Subject: __ptrace_unlink: kill the obsolete "FIXME" code The corner case described by the comment is no longer possible after the commit 7b3c36fc4c23 ("ptrace: fix task_join_group_stop() for the case when current is traced"), task_join_group_stop() ensures that the new thread has the correct signr in JOBCTL_STOP_SIGMASK regardless of ptrace. Link: https://lkml.kernel.org/r/20231121162650.GA6635@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric Biederman Signed-off-by: Andrew Morton --- kernel/ptrace.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d8b5e13a2229..3617213c3d8a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -145,20 +145,9 @@ void __ptrace_unlink(struct task_struct *child) */ if (!(child->flags & PF_EXITING) && (child->signal->flags & SIGNAL_STOP_STOPPED || - child->signal->group_stop_count)) { + child->signal->group_stop_count)) child->jobctl |= JOBCTL_STOP_PENDING; - /* - * This is only possible if this thread was cloned by the - * traced task running in the stopped group, set the signal - * for the future reports. - * FIXME: we should change ptrace_init_task() to handle this - * case. - */ - if (!(child->jobctl & JOBCTL_STOP_SIGMASK)) - child->jobctl |= SIGSTOP; - } - /* * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick * @child in the butt. Note that @resume should be used iff @child -- cgit From 0311d8272406b2ec47f485bef887723cc352a489 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 14 Nov 2023 17:12:01 +0100 Subject: kexec: use atomic_try_cmpxchg in crash_kexec Use atomic_try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in crash_kexec(). x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg. No functional change intended. Link: https://lkml.kernel.org/r/20231114161228.108516-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Baoquan He Cc: Eric Biederman Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index be5642a4ec49..bc4c096ab1f3 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1063,9 +1063,10 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs) * panic(). 
Otherwise parallel calls of panic() and crash_kexec() * may stop each other. To exclude them, we use panic_cpu here too. */ + old_cpu = PANIC_CPU_INVALID; this_cpu = raw_smp_processor_id(); - old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); - if (old_cpu == PANIC_CPU_INVALID) { + + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { /* This is the 1st CPU which comes here, so go ahead. */ __crash_kexec(regs); -- cgit From b1c3efe07987592c16d5f59ce235e6ddbea65a73 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Nov 2023 12:05:03 +0100 Subject: sched: fair: move unused stub functions to header These four functions have a normal definition for CONFIG_FAIR_GROUP_SCHED, and an empty one that is only referenced when FAIR_GROUP_SCHED is disabled but CGROUP_SCHED is still enabled. If both are turned off, the functions are still defined but the missing prototype causes a W=1 warning: kernel/sched/fair.c:12544:6: error: no previous prototype for 'free_fair_sched_group' kernel/sched/fair.c:12546:5: error: no previous prototype for 'alloc_fair_sched_group' kernel/sched/fair.c:12553:6: error: no previous prototype for 'online_fair_sched_group' kernel/sched/fair.c:12555:6: error: no previous prototype for 'unregister_fair_sched_group' Move the alternatives into the header as static inline functions with the correct combination of #ifdef checks to avoid the warning without adding even more complexity. [A different patch with the same description got applied by accident and was later reverted, but the original patch is still missing] Link: https://lkml.kernel.org/r/20231123110506.707903-4-arnd@kernel.org Fixes: 7aa55f2a5902 ("sched/fair: Move unused stub functions to header") Signed-off-by: Arnd Bergmann Cc: "David S. Miller" Cc: David Woodhouse Cc: Dinh Nguyen Cc: Greg Kroah-Hartman Cc: Ivan Kokshaysky Cc: John Paul Adrian Glaubitz Cc: Kees Cook Cc: Masahiro Yamada Cc: Matt Turner Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Stephen Rothwell Cc: Thomas Bogendoerfer Cc: Tudor Ambarus Cc: Yoshinori Sato Cc: Zhihao Cheng Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 13 ------------- kernel/sched/sched.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d7a3c63a2171..835a40eac462 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13036,19 +13036,6 @@ next_cpu: return 0; } -#else /* CONFIG_FAIR_GROUP_SCHED */ - -void free_fair_sched_group(struct task_group *tg) { } - -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -void online_fair_sched_group(struct task_group *tg) { } - -void unregister_fair_sched_group(struct task_group *tg) { } - #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e5a95486a42..8f5df5250b8d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -436,10 +436,21 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) extern int tg_nop(struct task_group *tg, void *data); +#ifdef CONFIG_FAIR_GROUP_SCHED extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); +#else +static inline void
free_fair_sched_group(struct task_group *tg) { } +static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} +static inline void online_fair_sched_group(struct task_group *tg) { } +static inline void unregister_fair_sched_group(struct task_group *tg) { } +#endif + extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); -- cgit From 7acf164b259d9007264d9d8501da1023f140a3b4 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 15 Nov 2023 21:00:27 +0800 Subject: resource: add walk_system_ram_res_rev() This function, being a variant of walk_system_ram_res() introduced in commit 8c86e70acead ("resource: provide new functions to walk through resources"), walks through a list of all the resources of System RAM in reversed order, i.e., from higher to lower. It will be used in kexec_file code to load kernel, initrd etc when preparing kexec reboot. Link: https://lkml.kernel.org/r/ZVTA6z/06cLnWKUz@MiWiFi-R3L-srv Signed-off-by: AKASHI Takahiro Signed-off-by: Baoquan He Cc: Eric Biederman Signed-off-by: Andrew Morton --- include/linux/ioport.h | 3 +++ kernel/resource.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 14f5cfabbbc8..db7fe25f3370 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -331,6 +331,9 @@ extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int +walk_system_ram_res_rev(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)); +extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); diff --git a/kernel/resource.c b/kernel/resource.c index 866ef3663a0b..e8a244300e5b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include @@ -429,6 +431,61 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, func); } +/* + * This function, being a variant of walk_system_ram_res(), calls the @func + * callback against all memory ranges of type System RAM which are marked as + * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from + * higher to lower. + */ +int walk_system_ram_res_rev(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + struct resource res, *rams; + int rams_size = 16, i; + unsigned long flags; + int ret = -1; + + /* create a list */ + rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL); + if (!rams) + return ret; + + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + i = 0; + while ((start < end) && + (!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) { + if (i >= rams_size) { + /* re-alloc */ + struct resource *rams_new; + + rams_new = kvrealloc(rams, rams_size * sizeof(struct resource), + (rams_size + 16) * sizeof(struct resource), + GFP_KERNEL); + if (!rams_new) + goto out; + + rams = rams_new; + rams_size += 16; + } + + rams[i].start = res.start; + rams[i++].end = res.end; + + start = res.end + 1; + } + + /* go reverse */ + for (i--; i >= 0; i--) { + ret = (*func)(&rams[i], arg); + if (ret) + break; + } + +out: + kvfree(rams); + return ret; +} + /* * This function calls the @func callback against all memory ranges, which * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY. 
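A usage sketch for the new interface (illustrative only; the struct and callback names below are made up for the example and are not part of the patch, which only relies on the documented behavior that the walk runs from higher to lower addresses and stops when the callback returns nonzero):

	/*
	 * Illustrative only: find the highest-addressed System RAM range
	 * that can hold @size bytes.
	 */
	struct high_range_req {			/* hypothetical helper type */
		resource_size_t size;
		resource_size_t base;		/* result; 0 if no fit found */
	};

	static int pick_high_range(struct resource *res, void *arg)
	{
		struct high_range_req *req = arg;

		if (resource_size(res) < req->size)
			return 0;		/* too small, keep walking downward */

		req->base = res->end - req->size + 1;
		return 1;			/* nonzero return stops the walk */
	}

	/* then: walk_system_ram_res_rev(0, ULONG_MAX, &req, pick_high_range); */

Because the walker visits ranges strictly from higher to lower, the first range that fits is the topmost one, so the callback can terminate the walk immediately.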
-- cgit From b3ba234171cd0d58df0a13c262210ff8b5fd2830 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 14 Nov 2023 17:16:58 +0800 Subject: kexec_file: load kernel at top of system RAM if required Patch series "kexec_file: Load kernel at top of system RAM if required". Justification: ============== The kexec_load interface has been doing top-down searching and loading of the kernel/initrd/purgatory etc. to prepare for kexec reboot. Done that way, it avoids consuming and fragmenting the limited low memory that satisfies DMA buffer allocations and big chunks of contiguous memory during system init; and it avoids interfering with BIOS/FW reserved or occupied areas, or areas occupied by corner-case handling/workarounds/quirks, when doing system init. Note that the top-down searching and loading of the kexec-ed kernel is done in user-space utility code. For kexec_file loading, even if kexec_buf.top_down is 'true', it is simply ignored. walk_system_ram_res() is called directly to go through all resources of System RAM bottom-up to find an available memory region, and then locate_mem_hole_callback() is called to allocate memory in that region from top to bottom. This is not expected, and it is inconsistent with kexec_load. Implementation =============== In patch 1, introduce a new function, walk_system_ram_res_rev(), a variant of walk_system_ram_res() that walks through a list of all the resources of System RAM in reverse order, i.e., from higher to lower. In patch 2, check if kexec_buf.top_down is 'true' in kexec_walk_resources(); if so, call walk_system_ram_res_rev() to find a memory region of System RAM from top to bottom in which to load the kernel/initrd etc. Background information: ======================= I tried this in the past in a different way, please see the link below. In that post, I tried to adjust the struct sibling linking code, replacing the singly linked list with list_head so that walk_system_ram_res_rev() could be implemented in a much easier way. In the end I failed. https://lore.kernel.org/all/20180718024944.577-4-bhe@redhat.com/ This time, I picked up the patch from AKASHI Takahiro's old post and made some changes to it to form the current patch 1: https://lists.infradead.org/pipermail/linux-arm-kernel/2017-September/531456.html This patch (of 2): The kexec_load interface has been doing top-down searching and loading of the kernel/initrd/purgatory etc. to prepare for kexec reboot. Done that way, it avoids consuming and fragmenting the limited low memory that satisfies DMA buffer allocations and big chunks of contiguous memory during system init; and it avoids interfering with BIOS/FW reserved or occupied areas, or areas occupied by corner-case handling/workarounds/quirks, when doing system init. Note that the top-down searching and loading of the kexec-ed kernel is done in user-space utility code. For kexec_file loading, even if kexec_buf.top_down is 'true', it is simply ignored. walk_system_ram_res() is called directly to go through all resources of System RAM bottom-up to find an available memory region, and then locate_mem_hole_callback() is called to allocate memory in that region from top to bottom. This is not expected, and it is inconsistent with kexec_load. Here, check if kexec_buf.top_down is 'true' in kexec_walk_resources(); if so, call the newly added walk_system_ram_res_rev() to find a memory region of System RAM from top to bottom in which to load the kernel/initrd etc.
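For orientation, the resulting dispatch is small enough to summarize. A condensed sketch of kexec_walk_resources() after this patch (the crash-kernel branch is abbreviated from the surrounding kernel code for context; the diff below is the authoritative two-line change):

	static int kexec_walk_resources(struct kexec_buf *kbuf,
					int (*func)(struct resource *, void *))
	{
		if (kbuf->image->type == KEXEC_TYPE_CRASH)
			/* crash kernels stay inside the reserved crashk_res window */
			return walk_iomem_res_desc(crashk_res.desc,
						   IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
						   crashk_res.start, crashk_res.end,
						   kbuf, func);
		else if (kbuf->top_down)
			/* new: honor top_down by walking System RAM high to low */
			return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
		else
			return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
	}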
Link: https://lkml.kernel.org/r/20231114091658.228030-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20231114091658.228030-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: AKASHI Takahiro Cc: Baoquan He Cc: Eric Biederman Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f9a419cd22d4..ba3ef30921b8 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -592,6 +592,8 @@ static int kexec_walk_resources(struct kexec_buf *kbuf, IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, crashk_res.start, crashk_res.end, kbuf, func); + else if (kbuf->top_down) + return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func); else return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } -- cgit From 9d02330abd3ecdacc43fc6b16a5668e7e94b7562 Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 23 Nov 2023 16:40:22 +0800 Subject: softlockup: serialized softlockup's log If multiple CPUs trigger softlockup at the same time with 'softlockup_all_cpu_backtrace=0', the softlockup logs will be interleaved in dmesg, which makes them hard for developers to read. Since the code path for outputting softlockup logs is not a kernel hotspot and its performance requirements are not strict, a lock is used to serialize the softlockup log output and improve the readability of the logs. Link: https://lkml.kernel.org/r/20231123084022.10302-1-lizhe.67@bytedance.com Signed-off-by: Li Zhe Reviewed-by: Petr Mladek Reviewed-by: Douglas Anderson Cc: Lecopzer Chen Cc: Pingfan Liu Cc: Zefan Li Cc: John Ogness Signed-off-by: Andrew Morton --- kernel/watchdog.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5cd6d4e26915..bf30a6fac665 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -448,6 +448,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; + static DEFINE_SPINLOCK(watchdog_output_lock); if (!watchdog_enabled) return HRTIMER_NORESTART; @@ -514,6 +515,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* Start period for the next softlockup warning. */ update_report_ts(); + spin_lock(&watchdog_output_lock); pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -523,6 +525,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) show_regs(regs); else dump_stack(); + spin_unlock(&watchdog_output_lock); if (softlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(smp_processor_id()); -- cgit From 18966f7b9458d3b19412fe9dfb421ab59401bfe1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Oct 2023 09:45:54 -0700 Subject: rcu-tasks: Mark RCU Tasks accesses to current->rcu_tasks_idle_cpu The task_struct structure's ->rcu_tasks_idle_cpu can be concurrently read and written from the RCU Tasks grace-period kthread and from the CPU on which the task_struct structure's task is running. This commit therefore marks the accesses appropriately. Reported-by: Boqun Feng Signed-off-by: Paul E.
McKenney Reviewed-by: Joel Fernandes (Google) Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tasks.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index f54d5782eca0..732ad5b39946 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -975,7 +975,7 @@ static void check_holdout_task(struct task_struct *t, t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || !rcu_tasks_is_holdout(t) || (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + !is_idle_task(t) && READ_ONCE(t->rcu_tasks_idle_cpu) >= 0)) { WRITE_ONCE(t->rcu_tasks_holdout, false); list_del_init(&t->rcu_tasks_holdout_list); put_task_struct(t); @@ -993,7 +993,7 @@ static void check_holdout_task(struct task_struct *t, t, ".I"[is_idle_task(t)], "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); + data_race(t->rcu_tasks_idle_cpu), cpu); sched_show_task(t); } -- cgit From 4e58aaeebb3c27993c734c99eae6881b196b1ddb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Nov 2023 18:28:38 -0700 Subject: rcu: Restrict access to RCU CPU stall notifiers The RCU CPU stall notifiers can be useful for dumping state when tracking down delicate forward-progress bugs, where NUMA effects cause cache lines to be delivered to a given CPU regularly but always in a state that prevents that CPU from making forward progress. These bugs can be detected by the RCU CPU stall-warning mechanism, but in some cases, the stall-warning printk()s disrupt the forward-progress bug before any useful state can be obtained. Unfortunately, the notifier mechanism added by commit 5b404fdabacf ("rcu: Add RCU CPU stall notifier") can make matters worse if used at all carelessly. For example, if the stall warning was caused by a lock not being released, then any attempt to acquire that lock in the notifier will hang. This will prevent not only the notifier from producing any useful output, but it will also prevent the stall-warning message from ever appearing. This commit therefore hides this new RCU CPU stall notifier mechanism under a new RCU_CPU_STALL_NOTIFIER Kconfig option that depends on both DEBUG_KERNEL and RCU_EXPERT. In addition, the rcupdate.rcu_cpu_stall_notifiers=1 kernel boot parameter must also be specified. The RCU_CPU_STALL_NOTIFIER Kconfig option's help text contains a warning and explains the dangers of careless use, recommending lockless notifier code. In addition, a WARN() is triggered each time that an attempt is made to register a stall-warning notifier in kernels built with CONFIG_RCU_CPU_STALL_NOTIFIER=y. This combination of measures will keep use of this mechanism confined to debug kernels and away from routine deployments. [ paulmck: Apply Dan Carpenter feedback. ] Fixes: 5b404fdabacf ("rcu: Add RCU CPU stall notifier") Reported-by: Linus Torvalds Signed-off-by: Paul E.
McKenney Reviewed-by: Joel Fernandes (Google) Signed-off-by: Neeraj Upadhyay (AMD) --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ include/linux/rcu_notifier.h | 6 +++--- kernel/rcu/Kconfig.debug | 25 +++++++++++++++++++++++++ kernel/rcu/rcu.h | 8 +++++--- kernel/rcu/rcutorture.c | 12 +++++++----- kernel/rcu/tree_stall.h | 11 ++++++++++- kernel/rcu/update.c | 6 ++++++ 7 files changed, 62 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 65731b060e3f..b72e2049c487 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5302,6 +5302,12 @@ Dump ftrace buffer after reporting RCU CPU stall warning. + rcupdate.rcu_cpu_stall_notifiers= [KNL] + Provide RCU CPU stall notifiers, but see the + warnings in the RCU_CPU_STALL_NOTIFIER Kconfig + option's help text. TL;DR: You almost certainly + do not want rcupdate.rcu_cpu_stall_notifiers. + rcupdate.rcu_cpu_stall_suppress= [KNL] Suppress RCU CPU stall warning messages. diff --git a/include/linux/rcu_notifier.h b/include/linux/rcu_notifier.h index ebf371364581..5640f024773b 100644 --- a/include/linux/rcu_notifier.h +++ b/include/linux/rcu_notifier.h @@ -13,7 +13,7 @@ #define RCU_STALL_NOTIFY_NORM 1 #define RCU_STALL_NOTIFY_EXP 2 -#ifdef CONFIG_RCU_STALL_COMMON +#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) #include #include @@ -21,12 +21,12 @@ int rcu_stall_chain_notifier_register(struct notifier_block *n); int rcu_stall_chain_notifier_unregister(struct notifier_block *n); -#else // #ifdef CONFIG_RCU_STALL_COMMON +#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) // No RCU CPU stall warnings in Tiny RCU. static inline int rcu_stall_chain_notifier_register(struct notifier_block *n) { return -EEXIST; } static inline int rcu_stall_chain_notifier_unregister(struct notifier_block *n) { return -ENOENT; } -#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON +#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) #endif /* __LINUX_RCU_NOTIFIER_H */ diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 2984de629f74..9b0b52e1836f 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -105,6 +105,31 @@ config RCU_CPU_STALL_CPUTIME The boot option rcupdate.rcu_cpu_stall_cputime has the same function as this one, but will override this if it exists. +config RCU_CPU_STALL_NOTIFIER + bool "Provide RCU CPU-stall notifiers" + depends on RCU_STALL_COMMON + depends on DEBUG_KERNEL + depends on RCU_EXPERT + default n + help + WARNING: You almost certainly do not want this!!! + + Enable RCU CPU-stall notifiers, which are invoked just before + printing the RCU CPU stall warning. As such, bugs in notifier + callbacks can prevent stall warnings from being printed. + And the whole reason that a stall warning is being printed is + that something is hung up somewhere. Therefore, the notifier + callbacks must be written extremely carefully, preferably + containing only lockless code. After all, it is quite possible + that the whole reason that the RCU CPU stall is happening in + the first place is that someone forgot to release whatever lock + that you are thinking of acquiring. In which case, having your + notifier callback acquire that lock will hang, preventing the + RCU CPU stall warning from appearing. 
+ + Say Y here if you want RCU CPU stall notifiers (you don't want them) + Say N if you are unsure. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index b531c33e9545..f94f65877f2b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -262,6 +262,8 @@ static inline bool rcu_stall_is_suppressed_at_boot(void) return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended(); } +extern int rcu_cpu_stall_notifiers; + #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; @@ -659,10 +661,10 @@ static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } bool rcu_cpu_beenfullyonline(int cpu); #endif -#ifdef CONFIG_RCU_STALL_COMMON +#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) int rcu_stall_notifier_call_chain(unsigned long val, void *v); -#else // #ifdef CONFIG_RCU_STALL_COMMON +#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; } -#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON +#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 30fc9d34e329..07a6a183c555 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2450,10 +2450,12 @@ static int rcu_torture_stall(void *args) unsigned long stop_at; VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); - ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); - if (ret) - pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", - __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); + if (rcu_cpu_stall_notifiers) { + ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", + __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); + } if (stall_cpu_holdoff > 0) { VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); @@ -2497,7 +2499,7 @@ static int rcu_torture_stall(void *args) cur_ops->readunlock(idx); } pr_alert("%s end.\n", __func__); - if (!ret) { + if (rcu_cpu_stall_notifiers && !ret) { ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block); if (ret) pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index ac8e86babe44..5d666428546b 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -1061,6 +1061,7 @@ static int __init rcu_sysrq_init(void) } early_initcall(rcu_sysrq_init); +#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER ////////////////////////////////////////////////////////////////////////////// // @@ -1081,7 +1082,13 @@ static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list); */ int rcu_stall_chain_notifier_register(struct notifier_block *n) { - return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n); + int rcsn = rcu_cpu_stall_notifiers; + + WARN(1, "Adding %pS() to RCU stall notifier list (%s).\n", n->notifier_call, + rcsn ? 
"possibly suppressing RCU CPU stall warnings" : "failed, so all is well"); + if (rcsn) + return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n); + return -EEXIST; } EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register); @@ -1115,3 +1122,5 @@ int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v); } + +#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c534d6806d3d..46aaaa9fe339 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -538,9 +538,15 @@ long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) EXPORT_SYMBOL_GPL(torture_sched_setaffinity); #endif +int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful) +EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers); + #ifdef CONFIG_RCU_STALL_COMMON int rcu_cpu_stall_ftrace_dump __read_mostly; module_param(rcu_cpu_stall_ftrace_dump, int, 0644); +#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER +module_param(rcu_cpu_stall_notifiers, int, 0444); +#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); -- cgit From a1ca8295ee53a2fc57085fae26df37228c655791 Mon Sep 17 00:00:00 2001 From: Wang chaodong Date: Fri, 20 Oct 2023 16:51:06 +0800 Subject: PM: hibernate: Drop unnecessary local variable initialization It is not necessary to intialize the error variable in create_basic_memory_bitmaps(), because it is only read after being assigned a value. Signed-off-by: Wang chaodong [ rjw: Subject and changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 50a15408c3fc..71b2f12ed3b5 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1119,7 +1119,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm) int create_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - int error = 0; + int error; if (forbidden_pages_map && free_pages_map) return 0; -- cgit From bbeaa4691fa8682e2fe2e87f28d5fce39805fa68 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Fri, 27 Oct 2023 09:55:33 +0800 Subject: PM: hibernate: Do not initialize error in swap_write_page() 'error' first receives the function result before it is used, and it does not need to be assigned a value during definition. Signed-off-by: Li zeming [ rjw: Subject rewrite ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a2cb0babb5ec..68973ca2cf07 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -451,7 +451,7 @@ err_close: static int swap_write_page(struct swap_map_handle *handle, void *buf, struct hib_bio_batch *hb) { - int error = 0; + int error; sector_t offset; if (!handle->cur) -- cgit From 4ac934b1aaa99e00ca25875d55094a4fe34e212d Mon Sep 17 00:00:00 2001 From: Li zeming Date: Tue, 24 Oct 2023 10:04:34 +0800 Subject: PM: hibernate: Do not initialize error in snapshot_write_next() The error variable in snapshot_write_next() gets a value before it is used, so don't initialize it to 0 upfront. Signed-off-by: Li zeming [ rjw: Subject and changelog rewrite ] Signed-off-by: Rafael J. 
Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 71b2f12ed3b5..e3e8f1c6e75f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2778,7 +2778,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; - int error = 0; + int error; next: /* Check if we have already loaded the entire image */ -- cgit From 20eb4142397cf3ec221de43f10ea149af462c572 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Oct 2023 01:29:01 +0200 Subject: srcu: Remove superfluous callbacks advancing from srcu_gp_start() Callbacks advancing on SRCU must be performed in two specific places: 1) At enqueue time, in order to make room for the acceleration of the new callback. 2) At invocation time, in order to move the callbacks that are ready to invoke. Any other callback-advancing callsite is needless. Remove the remaining one in srcu_gp_start(). Co-developed-by: Yong He Signed-off-by: Yong He Co-developed-by: Joel Fernandes Signed-off-by: Joel Fernandes Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Co-developed-by: Neeraj Upadhyay (AMD) Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/srcutree.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 560e99ec5333..e9356a103626 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -772,20 +772,10 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); */ static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp; int state; - if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) - sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); - else - sdp = this_cpu_ptr(ssp->sda); lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)); - spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ - rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); - WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); - spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies); WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ -- cgit From 94c55b9e21979daa88e190bf971c47432a818ebe Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Oct 2023 01:29:02 +0200 Subject: srcu: No need to advance/accelerate if no callback enqueued During grace-period start, if no callback is to be enqueued, there is nothing to accelerate and therefore no need to advance the callbacks either. Spare these needless operations in this case. Signed-off-by: Frederic Weisbecker Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/srcutree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e9356a103626..2bfc8ed1eed2 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1261,9 +1261,11 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, * period (gp_num = X + 8). So acceleration fails.
*/ s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); - rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); - WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s) && rhp); + if (rhp) { + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s)); + } if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; needgp = true; -- cgit From c21357e4461f3f9c8ff93302906b5372411ee108 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Oct 2023 01:29:03 +0200 Subject: srcu: Explain why callbacks invocations can't run concurrently If an SRCU barrier is queued while callbacks are running and a new callbacks invocator for the same sdp were to run concurrently, the RCU barrier might execute too early. As this requirement is non-obvious, make sure to keep a record. Signed-off-by: Frederic Weisbecker Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/srcutree.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 2bfc8ed1eed2..0351a4e83529 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1715,6 +1715,11 @@ static void srcu_invoke_callbacks(struct work_struct *work) WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + /* + * Although this function is theoretically re-entrant, concurrent + * callbacks invocation is disallowed to avoid executing an SRCU barrier + * too early. + */ if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { spin_unlock_irq_rcu_node(sdp); @@ -1745,6 +1750,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp); + /* An SRCU barrier or callbacks from previous nesting work pending */ if (more) srcu_schedule_cbs_sdp(sdp, 0); } -- cgit From 1e68485d8299860e68c4e1d29589ff0d20db0287 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Dec 2023 15:39:19 -0800 Subject: bpf: log PTR_TO_MEM memory size in verifier log Emit valid memory size addressable through PTR_TO_MEM register. Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231204233931.49758-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 55d019f30e91..61d7d23a0118 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -682,6 +682,10 @@ static void print_reg_state(struct bpf_verifier_env *env, verbose_a("r="); verbose_unum(env, reg->range); } + if (base_type(t) == PTR_TO_MEM) { + verbose_a("sz="); + verbose_unum(env, reg->mem_size); + } if (tnum_is_const(reg->var_off)) { /* a pointer register with fixed offset */ if (reg->var_off.value) { -- cgit From 22b769bb4f87060774bfdd6facbab438ed3b8453 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Dec 2023 15:39:20 -0800 Subject: bpf: emit more dynptr information in verifier log Emit dynptr type for CONST_PTR_TO_DYNPTR register. Also emit id, ref_obj_id, and dynptr_id fields for STACK_DYNPTR stack slots. 
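The attributes in these log lines are comma-separated via the verbose_a() helper visible in the diff below. The idiom generalizes beyond the verifier; a self-contained sketch of the same pattern (plain C using GNU statement expressions, as the kernel macro does; the emit_a name and the values are made up for illustration):

	#include <stdio.h>

	/* Same idiom as verbose_a() in kernel/bpf/log.c: the first use
	 * prints no separator, every later use prefixes a comma. */
	#define emit_a(fmt, ...) \
		({ printf("%s" fmt, sep, ##__VA_ARGS__); sep = ","; })

	int main(void)
	{
		const char *sep = "";

		printf("fp-8=dynptr_ringbuf(");
		emit_a("id=%d", 1);
		emit_a("ref_id=%d", 2);
		emit_a("dynptr_id=%d", 3);
		printf(")\n");	/* fp-8=dynptr_ringbuf(id=1,ref_id=2,dynptr_id=3) */
		return 0;
	}

The trick is resetting sep to "" before each attribute group, which the patch below does per stack slot.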
Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231204233931.49758-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/log.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 61d7d23a0118..594a234f122b 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -628,6 +628,12 @@ static bool type_is_map_ptr(enum bpf_reg_type t) { } } +/* + * _a stands for append, was shortened to avoid multiline statements below. + * This macro is used to output a comma separated list of attributes. + */ +#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; }) + static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, const struct bpf_reg_state *reg) @@ -643,11 +649,6 @@ static void print_reg_state(struct bpf_verifier_env *env, verbose_snum(env, reg->var_off.value + reg->off); return; } -/* - * _a stands for append, was shortened to avoid multiline statements below. - * This macro is used to output a comma separated list of attributes. - */ -#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; }) verbose(env, "%s", reg_type_str(env, t)); if (t == PTR_TO_STACK) { @@ -686,6 +687,8 @@ static void print_reg_state(struct bpf_verifier_env *env, verbose_a("sz="); verbose_unum(env, reg->mem_size); } + if (t == CONST_PTR_TO_DYNPTR) + verbose_a("type=%s", dynptr_type_str(reg->dynptr.type)); if (tnum_is_const(reg->var_off)) { /* a pointer register with fixed offset */ if (reg->var_off.value) { @@ -702,8 +705,6 @@ static void print_reg_state(struct bpf_verifier_env *env, } } verbose(env, ")"); - -#undef verbose_a } void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, @@ -727,6 +728,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { char types_buf[BPF_REG_SIZE + 1]; + const char *sep = ""; bool valid = false; u8 slot_type; int j; @@ -765,9 +767,14 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, reg->live); - verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type)); + verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); + if (reg->id) + verbose_a("id=%d", reg->id); if (reg->ref_obj_id) - verbose(env, "(ref_id=%d)", reg->ref_obj_id); + verbose_a("ref_id=%d", reg->ref_obj_id); + if (reg->dynptr_id) + verbose_a("dynptr_id=%d", reg->dynptr_id); + verbose(env, ")"); break; case STACK_ITER: /* only main slot has ref_obj_id set; skip others */ -- cgit From 1a1ad782dcbbacd9e8d4e2e7ff1bf14d1db80727 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Dec 2023 15:39:21 -0800 Subject: bpf: tidy up exception callback management a bit Use the fact that we are passing subprog index around and have a corresponding struct bpf_subprog_info in bpf_verifier_env for each subprogram. We don't need to separately pass around a flag for whether a subprog is an exception callback or not; each relevant verifier function can determine this using the provided subprog index if we maintain bpf_subprog_info properly. Also move out exception callback-specific logic from btf_prepare_func_args(), keeping it generic. We can enforce all these restrictions right before the exception callback verification pass.
We add an out parameter, arg_cnt, for now, but it will become unnecessary with subsequent refactoring and will be removed. Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231204233931.49758-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/btf.c | 11 ++--------- kernel/bpf/verifier.c | 52 ++++++++++++++++++++++++++++++++++++++------------- 3 files changed, 42 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c1a06263a4f3..0bd4889e917a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2494,7 +2494,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg, bool is_ex_cb); + struct bpf_reg_state *reg, u32 *nargs); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 63cf4128fc05..d56433bf8aba 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6956,7 +6956,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, * (either PTR_TO_CTX or SCALAR_VALUE). */ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs, bool is_ex_cb) + struct bpf_reg_state *regs, u32 *arg_cnt) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; @@ -7013,6 +7013,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } + *arg_cnt = nargs; /* check that function returns int, exception cb also requires this */ t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) @@ -7062,14 +7063,6 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, i, btf_type_str(t), tname); return -EINVAL; } - /* We have already ensured that the callback returns an integer, just - * like all global subprogs. We need to determine it only has a single - * scalar argument.
- */ - if (is_ex_cb && (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE)) { - bpf_log(log, "exception cb only supports single integer argument\n"); - return -EINVAL; - } return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 727a59e4a647..d1755db1b503 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -442,6 +442,25 @@ static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env, return &env->prog->aux->func_info_aux[subprog]; } +static struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) +{ + return &env->subprog_info[subprog]; +} + +static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog) +{ + struct bpf_subprog_info *info = subprog_info(env, subprog); + + info->is_cb = true; + info->is_async_cb = true; + info->is_exception_cb = true; +} + +static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog) +{ + return subprog_info(env, subprog)->is_exception_cb; +} + static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); @@ -2892,6 +2911,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) if (env->subprog_info[i].start != ex_cb_insn) continue; env->exception_callback_subprog = i; + mark_subprog_exc_cb(env, i); break; } } @@ -19166,9 +19186,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) env->exception_callback_subprog = env->subprog_cnt - 1; /* Don't update insn_cnt, as add_hidden_subprog always appends insns */ - env->subprog_info[env->exception_callback_subprog].is_cb = true; - env->subprog_info[env->exception_callback_subprog].is_async_cb = true; - env->subprog_info[env->exception_callback_subprog].is_exception_cb = true; + mark_subprog_exc_cb(env, env->exception_callback_subprog); } for (i = 0; i < insn_cnt; i++, insn++) { @@ -19868,7 +19886,7 @@ static void free_states(struct bpf_verifier_env *env) } } -static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex_cb) +static int do_check_common(struct bpf_verifier_env *env, int subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state; @@ -19899,9 +19917,23 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { - ret = btf_prepare_func_args(env, subprog, regs, is_ex_cb); + u32 nargs; + + ret = btf_prepare_func_args(env, subprog, regs, &nargs); if (ret) goto out; + if (subprog_is_exc_cb(env, subprog)) { + state->frame[0]->in_exception_callback_fn = true; + /* We have already ensured that the callback returns an integer, just + * like all global subprogs. We need to determine it only has a single + * scalar argument. 
+ */ + if (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE) { + verbose(env, "exception cb only supports single integer argument\n"); + ret = -EINVAL; + goto out; + } + } for (i = BPF_REG_1; i <= BPF_REG_5; i++) { if (regs[i].type == PTR_TO_CTX) mark_reg_known_zero(env, regs, i); @@ -19915,12 +19947,6 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex regs[i].id = ++env->id_gen; } } - if (is_ex_cb) { - state->frame[0]->in_exception_callback_fn = true; - env->subprog_info[subprog].is_cb = true; - env->subprog_info[subprog].is_async_cb = true; - env->subprog_info[subprog].is_exception_cb = true; - } } else { /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; @@ -20000,7 +20026,7 @@ again: env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); - ret = do_check_common(env, i, env->exception_callback_subprog == i); + ret = do_check_common(env, i); if (ret) { return ret; } else if (env->log.level & BPF_LOG_LEVEL) { @@ -20030,7 +20056,7 @@ static int do_check_main(struct bpf_verifier_env *env) int ret; env->insn_idx = 0; - ret = do_check_common(env, 0, false); + ret = do_check_common(env, 0); if (!ret) env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return ret; -- cgit From 8f23f5dba6b4693448144bde4dd6f537543442c2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Oct 2023 08:05:20 +0800 Subject: iommu: Change kconfig around IOMMU_SVA Linus suggested that the kconfig here is confusing: https://lore.kernel.org/all/CAHk-=wgUiAtiszwseM1p2fCJ+sC4XWQ+YN4TanFhUgvUqjr9Xw@mail.gmail.com/ Let's break it into three kconfigs controlling distinct things: - CONFIG_IOMMU_MM_DATA controls if the mm_struct has the additional fields for the IOMMU. Currently only PASID, but later patches store a struct iommu_mm_data * - CONFIG_ARCH_HAS_CPU_PASID controls if the arch needs the scheduling bit for keeping track of the ENQCMD instruction. 
x86 will select this if IOMMU_SVA is enabled - IOMMU_SVA controls if the IOMMU core compiles in the SVA support code for iommu driver use and the IOMMU exported API This way ARM will not enable CONFIG_ARCH_HAS_CPU_PASID Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-2-tina.zhang@intel.com Signed-off-by: Joerg Roedel --- arch/Kconfig | 5 +++++ arch/x86/Kconfig | 1 + arch/x86/kernel/traps.c | 2 +- drivers/iommu/Kconfig | 1 + include/linux/iommu.h | 2 +- include/linux/mm_types.h | 2 +- include/linux/sched.h | 2 +- kernel/fork.c | 2 +- mm/Kconfig | 3 +++ mm/init-mm.c | 2 +- 10 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index f4b210ab0612..3e49f862670e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -301,6 +301,11 @@ config ARCH_HAS_DMA_CLEAR_UNCACHED config ARCH_HAS_CPU_FINALIZE_INIT bool +# The architecture has a per-task state that includes the mm's PASID +config ARCH_HAS_CPU_PASID + bool + select IOMMU_MM_DATA + # Select if arch init_task must go in the __init_task_data section config ARCH_TASK_STRUCT_ON_STACK bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3762f41bb092..68a2ec36a46e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -71,6 +71,7 @@ config X86 select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION select ARCH_HAS_CPU_FINALIZE_INIT + select ARCH_HAS_CPU_PASID if IOMMU_SVA select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c876f1d36a81..2b62dbb3396a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -565,7 +565,7 @@ static bool fixup_iopl_exception(struct pt_regs *regs) */ static bool try_fixup_enqcmd_gp(void) { -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID u32 pasid; /* diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 7673bb82945b..9a29d742617e 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -160,6 +160,7 @@ config IOMMU_DMA # Shared Virtual Addressing config IOMMU_SVA + select IOMMU_MM_DATA bool config FSL_PAMU diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c7394b39599c..cd3f398095bf 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1337,7 +1337,7 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream return false; } -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_IOMMU_MM_DATA static inline void mm_pasid_init(struct mm_struct *mm) { mm->pasid = IOMMU_PASID_INVALID; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2..41f248608dd9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -938,7 +938,7 @@ struct mm_struct { #endif struct work_struct async_put_work; -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_IOMMU_MM_DATA u32 pasid; #endif #ifdef CONFIG_KSM diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c31697248..70888a36677b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -954,7 +954,7 @@ struct task_struct { /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif #ifdef CONFIG_CPU_SUP_INTEL diff --git a/kernel/fork.c b/kernel/fork.c index 10917c3e1f03..43fd9bc1a522 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1179,7 +1179,7 @@ static struct task_struct *dup_task_struct(struct 
task_struct *orig, int node) tsk->use_memdelay = 0; #endif -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID tsk->pasid_activated = 0; #endif diff --git a/mm/Kconfig b/mm/Kconfig index 89971a894b60..0143f4d905c9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1270,6 +1270,9 @@ config LOCK_MM_AND_FIND_VMA bool depends on !STACK_GROWSUP +config IOMMU_MM_DATA + bool + source "mm/damon/Kconfig" endmenu diff --git a/mm/init-mm.c b/mm/init-mm.c index cfd367822cdd..c52dc2740a3d 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -44,7 +44,7 @@ struct mm_struct init_mm = { #endif .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_IOMMU_MM_DATA .pasid = IOMMU_PASID_INVALID, #endif INIT_MM_CONTEXT(init_mm) -- cgit From 4e94ddfe2aab72139acb8d5372fac9e6c3f3e383 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:11 +0100 Subject: file: remove __receive_fd() Honestly, there's little value in having a helper with and without that int __user *ufd argument. It's just messy and doesn't really give us anything. Just expose receive_fd() with that argument and get rid of that helper. Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-5-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- drivers/vdpa/vdpa_user/vduse_dev.c | 2 +- fs/file.c | 11 +++-------- include/linux/file.h | 5 +---- include/net/scm.h | 2 +- kernel/pid.c | 2 +- kernel/seccomp.c | 2 +- 6 files changed, 8 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 0ddd4b8abecb..fafd4610b185 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1157,7 +1157,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, fput(f); break; } - ret = receive_fd(f, perm_to_file_flags(entry.perm)); + ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm)); fput(f); break; } diff --git a/fs/file.c b/fs/file.c index c8eaa0b29a08..3b683b9101d8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1296,7 +1296,7 @@ out_unlock: } /** - * __receive_fd() - Install received file into file descriptor table + * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry @@ -1310,7 +1310,7 @@ out_unlock: * * Returns newly install fd or -ve on error. 
*/ -int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; @@ -1335,6 +1335,7 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) __receive_sock(file); return new_fd; } +EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { @@ -1350,12 +1351,6 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) return new_fd; } -int receive_fd(struct file *file, unsigned int o_flags) -{ - return __receive_fd(file, NULL, o_flags); -} -EXPORT_SYMBOL_GPL(receive_fd); - static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; diff --git a/include/linux/file.h b/include/linux/file.h index c0d5219c2852..6834a29338c4 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -96,10 +96,7 @@ DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), extern void fd_install(unsigned int fd, struct file *file); -extern int __receive_fd(struct file *file, int __user *ufd, - unsigned int o_flags); - -extern int receive_fd(struct file *file, unsigned int o_flags); +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); diff --git a/include/net/scm.h b/include/net/scm.h index 8aae2468bae0..cf68acec4d70 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -214,7 +214,7 @@ static inline int scm_recv_one_fd(struct file *f, int __user *ufd, { if (!ufd) return -EFAULT; - return __receive_fd(f, ufd, flags); + return receive_fd(f, ufd, flags); } #endif /* __LINUX_NET_SCM_H */ diff --git a/kernel/pid.c b/kernel/pid.c index 6500ef956f2f..b52b10865454 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -700,7 +700,7 @@ static int pidfd_getfd(struct pid *pid, int fd) if (IS_ERR(file)) return PTR_ERR(file); - ret = receive_fd(file, O_CLOEXEC); + ret = receive_fd(file, NULL, O_CLOEXEC); fput(file); return ret; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 255999ba9190..aca7b437882e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1072,7 +1072,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn */ list_del_init(&addfd->list); if (!addfd->setfd) - fd = receive_fd(addfd->file, addfd->flags); + fd = receive_fd(addfd->file, NULL, addfd->flags); else fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); addfd->ret = fd; -- cgit From 56c26d5ad86dfe48a76855a91b523ab4f372c003 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 12 Dec 2023 08:54:36 +0800 Subject: bpf: Remove unused backtrack_state helper functions The functions are defined in the verifier.c file but are not called elsewhere, so delete these unused functions.
kernel/bpf/verifier.c:3448:20: warning: unused function 'bt_set_slot' kernel/bpf/verifier.c:3453:20: warning: unused function 'bt_clear_slot' kernel/bpf/verifier.c:3488:20: warning: unused function 'bt_is_slot_set' Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20231212005436.103829-1-yang.lee@linux.alibaba.com Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7714 --- kernel/bpf/verifier.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1755db1b503..ef27820a24e3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3465,16 +3465,6 @@ static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u3 bt->stack_masks[frame] &= ~(1ull << slot); } -static inline void bt_set_slot(struct backtrack_state *bt, u32 slot) -{ - bt_set_frame_slot(bt, bt->frame, slot); -} - -static inline void bt_clear_slot(struct backtrack_state *bt, u32 slot) -{ - bt_clear_frame_slot(bt, bt->frame, slot); -} - static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame) { return bt->reg_masks[frame]; @@ -3505,11 +3495,6 @@ static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u return bt->stack_masks[frame] & (1ull << slot); } -static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot) -{ - return bt_is_frame_slot_set(bt, bt->frame, slot); -} - /* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */ static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) { -- cgit From 745e0311306507ddbe1727ac798c8f956812b810 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Sun, 10 Dec 2023 17:51:50 -0500 Subject: bpf: Comment on check_mem_size_reg This patch adds a comment to check_mem_size_reg -- a function whose meaning is not very transparent. The function implicitly deals with two registers connected by convention, which is not obvious. Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231210225149.67639-1-andreimatei1@gmail.com --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ef27820a24e3..1863826a4ac3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7256,6 +7256,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +/* verify arguments to helpers or kfuncs consisting of a pointer and an access + * size. + * + * @regno is the register containing the access size. regno-1 is the register + * containing the pointer. + */ static int check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, bool zero_size_allowed, -- cgit From b3ae7b67b87fed771fa5bf95389df06b0433603e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Dec 2023 11:16:17 -0500 Subject: ring-buffer: Fix writing to the buffer with max_data_size The maximum ring buffer data size is the maximum size of data that can be recorded on the ring buffer. Events must be smaller than the sub buffer data size minus any meta data. This size is checked before trying to allocate from the ring buffer because the allocation assumes that the size will fit on the sub buffer. The maximum size was calculated as the size of a sub buffer page (which is currently PAGE_SIZE minus the sub buffer header) minus the size of the meta data of an individual event. 
But it missed the possible adding of a time stamp for events that are added long enough apart that the event meta data can't hold the time delta. When an event is added that is greater than the current BUF_MAX_DATA_SIZE minus the size of a time stamp, but still less than or equal to BUF_MAX_DATA_SIZE, the ring buffer would go into an infinite loop, looking for a page that can hold the event. Luckily, there's a check for this loop, and after 1000 iterations a warning is emitted and the ring buffer is disabled. But this should never happen. This can happen when a large event is added first, or after a long period where an absolute timestamp is prefixed to the event, increasing its size by 8 bytes. This passes the check and then goes into the algorithm that causes the infinite loop. For events that are the first event on the sub-buffer, there is no need to add a timestamp, because the sub-buffer itself contains an absolute timestamp, and adding one is redundant. The fix is to check if the event is to be the first event on the sub-buffer, and if it is, then do not add a timestamp. This also fixes 32 bit adding a timestamp when a read of before_stamp or write_stamp is interrupted. There's still no need to add that timestamp if the event is going to be the first event on the sub buffer. Also, if the buffer has "time_stamp_abs" set, then also check if the length plus the timestamp is greater than the BUF_MAX_DATA_SIZE. Link: https://lore.kernel.org/all/20231212104549.58863438@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231212071837.5fdd6c13@gandalf.local.home Link: https://lore.kernel.org/linux-trace-kernel/20231212111617.39e02849@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland Cc: Mathieu Desnoyers Fixes: a4543a2fa9ef3 ("ring-buffer: Get timestamp after event is allocated") Fixes: 58fbc3c63275c ("ring-buffer: Consolidate add_timestamp to remove some branches") Reported-by: Kent Overstreet # (on IRC) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8d2a4f00eca9..b8986f82eccf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3579,7 +3579,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * absolute timestamp. * Don't bother if this is the start of a new page (w == 0).
*/ - if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) { + if (!w) { + /* Use the sub-buffer timestamp */ + info->delta = 0; + } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) { info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } else { @@ -3737,6 +3740,8 @@ rb_reserve_next_event(struct trace_buffer *buffer, if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { add_ts_default = RB_ADD_STAMP_ABSOLUTE; info.length += RB_LEN_TIME_EXTEND; + if (info.length > BUF_MAX_DATA_SIZE) + goto out_fail; } else { add_ts_default = RB_ADD_STAMP_NONE; } -- cgit From b55b0a0d7c4aa2dac3579aa7e6802d1f57445096 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 9 Dec 2023 17:10:58 -0500 Subject: tracing: Have large events show up as '[LINE TOO BIG]' instead of nothing If a large event was added to the ring buffer that is larger than what the trace_seq can handle, it just drops the output: ~# cat /sys/kernel/tracing/trace # tracer: nop # # entries-in-buffer/entries-written: 2/2 #P:8 # # _-----=> irqs-off/BH-disabled # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> migrate-disable # |||| / delay # TASK-PID CPU# ||||| TIMESTAMP FUNCTION # | | | ||||| | | <...>-859 [001] ..... 141.118951: tracing_mark_write <...>-859 [001] ..... 141.148201: tracing_mark_write: 78901234 Instead, catch this case and add some context: ~# cat /sys/kernel/tracing/trace # tracer: nop # # entries-in-buffer/entries-written: 2/2 #P:8 # # _-----=> irqs-off/BH-disabled # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> migrate-disable # |||| / delay # TASK-PID CPU# ||||| TIMESTAMP FUNCTION # | | | ||||| | | <...>-852 [001] ..... 121.550551: tracing_mark_write[LINE TOO BIG] <...>-852 [001] ..... 121.550581: tracing_mark_write: 78901234 This now emulates the same output as trace_pipe. Link: https://lore.kernel.org/linux-trace-kernel/20231209171058.78c1a026@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fbcd3bafb93e..aa8f99f3e5de 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4722,7 +4722,11 @@ static int s_show(struct seq_file *m, void *v) iter->leftover = ret; } else { - print_trace_line(iter); + ret = print_trace_line(iter); + if (ret == TRACE_TYPE_PARTIAL_LINE) { + iter->seq.full = 0; + trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); + } ret = trace_print_seq(m, &iter->seq); /* * If we overflow the seq_file buffer, then it will -- cgit From 17d801758157bec93f26faaf5ff1a8b9a552d67a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 10 Dec 2023 22:12:50 -0500 Subject: ring-buffer: Fix memory leak of free page Reading the ring buffer does a swap of a sub-buffer within the ring buffer with an empty sub-buffer. This allows the reader to have full access to the content of the sub-buffer that was swapped out without having to worry about contention with the writer. The readers call ring_buffer_alloc_read_page() to allocate a page that will be used to swap with the ring buffer. When the code is finished with the reader page, it calls ring_buffer_free_read_page(). Instead of freeing the page, it stores it as a spare.
Then the next call to ring_buffer_alloc_read_page() will return this spare instead of calling into the memory management system to allocate a new page. Unfortunately, on freeing of the ring buffer, this spare page is not freed, which causes a memory leak. Link: https://lore.kernel.org/linux-trace-kernel/20231210221250.7b9cc83c@rorschach.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland Cc: Mathieu Desnoyers Fixes: 73a757e63114d ("ring-buffer: Return reader page back into existing ring buffer") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b8986f82eccf..dcd47895b424 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1787,6 +1787,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) free_buffer_page(bpage); } + free_page((unsigned long)cpu_buffer->free_page); + kfree(cpu_buffer); } -- cgit From d06aff1cb13d2a0d52b48e605462518149c98c81 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 10 Dec 2023 22:54:47 -0500 Subject: tracing: Update snapshot buffer on resize if it is allocated The snapshot buffer is to mimic the main buffer so that when a snapshot is needed, the snapshot and main buffer are swapped. When the snapshot buffer is allocated, it is set to the minimal size that the ring buffer may be at and still functional. When it is allocated it becomes the same size as the main ring buffer, and when the main ring buffer changes in size, it should do the same. Currently, the resize only updates the snapshot buffer if it's used by the current tracer (i.e. the preemptirqsoff tracer). But it needs to be updated anytime it is allocated. When changing the size of the main buffer, instead of looking to see if the current tracer is utilizing the snapshot buffer, just check if it is allocated to know if it should be updated or not. Also fix a typo in the comment just above the code change.
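As a rough sketch, the intended resize policy looks like this (illustrative only and simplified from the diff below; tr, tr->allocated_snapshot and ring_buffer_resize() are the existing kernel symbols):

	/* Resize the snapshot (max) buffer whenever it exists, not only
	 * when the current tracer happens to use it.
	 */
	if (tr->allocated_snapshot)	/* was: tr->current_trace->use_max_tr */
		ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);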
Link: https://lore.kernel.org/linux-trace-kernel/20231210225447.48476a6a@rorschach.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland Cc: Mathieu Desnoyers Fixes: ad909e21bbe69 ("tracing: Add internal tracing_snapshot() functions") Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aa8f99f3e5de..6c79548f9574 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6348,7 +6348,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, if (!tr->array_buffer.buffer) return 0; - /* Do not allow tracing while resizng ring buffer */ + /* Do not allow tracing while resizing ring buffer */ tracing_stop_tr(tr); ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu); @@ -6356,7 +6356,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, goto out_start; #ifdef CONFIG_TRACER_MAX_TRACE - if (!tr->current_trace->use_max_tr) + if (!tr->allocated_snapshot) goto out; ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); -- cgit From c41bd2514184d75db087fe4c1221237fb7922875 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Wed, 29 Nov 2023 22:04:09 +0000 Subject: kexec: drop dependency on ARCH_SUPPORTS_KEXEC from CRASH_DUMP In commit f8ff23429c62 ("kernel/Kconfig.kexec: drop select of KEXEC for CRASH_DUMP") we tried to fix a config regression, where CONFIG_CRASH_DUMP required CONFIG_KEXEC. However, it was not enough at least for arm64 platforms. While further testing the patch with our arm64 config I noticed that CONFIG_CRASH_DUMP is unavailable in menuconfig. This is because CONFIG_CRASH_DUMP still depends on the new CONFIG_ARCH_SUPPORTS_KEXEC introduced in commit 91506f7e5d21 ("arm64/kexec: refactor for kernel/Kconfig.kexec") and on arm64 CONFIG_ARCH_SUPPORTS_KEXEC requires CONFIG_PM_SLEEP_SMP=y, which in turn requires either CONFIG_SUSPEND=y or CONFIG_HIBERNATION=y, neither of which is set in our config. Given that we already established that CONFIG_KEXEC (which is a switch for the kexec system call itself) is not required for CONFIG_CRASH_DUMP, drop the CONFIG_ARCH_SUPPORTS_KEXEC dependency as well. The arm64 kernel builds just fine with CONFIG_CRASH_DUMP=y and with both CONFIG_KEXEC=n and CONFIG_KEXEC_FILE=n after f8ff23429c62 ("kernel/Kconfig.kexec: drop select of KEXEC for CRASH_DUMP") and this patch are applied, given that the necessary shared bits are included via the CONFIG_KEXEC_CORE dependency.
[bhe@redhat.com: don't export some symbols when CONFIG_MMU=n] Link: https://lkml.kernel.org/r/ZW03ODUKGGhP1ZGU@MiWiFi-R3L-srv [bhe@redhat.com: riscv, kexec: fix dependency of two items] Link: https://lkml.kernel.org/r/ZW04G/SKnhbE5mnX@MiWiFi-R3L-srv Link: https://lkml.kernel.org/r/20231129220409.55006-1-ignat@cloudflare.com Fixes: 91506f7e5d21 ("arm64/kexec: refactor for kernel/Kconfig.kexec") Signed-off-by: Ignat Korchagin Signed-off-by: Baoquan He Acked-by: Baoquan He Cc: Alexander Gordeev Cc: # 6.6+: f8ff234: kernel/Kconfig.kexec: drop select of KEXEC for CRASH_DUMP Cc: # 6.6+ Signed-off-by: Andrew Morton --- arch/riscv/Kconfig | 4 ++-- arch/riscv/kernel/crash_core.c | 4 +++- kernel/Kconfig.kexec | 1 - 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 95a2a06acc6a..24c1799e2ec4 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -685,7 +685,7 @@ config RISCV_BOOT_SPINWAIT If unsure what to do here, say N. config ARCH_SUPPORTS_KEXEC - def_bool MMU + def_bool y config ARCH_SELECTS_KEXEC def_bool y @@ -693,7 +693,7 @@ config ARCH_SELECTS_KEXEC select HOTPLUG_CPU if SMP config ARCH_SUPPORTS_KEXEC_FILE - def_bool 64BIT && MMU + def_bool 64BIT config ARCH_SELECTS_KEXEC_FILE def_bool y diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c index 55f1d7856b54..8706736fd4e2 100644 --- a/arch/riscv/kernel/crash_core.c +++ b/arch/riscv/kernel/crash_core.c @@ -5,17 +5,19 @@ void arch_crash_save_vmcoreinfo(void) { - VMCOREINFO_NUMBER(VA_BITS); VMCOREINFO_NUMBER(phys_ram_base); vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET); vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); +#ifdef CONFIG_MMU + VMCOREINFO_NUMBER(VA_BITS); vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START); vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END); #ifdef CONFIG_64BIT vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR); vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); +#endif #endif vmcoreinfo_append_str("NUMBER(KERNEL_LINK_ADDR)=0x%lx\n", KERNEL_LINK_ADDR); vmcoreinfo_append_str("NUMBER(va_kernel_pa_offset)=0x%lx\n", diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 1cc3b1c595d7..2fd510256604 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -94,7 +94,6 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" depends on ARCH_SUPPORTS_CRASH_DUMP - depends on ARCH_SUPPORTS_KEXEC select CRASH_CORE select KEXEC_CORE help -- cgit From 1dd11e977360ad3493812da0b05ffd9adcdd15a1 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Sat, 9 Dec 2023 22:14:38 +0800 Subject: crash_core: fix the check for whether crashkernel is from high memory If crash_base is equal to CRASH_ADDR_LOW_MAX, it also indicates that the crashkernel memory is allocated from high memory. However, the current check only considers the case where crash_base is greater than CRASH_ADDR_LOW_MAX. Fix it. The runtime effect is that crashkernel high memory is successfully reserved, whereas the crashkernel low memory is bypassed in this case, and then kdump kernel bootup will fail because there is no low memory under 4G. This patch also includes some minor cleanups.
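To illustrate the boundary case (a sketch with illustrative values; CRASH_ADDR_LOW_MAX is typically the 4G boundary): if the allocator returns crash_base == CRASH_ADDR_LOW_MAX, the range [crash_base, crash_base + crash_size) contains no memory below 4G, so the low-memory reservation must not be skipped:

	/* Sketch of the corrected condition from the diff below */
	if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
	    crash_low_size && reserve_crashkernel_low(crash_low_size)) {
		/* no low memory could be reserved: undo and bail out */
		memblock_phys_free(crash_base, crash_size);
		return;
	}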
Link: https://lkml.kernel.org/r/20231209141438.77233-1-ytcoode@gmail.com Fixes: 0ab97169aa05 ("crash_core: add generic function to do reservation") Signed-off-by: Yuntao Wang Cc: Baoquan He Cc: Dave Young Cc: Vivek Goyal Cc: Zhen Lei Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- kernel/crash_core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index efe87d501c8c..d4313b53837e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -199,7 +199,7 @@ static __initdata char *suffix_tbl[] = { * It returns 0 on success and -EINVAL on failure. */ static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, + unsigned long long *crash_size, const char *suffix) { char *cur = cmdline; @@ -268,9 +268,9 @@ static int __init __parse_crashkernel(char *cmdline, unsigned long long *crash_base, const char *suffix) { - char *first_colon, *first_space; - char *ck_cmdline; - char *name = "crashkernel="; + char *first_colon, *first_space; + char *ck_cmdline; + char *name = "crashkernel="; BUG_ON(!crash_size || !crash_base); *crash_size = 0; @@ -440,7 +440,7 @@ retry: return; } - if ((crash_base > CRASH_ADDR_LOW_MAX) && + if ((crash_base >= CRASH_ADDR_LOW_MAX) && crash_low_size && reserve_crashkernel_low(crash_low_size)) { memblock_phys_free(crash_base, crash_size); return; -- cgit From 9e45e39dc249c970d99d2681f6bcb55736fd725c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 11 Dec 2023 11:44:20 -0500 Subject: ring-buffer: Do not update before stamp when switching sub-buffers The ring buffer timestamps are synchronized by two timestamp placeholders. One is the "before_stamp" and the other is the "write_stamp" (sometimes referred to as the "after stamp", but only in the comments). These two stamps are key to knowing how to handle nested events coming in with a lockless system. When moving across sub-buffers, the before stamp is updated but the write stamp is not. There's an effort to put back the before stamp to something that seems logical in case there are nested events. But as the current event is about to cross sub-buffers, and so will any new nested event that happens, updating the before stamp is useless, and could even introduce new race conditions. The first event on a sub-buffer simply uses the sub-buffer's timestamp and keeps a "delta" of zero. The "before_stamp" and "write_stamp" are not used in the algorithm in this case. There's no reason to try to fix the before_stamp when this happens. As a bonus, it removes a cmpxchg() when crossing sub-buffers!
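A condensed sketch of the sub-buffer-crossing path after this change (simplified from the diff below; not the complete function):

	/* See if we shot past the end of this buffer page */
	if (unlikely(write > BUF_PAGE_SIZE)) {
		/* No before_stamp/write_stamp fix-up here: the first event
		 * on the next sub-buffer uses the sub-buffer's absolute
		 * timestamp with a delta of zero, so the stamps don't matter.
		 */
		check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
		return rb_move_tail(cpu_buffer, tail, info);
	}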
Link: https://lore.kernel.org/linux-trace-kernel/20231211114420.36dde01b@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland Cc: Mathieu Desnoyers Fixes: a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp") Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dcd47895b424..c7abcc215fe2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3607,14 +3607,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) { - /* before and after may now different, fix it up*/ - b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - if (a_ok && b_ok && info->before != info->after) - (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, - info->before, info->after); - if (a_ok && b_ok) - check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); + check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } -- cgit From b049525855fdd0024881c9b14b8fbec61c3f53d3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Dec 2023 07:25:58 -0500 Subject: ring-buffer: Have saved event hold the entire event For the ring buffer iterator (non-consuming read), the event needs to be copied into the iterator buffer to make sure that a writer does not overwrite it while the user is reading it. If a write happens during the copy, the buffer is simply discarded. But the temp buffer itself was not big enough. The allocation of the buffer was only BUF_MAX_DATA_SIZE, which is the maximum data size that can be passed into the ring buffer and saved. But the temp buffer needs to hold the meta data as well. That would be BUF_PAGE_SIZE and not BUF_MAX_DATA_SIZE. Link: https://lore.kernel.org/linux-trace-kernel/20231212072558.61f76493@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Fixes: 785888c544e04 ("ring-buffer: Have rb_iter_head_event() handle concurrent writer") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c7abcc215fe2..1d9caee7f542 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2409,7 +2409,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) */ barrier(); - if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE) + if ((iter->head + length) > commit || length > BUF_PAGE_SIZE) /* Writer corrupted the read? 
*/ goto reset; @@ -5118,7 +5118,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) if (!iter) return NULL; - iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags); + /* Holds the entire event: data and meta data */ + iter->event = kmalloc(BUF_PAGE_SIZE, flags); if (!iter->event) { kfree(iter); return NULL; } -- cgit From 60be76eeabb3d83858cc6577fc65c7d0f36ffd42 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Dec 2023 08:44:44 -0500 Subject: tracing: Add size check when printing trace_marker output If for some reason the trace_marker write does not have a nul byte for the string, it will overflow the print: trace_seq_printf(s, ": %s", field->buf); The field->buf could be missing the nul byte. To prevent overflow, add the max size that the buf can be by using the event size and the field location. int max = iter->ent_size - offsetof(struct print_entry, buf); trace_seq_printf(s, ": %.*s", max, field->buf); Link: https://lore.kernel.org/linux-trace-kernel/20231212084444.4619b8ce@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d8b302d01083..3e7fa44dc2b2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1587,11 +1587,12 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, { struct print_entry *field; struct trace_seq *s = &iter->seq; + int max = iter->ent_size - offsetof(struct print_entry, buf); trace_assign_type(field, iter->ent); seq_print_ip_sym(s, field->ip, flags); - trace_seq_printf(s, ": %s", field->buf); + trace_seq_printf(s, ": %.*s", max, field->buf); return trace_handle_return(s); } @@ -1600,10 +1601,11 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, struct trace_event *event) { struct print_entry *field; + int max = iter->ent_size - offsetof(struct print_entry, buf); trace_assign_type(field, iter->ent); - trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf); + trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf); return trace_handle_return(&iter->seq); } -- cgit From dee39c0c1e9624f925da4ca0bece46bdc7427257 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 1 Nov 2023 11:35:07 +0800 Subject: rcu: Force quiescent states only for ongoing grace period If an rcutorture test scenario creates an fqs_task kthread, it will periodically invoke rcu_force_quiescent_state() in order to start force-quiescent-state (FQS) operations. However, an FQS operation will be started even if there is no RCU grace period in progress. Although testing FQS operation startup when there is no grace period in progress is necessary, it need not happen all that often. This commit therefore causes rcu_force_quiescent_state() to take an early exit if there is no grace period in progress. Note that there will still be attempts to start an FQS scan in the absence of a grace period because the grace period might end right after the rcu_force_quiescent_state() function's check. In actual testing, this happens about once every ten minutes, which should provide adequate testing. Signed-off-by: Zqiang Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E.
McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3ac3c846105f..1ae851777806 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2338,6 +2338,8 @@ void rcu_force_quiescent_state(void) struct rcu_node *rnp; struct rcu_node *rnp_old = NULL; + if (!rcu_gp_in_progress()) + return; /* Funnel through hierarchy to reduce memory contention. */ rnp = raw_cpu_read(rcu_data.mynode); for (; rnp != NULL; rnp = rnp->parent) { -- cgit From 750e785796bb72423b97cac21ecd0fa3b3b65610 Mon Sep 17 00:00:00 2001 From: Jie Jiang Date: Tue, 12 Dec 2023 09:39:23 +0000 Subject: bpf: Support uid and gid when mounting bpffs Parse uid and gid in bpf_parse_param() so that they can be passed in as the `data` parameter when mounting bpffs. This will be useful when we want to control which user/group has control over the mounted bpffs; otherwise a separate chown() call would be needed. Signed-off-by: Jie Jiang Signed-off-by: Andrii Nakryiko Acked-by: Mike Frysinger Acked-by: Christian Brauner Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231212093923.497838-1-jiejiang@chromium.org --- include/linux/bpf.h | 2 ++ kernel/bpf/inode.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0bd4889e917a..c87c608a3689 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1595,6 +1595,8 @@ struct bpf_link_primer { }; struct bpf_mount_opts { + kuid_t uid; + kgid_t gid; umode_t mode; /* BPF token-related delegation options */ diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5359a0929c35..0a8e1188ea46 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -601,9 +601,16 @@ EXPORT_SYMBOL(bpf_prog_get_type_path); static int bpf_show_options(struct seq_file *m, struct dentry *root) { struct bpf_mount_opts *opts = root->d_sb->s_fs_info; - umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; + struct inode *inode = d_inode(root); + umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX; u64 mask; + if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, inode->i_uid)); + if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, inode->i_gid)); if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); @@ -652,6 +659,8 @@ const struct super_operations bpf_super_ops = { }; enum { + OPT_UID, + OPT_GID, OPT_MODE, OPT_DELEGATE_CMDS, OPT_DELEGATE_MAPS, @@ -660,6 +669,8 @@ enum { }; static const struct fs_parameter_spec bpf_fs_parameters[] = { + fsparam_u32 ("uid", OPT_UID), + fsparam_u32 ("gid", OPT_GID), fsparam_u32oct ("mode", OPT_MODE), fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS), fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS), @@ -672,6 +683,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct bpf_mount_opts *opts = fc->s_fs_info; struct fs_parse_result result; + kuid_t uid; + kgid_t gid; int opt, err; u64 msk; @@ -694,6 +707,34 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) } switch (opt) { + case OPT_UID: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + goto bad_value; + + /* + * The requested uid must be representable in the + * filesystem's idmapping.
+ */ + if (!kuid_has_mapping(fc->user_ns, uid)) + goto bad_value; + + opts->uid = uid; + break; + case OPT_GID: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + goto bad_value; + + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fc->user_ns, gid)) + goto bad_value; + + opts->gid = gid; + break; case OPT_MODE: opts->mode = result.uint_32 & S_IALLUGO; break; @@ -722,6 +763,9 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) } return 0; + +bad_value: + return invalfc(fc, "Bad value for '%s'", param->key); } struct bpf_preload_ops *bpf_preload_ops; @@ -808,6 +852,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &bpf_super_ops; inode = sb->s_root->d_inode; + inode->i_uid = opts->uid; + inode->i_gid = opts->gid; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; populate_bpffs(sb->s_root); @@ -843,6 +889,8 @@ static int bpf_init_fs_context(struct fs_context *fc) return -ENOMEM; opts->mode = S_IRWXUGO; + opts->uid = current_fsuid(); + opts->gid = current_fsgid(); /* start out with no BPF token delegation enabled */ opts->delegate_cmds = 0; -- cgit From f5fdb51fb980077a4c6c78f3f775821f611fb38b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 13 Dec 2023 11:08:33 -0800 Subject: bpf: fail BPF_TOKEN_CREATE if no delegation option was set on BPF FS It's quite confusing in practice when it's possible to successfully create a BPF token from BPF FS that didn't have any of delegate_xxx mount options set up. While it's not wrong, it's actually more meaningful to reject BPF_TOKEN_CREATE with a specific error code (-ENOENT) to let user-space know that no token delegation is set up. So, instead of creating an empty BPF token that will always be ignored because it doesn't have any of the allow_xxx bits set, reject it with -ENOENT. If we ever need an empty BPF token to be possible, we can support that with an extra flag passed into BPF_TOKEN_CREATE.
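From user space, the observable effect is roughly the following (a hedged sketch; the token_create attribute layout is assumed from this patch series and may not match the final uapi exactly):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	static int try_token_create(void)
	{
		union bpf_attr attr = {};
		int bpffs_fd, token_fd;

		/* bpffs mounted without any delegate_* mount option */
		bpffs_fd = open("/sys/fs/bpf", O_RDONLY);
		attr.token_create.bpffs_fd = bpffs_fd;
		token_fd = syscall(__NR_bpf, BPF_TOKEN_CREATE, &attr, sizeof(attr));
		/* expect token_fd < 0 with errno == ENOENT after this patch */
		return token_fd;
	}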
Acked-by: Christian Brauner Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231213190842.3844987-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/token.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 17212efcde60..a86fccd57e2d 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -152,6 +152,15 @@ int bpf_token_create(union bpf_attr *attr) goto out_path; } + mnt_opts = path.dentry->d_sb->s_fs_info; + if (mnt_opts->delegate_cmds == 0 && + mnt_opts->delegate_maps == 0 && + mnt_opts->delegate_progs == 0 && + mnt_opts->delegate_attachs == 0) { + err = -ENOENT; /* no BPF token delegation is set up */ + goto out_path; + } + mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode); if (IS_ERR(inode)) { @@ -181,7 +190,6 @@ int bpf_token_create(union bpf_attr *attr) /* remember bpffs owning userns for future ns_capable() checks */ token->userns = get_user_ns(userns); - mnt_opts = path.dentry->d_sb->s_fs_info; token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; token->allowed_progs = mnt_opts->delegate_progs; -- cgit From b13cddf633562b9b2c34fd63471d377019704ebe Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Fri, 8 Dec 2023 15:32:48 +0000 Subject: bpf: add small subset of SECURITY_PATH hooks to BPF sleepable_lsm_hooks list security_path_* based LSM hooks appear to be generally missing from the sleepable_lsm_hooks list. Initially add a small subset of them to the preexisting sleepable_lsm_hooks list so that sleepable BPF helpers like bpf_d_path() can be used from sleepable BPF LSM based programs. The security_path_* hooks added in this patch are similar to the security_inode_* counterparts that already exist in the sleepable_lsm_hooks list, and are called in roughly similar points and contexts, presumably making them safe to annotate as sleepable as well. Building a kernel with DEBUG_ATOMIC_SLEEP options enabled and running reasonable workloads stimulating activity that would be intercepted by such security hooks didn't show any splats. Notably, I haven't added all the security_path_* LSM hooks that are available as I don't need them at this point in time.
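For context, this is the kind of program these entries enable (a minimal sketch using common libbpf conventions; not part of this patch):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char LICENSE[] SEC("license") = "GPL";

	SEC("lsm.s/path_mkdir")	/* the ".s" suffix marks the program sleepable */
	int BPF_PROG(log_mkdir, const struct path *dir, struct dentry *dentry,
		     umode_t mode)
	{
		char buf[256];

		/* bpf_d_path() can be used here now that path_mkdir is on
		 * the sleepable_lsm_hooks list.
		 */
		if (bpf_d_path((struct path *)dir, buf, sizeof(buf)) > 0)
			bpf_printk("mkdir under %s", buf);
		return 0;
	}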
Signed-off-by: Matt Bobrowski Acked-by: KP Singh Link: https://lore.kernel.org/r/ZXM3IHHXpNY9y82a@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 7d2f96413a57..63b4dc495125 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -304,6 +304,18 @@ BTF_ID(func, bpf_lsm_kernel_module_request) BTF_ID(func, bpf_lsm_kernel_read_file) BTF_ID(func, bpf_lsm_kernfs_init_security) +#ifdef CONFIG_SECURITY_PATH +BTF_ID(func, bpf_lsm_path_unlink) +BTF_ID(func, bpf_lsm_path_mkdir) +BTF_ID(func, bpf_lsm_path_rmdir) +BTF_ID(func, bpf_lsm_path_truncate) +BTF_ID(func, bpf_lsm_path_symlink) +BTF_ID(func, bpf_lsm_path_link) +BTF_ID(func, bpf_lsm_path_rename) +BTF_ID(func, bpf_lsm_path_chmod) +BTF_ID(func, bpf_lsm_path_chown) +#endif /* CONFIG_SECURITY_PATH */ + #ifdef CONFIG_KEYS BTF_ID(func, bpf_lsm_key_free) #endif /* CONFIG_KEYS */ -- cgit From 2a0c6b41eec90c2a138ea8b574836744783c67ff Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 11 Dec 2023 16:34:47 +0800 Subject: bpf: Update the comments in maybe_wait_bpf_programs() Since commit 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs"), sleepable BPF programs can also use map-in-map, but maybe_wait_bpf_programs() doesn't handle it accordingly. The main reason is that using synchronize_rcu_tasks_trace() to wait for the completions of these sleepable BPF programs may incur a very long delay and userspace may think it is hung, so the wait for sleepable BPF programs is skipped. Update the comments in maybe_wait_bpf_programs() to reflect the reason. Signed-off-by: Hou Tao Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/r/20231211083447.1921178-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 06320d9abf33..d63c1ed42412 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -142,9 +142,13 @@ static u32 bpf_map_value_size(const struct bpf_map *map) static void maybe_wait_bpf_programs(struct bpf_map *map) { - /* Wait for any running BPF programs to complete so that - * userspace, when we return to it, knows that all programs - * that could be running use the new map value. + /* Wait for any running non-sleepable BPF programs to complete so that + * userspace, when we return to it, knows that all non-sleepable + * programs that could be running use the new map value. For sleepable + * BPF programs, synchronize_rcu_tasks_trace() should be used to wait + * for the completions of these programs, but considering the waiting + * time can be very long and userspace may think it will hang forever, + * so don't handle sleepable BPF programs now. */ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) -- cgit From 1cc111b9cddc71ce161cd388f11f0e9048edffdb Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Thu, 14 Dec 2023 09:21:53 +0800 Subject: tracing: Fix uaf issue when open the hist or hist_debug file KASAN reports the following issue. The root cause is that the 'hist' file of an instance can be opened and 'trace_event_file' accessed in hist_show() after 'trace_event_file' has been freed due to the instance being removed. The 'hist_debug' file has the same problem.
To fix it, call tracing_{open,release}_file_tr() in the file_operations callbacks to hold the ref count and avoid 'trace_event_file' being freed. BUG: KASAN: slab-use-after-free in hist_show+0x11e0/0x1278 Read of size 8 at addr ffff242541e336b8 by task head/190 CPU: 4 PID: 190 Comm: head Not tainted 6.7.0-rc5-g26aff849438c #133 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x98/0xf8 show_stack+0x1c/0x30 dump_stack_lvl+0x44/0x58 print_report+0xf0/0x5a0 kasan_report+0x80/0xc0 __asan_report_load8_noabort+0x1c/0x28 hist_show+0x11e0/0x1278 seq_read_iter+0x344/0xd78 seq_read+0x128/0x1c0 vfs_read+0x198/0x6c8 ksys_read+0xf4/0x1e0 __arm64_sys_read+0x70/0xa8 invoke_syscall+0x70/0x260 el0_svc_common.constprop.0+0xb0/0x280 do_el0_svc+0x44/0x60 el0_svc+0x34/0x68 el0t_64_sync_handler+0xb8/0xc0 el0t_64_sync+0x168/0x170 Allocated by task 188: kasan_save_stack+0x28/0x50 kasan_set_track+0x28/0x38 kasan_save_alloc_info+0x20/0x30 __kasan_slab_alloc+0x6c/0x80 kmem_cache_alloc+0x15c/0x4a8 trace_create_new_event+0x84/0x348 __trace_add_new_event+0x18/0x88 event_trace_add_tracer+0xc4/0x1a0 trace_array_create_dir+0x6c/0x100 trace_array_create+0x2e8/0x568 instance_mkdir+0x48/0x80 tracefs_syscall_mkdir+0x90/0xe8 vfs_mkdir+0x3c4/0x610 do_mkdirat+0x144/0x200 __arm64_sys_mkdirat+0x8c/0xc0 invoke_syscall+0x70/0x260 el0_svc_common.constprop.0+0xb0/0x280 do_el0_svc+0x44/0x60 el0_svc+0x34/0x68 el0t_64_sync_handler+0xb8/0xc0 el0t_64_sync+0x168/0x170 Freed by task 191: kasan_save_stack+0x28/0x50 kasan_set_track+0x28/0x38 kasan_save_free_info+0x34/0x58 __kasan_slab_free+0xe4/0x158 kmem_cache_free+0x19c/0x508 event_file_put+0xa0/0x120 remove_event_file_dir+0x180/0x320 event_trace_del_tracer+0xb0/0x180 __remove_instance+0x224/0x508 instance_rmdir+0x44/0x78 tracefs_syscall_rmdir+0xbc/0x140 vfs_rmdir+0x1cc/0x4c8 do_rmdir+0x220/0x2b8 __arm64_sys_unlinkat+0xc0/0x100 invoke_syscall+0x70/0x260 el0_svc_common.constprop.0+0xb0/0x280 do_el0_svc+0x44/0x60 el0_svc+0x34/0x68 el0t_64_sync_handler+0xb8/0xc0 el0t_64_sync+0x168/0x170 Link: https://lore.kernel.org/linux-trace-kernel/20231214012153.676155-1-zhengyejian1@huawei.com Suggested-by: Steven Rostedt Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 ++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_events_hist.c | 12 ++++++++---- 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6c79548f9574..199df497db07 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4968,6 +4968,12 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp) return 0; } +int tracing_single_release_file_tr(struct inode *inode, struct file *filp) +{ + tracing_release_file_tr(inode, filp); + return single_release(inode, filp); +} + static int tracing_mark_open(struct inode *inode, struct file *filp) { stream_open(inode, filp); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b7f4ea25a194..0489e72c8169 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -617,6 +617,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); int tracing_open_file_tr(struct inode *inode, struct file *filp); int tracing_release_file_tr(struct inode *inode, struct file *filp); +int tracing_single_release_file_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct
trace_array *tr); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1abc07fba1b9..5ecf3c8bde20 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5623,10 +5623,12 @@ static int event_hist_open(struct inode *inode, struct file *file) { int ret; - ret = security_locked_down(LOCKDOWN_TRACEFS); + ret = tracing_open_file_tr(inode, file); if (ret) return ret; + /* Clear private_data to avoid warning in single_open() */ + file->private_data = NULL; return single_open(file, hist_show, file); } @@ -5634,7 +5636,7 @@ const struct file_operations event_hist_fops = { .open = event_hist_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_file_tr, }; #ifdef CONFIG_HIST_TRIGGERS_DEBUG @@ -5900,10 +5902,12 @@ static int event_hist_debug_open(struct inode *inode, struct file *file) { int ret; - ret = security_locked_down(LOCKDOWN_TRACEFS); + ret = tracing_open_file_tr(inode, file); if (ret) return ret; + /* Clear private_data to avoid warning in single_open() */ + file->private_data = NULL; return single_open(file, hist_debug_show, file); } @@ -5911,7 +5915,7 @@ const struct file_operations event_hist_debug_fops = { .open = event_hist_debug_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_file_tr, }; #endif -- cgit From 8f82583f9527b3be9d70d9a5d1f33435e29d0480 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 14 Dec 2023 12:30:09 +0800 Subject: bpf: Reduce the scope of rcu_read_lock when updating fd map There is no rcu-read-lock requirement for ops->map_fd_get_ptr() or ops->map_fd_put_ptr(), so don't use rcu-read-lock for these two callbacks. For bpf_fd_array_map_update_elem(), accessing array->ptrs doesn't need rcu-read-lock because array->ptrs must still be allocated. For bpf_fd_htab_map_update_elem(), htab_map_update_elem() only requires rcu-read-lock to be held to avoid the WARN_ON_ONCE(), so only use rcu_read_lock() during the invocation of htab_map_update_elem(). Acked-by: Yonghong Song Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231214043010.3458072-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 6 ++++++ kernel/bpf/syscall.c | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 5b9146fa825f..ec3bdcc6a3cf 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2523,7 +2523,13 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, if (IS_ERR(ptr)) return PTR_ERR(ptr); + /* The htab bucket lock is always held during update operations in fd + * htab map, and the following rcu_read_lock() is only used to avoid + * the WARN_ON_ONCE in htab_map_update_elem().
+ */ + rcu_read_lock(); ret = htab_map_update_elem(map, key, &ptr, map_flags); + rcu_read_unlock(); if (ret) map->ops->map_fd_put_ptr(map, ptr, false); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d63c1ed42412..3fcf7741146a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -184,15 +184,11 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, err = bpf_percpu_cgroup_storage_update(map, key, value, flags); } else if (IS_FD_ARRAY(map)) { - rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, map_file, key, value, flags); - rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { - rcu_read_lock(); err = bpf_fd_htab_map_update_elem(map, map_file, key, value, flags); - rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { /* rcu_read_lock() is not needed */ err = bpf_fd_reuseport_array_update_elem(map, key, value, -- cgit From dc68540913ac523b46ebda3843cec179362c7a72 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 14 Dec 2023 12:30:10 +0800 Subject: bpf: Use GFP_KERNEL in bpf_event_entry_gen() rcu_read_lock() is no longer held when invoking bpf_event_entry_gen() which is called by perf_event_fd_array_get_ptr(), so use GFP_KERNEL instead of GFP_ATOMIC to reduce the possibility of failures due to out-of-memory. Acked-by: Yonghong Song Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231214043010.3458072-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8d365bda9a8b..b5ec24b3563e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1195,7 +1195,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, { struct bpf_event_entry *ee; - ee = kzalloc(sizeof(*ee), GFP_ATOMIC); + ee = kzalloc(sizeof(*ee), GFP_KERNEL); if (ee) { ee->event = perf_file->private_data; ee->perf_file = perf_file; -- cgit From 59e5791f59dd83e8aa72a4e74217eabb6e8cfd90 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 14 Dec 2023 12:38:15 -0800 Subject: bpf: Fix a race condition between btf_put() and map_free() When running `./test_progs -j` in my local vm with latest kernel, I once hit a kasan error like below: [ 1887.184724] BUG: KASAN: slab-use-after-free in bpf_rb_root_free+0x1f8/0x2b0 [ 1887.185599] Read of size 4 at addr ffff888106806910 by task kworker/u12:2/2830 [ 1887.186498] [ 1887.186712] CPU: 3 PID: 2830 Comm: kworker/u12:2 Tainted: G OEL 6.7.0-rc3-00699-g90679706d486-dirty #494 [ 1887.188034] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1887.189618] Workqueue: events_unbound bpf_map_free_deferred [ 1887.190341] Call Trace: [ 1887.190666] [ 1887.190949] dump_stack_lvl+0xac/0xe0 [ 1887.191423] ? nf_tcp_handle_invalid+0x1b0/0x1b0 [ 1887.192019] ? panic+0x3c0/0x3c0 [ 1887.192449] print_report+0x14f/0x720 [ 1887.192930] ? preempt_count_sub+0x1c/0xd0 [ 1887.193459] ? __virt_addr_valid+0xac/0x120 [ 1887.194004] ? bpf_rb_root_free+0x1f8/0x2b0 [ 1887.194572] kasan_report+0xc3/0x100 [ 1887.195085] ? bpf_rb_root_free+0x1f8/0x2b0 [ 1887.195668] bpf_rb_root_free+0x1f8/0x2b0 [ 1887.196183] ? __bpf_obj_drop_impl+0xb0/0xb0 [ 1887.196736] ? preempt_count_sub+0x1c/0xd0 [ 1887.197270] ? preempt_count_sub+0x1c/0xd0 [ 1887.197802] ?
_raw_spin_unlock+0x1f/0x40 [ 1887.198319] bpf_obj_free_fields+0x1d4/0x260 [ 1887.198883] array_map_free+0x1a3/0x260 [ 1887.199380] bpf_map_free_deferred+0x7b/0xe0 [ 1887.199943] process_scheduled_works+0x3a2/0x6c0 [ 1887.200549] worker_thread+0x633/0x890 [ 1887.201047] ? __kthread_parkme+0xd7/0xf0 [ 1887.201574] ? kthread+0x102/0x1d0 [ 1887.202020] kthread+0x1ab/0x1d0 [ 1887.202447] ? pr_cont_work+0x270/0x270 [ 1887.202954] ? kthread_blkcg+0x50/0x50 [ 1887.203444] ret_from_fork+0x34/0x50 [ 1887.203914] ? kthread_blkcg+0x50/0x50 [ 1887.204397] ret_from_fork_asm+0x11/0x20 [ 1887.204913] [ 1887.204913] [ 1887.205209] [ 1887.205416] Allocated by task 2197: [ 1887.205881] kasan_set_track+0x3f/0x60 [ 1887.206366] __kasan_kmalloc+0x6e/0x80 [ 1887.206856] __kmalloc+0xac/0x1a0 [ 1887.207293] btf_parse_fields+0xa15/0x1480 [ 1887.207836] btf_parse_struct_metas+0x566/0x670 [ 1887.208387] btf_new_fd+0x294/0x4d0 [ 1887.208851] __sys_bpf+0x4ba/0x600 [ 1887.209292] __x64_sys_bpf+0x41/0x50 [ 1887.209762] do_syscall_64+0x4c/0xf0 [ 1887.210222] entry_SYSCALL_64_after_hwframe+0x63/0x6b [ 1887.210868] [ 1887.211074] Freed by task 36: [ 1887.211460] kasan_set_track+0x3f/0x60 [ 1887.211951] kasan_save_free_info+0x28/0x40 [ 1887.212485] ____kasan_slab_free+0x101/0x180 [ 1887.213027] __kmem_cache_free+0xe4/0x210 [ 1887.213514] btf_free+0x5b/0x130 [ 1887.213918] rcu_core+0x638/0xcc0 [ 1887.214347] __do_softirq+0x114/0x37e The error happens at bpf_rb_root_free+0x1f8/0x2b0: 00000000000034c0 : ; { 34c0: f3 0f 1e fa endbr64 34c4: e8 00 00 00 00 callq 0x34c9 34c9: 55 pushq %rbp 34ca: 48 89 e5 movq %rsp, %rbp ... ; if (rec && rec->refcount_off >= 0 && 36aa: 4d 85 ed testq %r13, %r13 36ad: 74 a9 je 0x3658 36af: 49 8d 7d 10 leaq 0x10(%r13), %rdi 36b3: e8 00 00 00 00 callq 0x36b8 <==== kasan function 36b8: 45 8b 7d 10 movl 0x10(%r13), %r15d <==== use-after-free load 36bc: 45 85 ff testl %r15d, %r15d 36bf: 78 8c js 0x364d So the problem is at rec->refcount_off in the above. I did some source code analysis and found the reason. CPU A CPU B bpf_map_put: ... btf_put with rcu callback ... bpf_map_free_deferred with system_unbound_wq ... ... ... ... btf_free_rcu: ... ... ... bpf_map_free_deferred: ... ... ... ---------> btf_struct_metas_free() ... | race condition ... ... ---------> map->ops->map_free() ... ... btf->struct_meta_tab = NULL In the above, map_free() corresponds to array_map_free() and eventually calls bpf_rb_root_free(), which calls: ... __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); ... Here, 'value_rec' is assigned in btf_check_and_fixup_fields() with the following code: meta = btf_find_struct_meta(btf, btf_id); if (!meta) return -EFAULT; rec->fields[i].graph_root.value_rec = meta->record; So basically, 'value_rec' is a pointer to the record in struct_metas_tab. And it is possible that that particular record has been freed by btf_struct_metas_free() and hence we have a kasan error here. Actually it is very hard to reproduce the failure with current bpf/bpf-next code; I only got the above error once. To increase reproducibility, I added a delay in bpf_map_free_deferred() to delay map->ops->map_free(), which significantly increased reproducibility.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5e43ddd1b83f..aae5b5213e93 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -695,6 +695,7 @@ static void bpf_map_free_deferred(struct work_struct *work) struct bpf_map *map = container_of(work, struct bpf_map, work); struct btf_record *rec = map->record; + mdelay(100); security_bpf_map_free(map); bpf_map_release_memcg(map); /* implementation dependent freeing */ Hao also provided test cases ([1]) for easily reproducing the above issue. There are two ways to fix the issue: the v1 of the patch ([2]) moves btf_put() after the map_free callback, and the v5 of the patch ([3]) uses a kptr style fix which tries to get a btf reference during map_check_btf(). Each approach has its pros and cons. The first approach delays freeing btf while the second approach needs to acquire a reference depending on context, which makes the logic not very elegant and may complicate things with future new data structures. Alexei suggested in [4] going back to v1, which is what this patch tries to do. I reran './test_progs -j' with the above mdelay() hack a couple of times and didn't observe the error for the above rb_root test cases. Running Hou's test ([1]) was also successful. [1] https://lore.kernel.org/bpf/20231207141500.917136-1-houtao@huaweicloud.com/ [2] v1: https://lore.kernel.org/bpf/20231204173946.3066377-1-yonghong.song@linux.dev/ [3] v5: https://lore.kernel.org/bpf/20231208041621.2968241-1-yonghong.song@linux.dev/ [4] v4: https://lore.kernel.org/bpf/CAADnVQJ3FiXUhZJwX_81sjZvSYYKCFB3BT6P8D59RS2Gu+0Z7g@mail.gmail.com/ Cc: Hou Tao Fixes: 958cf2e273f0 ("bpf: Introduce bpf_obj_new") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231214203815.1469107-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3fcf7741146a..8faa1a20edf8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -692,6 +692,7 @@ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); struct btf_record *rec = map->record; + struct btf *btf = map->btf; security_bpf_map_free(map); bpf_map_release_memcg(map); @@ -707,6 +708,10 @@ static void bpf_map_free_deferred(struct work_struct *work) * template bpf_map struct used during verification. */ btf_record_free(rec); + /* Delay freeing of btf for maps, as map_free callback may need + * struct_meta info which will be freed with btf_put(). + */ + btf_put(btf); } static void bpf_map_put_uref(struct bpf_map *map) @@ -747,7 +752,6 @@ void bpf_map_put(struct bpf_map *map) if (atomic64_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); - btf_put(map->btf); WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); if (READ_ONCE(map->free_after_mult_rcu_gp)) -- cgit From c5707b2146d229691e193d5158ea70b21b8ba180 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 14:50:15 -0800 Subject: bpf: support symbolic BPF FS delegation mount options Besides the already supported special "any" value and hex bit mask, support string-based parsing of delegation masks based on exact enumerator names.
Utilize BTF information of `enum bpf_cmd`, `enum bpf_map_type`, `enum bpf_prog_type`, and `enum bpf_attach_type` types to find supported symbolic names (ignoring __MAX_xxx guard values and stripping repetitive prefixes like BPF_ for cmd and attach types, BPF_MAP_TYPE_ for maps, and BPF_PROG_TYPE_ for prog types). The case doesn't matter, but it is normalized to lower case in mount option output. So "PROG_LOAD", "prog_load", and "MAP_create" are all valid values to specify for delegate_cmds options, "array" is among supported for map types, etc. Besides supporting string values, we also support multiple values specified at the same time, using colon (':') separator. There are corresponding changes on bpf_show_options side to use known values to print them in human-readable format, falling back to hex mask printing, if there are any unrecognized bits. This shouldn't be necessary when enum BTF information is present, but in general we should always be able to fall back to this even if kernel was built without BTF. As mentioned, emitted symbolic names are normalized to be all lower case. Example below shows various ways to specify delegate_cmds options through mount command and how mount options are printed back: 12/14 14:39:07.604 vmuser@archvm:~/local/linux/tools/testing/selftests/bpf $ mount | rg token $ sudo mkdir -p /sys/fs/bpf/token $ sudo mount -t bpf bpffs /sys/fs/bpf/token \ -o delegate_cmds=prog_load:MAP_CREATE \ -o delegate_progs=kprobe \ -o delegate_attachs=xdp $ mount | grep token bpffs on /sys/fs/bpf/token type bpf (rw,relatime,delegate_cmds=map_create:prog_load,delegate_progs=kprobe,delegate_attachs=xdp) Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231214225016.1209867-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/inode.c | 249 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 211 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 0a8e1188ea46..4383b3d13a55 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -595,6 +595,136 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ } EXPORT_SYMBOL(bpf_prog_get_type_path); +struct bpffs_btf_enums { + const struct btf *btf; + const struct btf_type *cmd_t; + const struct btf_type *map_t; + const struct btf_type *prog_t; + const struct btf_type *attach_t; +}; + +static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) +{ + const struct btf *btf; + const struct btf_type *t; + const char *name; + int i, n; + + memset(info, 0, sizeof(*info)); + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR(btf)) + return PTR_ERR(btf); + if (!btf) + return -ENOENT; + + info->btf = btf; + + for (i = 1, n = btf_nr_types(btf); i < n; i++) { + t = btf_type_by_id(btf, i); + if (!btf_type_is_enum(t)) + continue; + + name = btf_name_by_offset(btf, t->name_off); + if (!name) + continue; + + if (strcmp(name, "bpf_cmd") == 0) + info->cmd_t = t; + else if (strcmp(name, "bpf_map_type") == 0) + info->map_t = t; + else if (strcmp(name, "bpf_prog_type") == 0) + info->prog_t = t; + else if (strcmp(name, "bpf_attach_type") == 0) + info->attach_t = t; + else + continue; + + if (info->cmd_t && info->map_t && info->prog_t && info->attach_t) + return 0; + } + + return -ESRCH; +} + +static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t, + const char *prefix, const char *str, int *value) +{ + const struct btf_enum *e; + const char *name; + int i, n, pfx_len = 
strlen(prefix); + + *value = 0; + + if (!btf || !enum_t) + return false; + + for (i = 0, n = btf_vlen(enum_t); i < n; i++) { + e = &btf_enum(enum_t)[i]; + + name = btf_name_by_offset(btf, e->name_off); + if (!name || strncasecmp(name, prefix, pfx_len) != 0) + continue; + + /* match symbolic name case insensitive and ignoring prefix */ + if (strcasecmp(name + pfx_len, str) == 0) { + *value = e->val; + return true; + } + } + + return false; +} + +static void seq_print_delegate_opts(struct seq_file *m, + const char *opt_name, + const struct btf *btf, + const struct btf_type *enum_t, + const char *prefix, + u64 delegate_msk, u64 any_msk) +{ + const struct btf_enum *e; + bool first = true; + const char *name; + u64 msk; + int i, n, pfx_len = strlen(prefix); + + delegate_msk &= any_msk; /* clear unknown bits */ + + if (delegate_msk == 0) + return; + + seq_printf(m, ",%s", opt_name); + if (delegate_msk == any_msk) { + seq_printf(m, "=any"); + return; + } + + if (btf && enum_t) { + for (i = 0, n = btf_vlen(enum_t); i < n; i++) { + e = &btf_enum(enum_t)[i]; + name = btf_name_by_offset(btf, e->name_off); + if (!name || strncasecmp(name, prefix, pfx_len) != 0) + continue; + msk = 1ULL << e->val; + if (delegate_msk & msk) { + /* emit lower-case name without prefix */ + seq_printf(m, "%c", first ? '=' : ':'); + name += pfx_len; + while (*name) { + seq_printf(m, "%c", tolower(*name)); + name++; + } + + delegate_msk &= ~msk; + first = false; + } + } + } + if (delegate_msk) + seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk); +} + /* * Display the mount options in /proc/mounts. */ @@ -614,29 +744,34 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); - mask = (1ULL << __MAX_BPF_CMD) - 1; - if ((opts->delegate_cmds & mask) == mask) - seq_printf(m, ",delegate_cmds=any"); - else if (opts->delegate_cmds) - seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds); - - mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; - if ((opts->delegate_maps & mask) == mask) - seq_printf(m, ",delegate_maps=any"); - else if (opts->delegate_maps) - seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps); - - mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; - if ((opts->delegate_progs & mask) == mask) - seq_printf(m, ",delegate_progs=any"); - else if (opts->delegate_progs) - seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs); - - mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; - if ((opts->delegate_attachs & mask) == mask) - seq_printf(m, ",delegate_attachs=any"); - else if (opts->delegate_attachs) - seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs); + if (opts->delegate_cmds || opts->delegate_maps || + opts->delegate_progs || opts->delegate_attachs) { + struct bpffs_btf_enums info; + + /* ignore errors, fallback to hex */ + (void)find_bpffs_btf_enums(&info); + + mask = (1ULL << __MAX_BPF_CMD) - 1; + seq_print_delegate_opts(m, "delegate_cmds", + info.btf, info.cmd_t, "BPF_", + opts->delegate_cmds, mask); + + mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; + seq_print_delegate_opts(m, "delegate_maps", + info.btf, info.map_t, "BPF_MAP_TYPE_", + opts->delegate_maps, mask); + + mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; + seq_print_delegate_opts(m, "delegate_progs", + info.btf, info.prog_t, "BPF_PROG_TYPE_", + opts->delegate_progs, mask); + + mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; + seq_print_delegate_opts(m, "delegate_attachs", + info.btf, info.attach_t, "BPF_", + opts->delegate_attachs, mask); + } + return 0; } @@ -686,7 +821,6 @@ 
static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) kuid_t uid; kgid_t gid; int opt, err; - u64 msk; opt = fs_parse(fc, bpf_fs_parameters, param, &result); if (opt < 0) { @@ -741,24 +875,63 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_DELEGATE_CMDS: case OPT_DELEGATE_MAPS: case OPT_DELEGATE_PROGS: - case OPT_DELEGATE_ATTACHS: - if (strcmp(param->string, "any") == 0) { - msk = ~0ULL; - } else { - err = kstrtou64(param->string, 0, &msk); - if (err) - return err; + case OPT_DELEGATE_ATTACHS: { + struct bpffs_btf_enums info; + const struct btf_type *enum_t; + const char *enum_pfx; + u64 *delegate_msk, msk = 0; + char *p; + int val; + + /* ignore errors, fallback to hex */ + (void)find_bpffs_btf_enums(&info); + + switch (opt) { + case OPT_DELEGATE_CMDS: + delegate_msk = &opts->delegate_cmds; + enum_t = info.cmd_t; + enum_pfx = "BPF_"; + break; + case OPT_DELEGATE_MAPS: + delegate_msk = &opts->delegate_maps; + enum_t = info.map_t; + enum_pfx = "BPF_MAP_TYPE_"; + break; + case OPT_DELEGATE_PROGS: + delegate_msk = &opts->delegate_progs; + enum_t = info.prog_t; + enum_pfx = "BPF_PROG_TYPE_"; + break; + case OPT_DELEGATE_ATTACHS: + delegate_msk = &opts->delegate_attachs; + enum_t = info.attach_t; + enum_pfx = "BPF_"; + break; + default: + return -EINVAL; } + + while ((p = strsep(¶m->string, ":"))) { + if (strcmp(p, "any") == 0) { + msk |= ~0ULL; + } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { + msk |= 1ULL << val; + } else { + err = kstrtou64(p, 0, &msk); + if (err) + return err; + } + } + /* Setting delegation mount options requires privileges */ if (msk && !capable(CAP_SYS_ADMIN)) return -EPERM; - switch (opt) { - case OPT_DELEGATE_CMDS: opts->delegate_cmds |= msk; break; - case OPT_DELEGATE_MAPS: opts->delegate_maps |= msk; break; - case OPT_DELEGATE_PROGS: opts->delegate_progs |= msk; break; - case OPT_DELEGATE_ATTACHS: opts->delegate_attachs |= msk; break; - default: return -EINVAL; - } + + *delegate_msk |= msk; + break; + } + default: + /* ignore unknown mount options */ break; } -- cgit From 7489723c2e26504573dbb49b66bbc59092840008 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Thu, 14 Dec 2023 15:56:25 -0700 Subject: bpf: xdp: Register generic_kfunc_set with XDP programs Registering generic_kfunc_set with XDP programs enables some of the newer BPF features inside XDP -- namely tree based data structures and BPF exceptions. The current motivation for this commit is to enable assertions inside XDP bpf progs. Assertions are a standard and useful tool to encode intent. 
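For illustration, with this change an XDP program can use the assertion macros from the selftests' bpf_experimental.h. This is a sketch under stated assumptions: the header and bpf_assert() macro are the ones shipped in tools/testing/selftests/bpf, and a failed assertion throws a BPF exception, which (absent a custom exception callback) terminates the program with a return value of 0.

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"	/* bpf_assert(), from the kernel selftests */

SEC("xdp")
int xdp_assert_eth(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;

	/* encode the intent that every frame carries an Ethernet header */
	bpf_assert(data + sizeof(struct ethhdr) <= data_end);
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";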
Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/d07d4614b81ca6aada44fcb89bb6b618fb66e4ca.1702594357.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b3be5742d6f1..b0b485126a76 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2630,6 +2630,7 @@ static int __init kfunc_init(void) ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), -- cgit From 4ad4c1f394b84f9941a10aa8aaf11102478a390b Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 24 Nov 2023 18:10:03 +0000 Subject: dma-mapping: don't store redundant offsets A bus_dma_region necessarily stores both CPU and DMA base addresses for a range, so there's no need to also store the difference between them. Signed-off-by: Robin Murphy Acked-by: Rob Herring Signed-off-by: Christoph Hellwig --- drivers/acpi/scan.c | 1 - drivers/of/address.c | 1 - include/linux/dma-direct.h | 19 ++++++++++++------- kernel/dma/direct.c | 1 - 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 02bb2cce423f..ee88a727f200 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1532,7 +1532,6 @@ int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map) r->cpu_start = rentry->res->start; r->dma_start = rentry->res->start - rentry->offset; r->size = resource_size(rentry->res); - r->offset = rentry->offset; r++; } } diff --git a/drivers/of/address.c b/drivers/of/address.c index b59956310f66..ae46a3605904 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -955,7 +955,6 @@ int of_dma_get_range(struct device_node *np, const struct bus_dma_region **map) r->cpu_start = range.cpu_addr; r->dma_start = range.bus_addr; r->size = range.size; - r->offset = range.cpu_addr - range.bus_addr; r++; } out: diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 18aade195884..3eb3589ff43e 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -21,7 +21,6 @@ struct bus_dma_region { phys_addr_t cpu_start; dma_addr_t dma_start; u64 size; - u64 offset; }; static inline dma_addr_t translate_phys_to_dma(struct device *dev, @@ -29,9 +28,12 @@ static inline dma_addr_t translate_phys_to_dma(struct device *dev, { const struct bus_dma_region *m; - for (m = dev->dma_range_map; m->size; m++) - if (paddr >= m->cpu_start && paddr - m->cpu_start < m->size) - return (dma_addr_t)paddr - m->offset; + for (m = dev->dma_range_map; m->size; m++) { + u64 offset = paddr - m->cpu_start; + + if (paddr >= m->cpu_start && offset < m->size) + return m->dma_start + offset; + } /* make sure dma_capable fails when no translation is available */ return DMA_MAPPING_ERROR; @@ -42,9 +44,12 @@ static inline phys_addr_t translate_dma_to_phys(struct device *dev, { const struct bus_dma_region *m; - for (m = dev->dma_range_map; m->size; m++) - if (dma_addr >= m->dma_start && dma_addr - m->dma_start < m->size) - return (phys_addr_t)dma_addr + m->offset; + for (m = dev->dma_range_map; m->size; m++) { + u64 offset = dma_addr - m->dma_start; + + if 
(dma_addr >= m->dma_start && offset < m->size)
+			return m->cpu_start + offset;
+	}

 	return (phys_addr_t)-1;
 }

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 73c95815789a..98b2e192fd69 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -677,7 +677,6 @@ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
 		return -ENOMEM;
 	map[0].cpu_start = cpu_start;
 	map[0].dma_start = dma_start;
-	map[0].offset = offset;
 	map[0].size = size;
 	dev->dma_range_map = map;
 	return 0;
--
cgit

From 55c543865b76830f524ce5ce1243266a93d06f84 Mon Sep 17 00:00:00 2001
From: Petr Tesarik
Date: Fri, 1 Dec 2023 13:13:52 +0100
Subject: swiotlb: reduce area lock contention for non-primary IO TLB pools

If multiple areas and multiple IO TLB pools exist, first iterate the current CPU specific area in all pools. Then move to the next area index. This is best illustrated by a diagram:

        area 0 | area 1 | ... | area M |
pool 0    A        B              C
pool 1    D        E
...
pool N    F        G              H

Currently, each pool is searched before moving on to the next pool, i.e. the search order is A, B ... C, D, E ... F, G ... H. With this patch, each area is searched in all pools before moving on to the next area, i.e. the search order is A, D ... F, B, E ... G ... C ... H.

Note that preemption is not disabled, and raw_smp_processor_id() may not return a stable result, but it is called only once to determine the initial area index. The search will iterate over all areas eventually, even if the current task is preempted.

Next, some pools may have fewer (but not more) areas than default_nareas. Skip such pools if the distance from the initial area index is greater than pool->nareas. This logic ensures that for every pool the search starts in the initial CPU's own area and never tries any area twice.

To verify the performance impact, I booted the kernel with a minimum pool size ("swiotlb=512,4,force"), so multiple pools get allocated, and I ran these benchmarks:

- small: single-threaded I/O of 4 KiB blocks,
- big: single-threaded I/O of 64 KiB blocks,
- 4way: 4-way parallel I/O of 4 KiB blocks.

The "var" column in the tables below is the coefficient of variance over 5 runs of the test, and the "diff" column is the relative difference against base in read-write I/O bandwidth (MiB/s).

Tested on an x86 VM against a QEMU virtio SATA driver backed by a RAM-based block device on the host:

        base      patched
        var       var       diff
small   0.69%     0.62%     +25.4%
big     2.14%     2.27%     +25.7%
4way    2.65%     1.70%     +23.6%

Tested on a Raspberry Pi against a class-10 A1 microSD card:

        base      patched
        var       var       diff
small   0.53%     1.96%     -0.3%
big     0.02%     0.57%     +0.8%
4way    6.17%     0.40%     +0.3%

These results confirm that there is a significant performance boost in the software IO TLB slot allocation itself. Where performance is dominated by actual hardware, there is no measurable change.

Signed-off-by: Petr Tesarik
Reviewed-by: Mirsad Todorovac
Signed-off-by: Christoph Hellwig
---
 kernel/dma/swiotlb.c | 90 ++++++++++++++++++++++++++--------------------
 1 file changed, 55 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 33d942615be5..e20b856255ef 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -957,7 +957,7 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
 #endif /* CONFIG_DEBUG_FS */

 /**
- * swiotlb_area_find_slots() - search for slots in one IO TLB memory area
+ * swiotlb_search_pool_area() - search one memory area in one pool
 * @dev: Device which maps the buffer.
 * @pool: Memory pool to be searched.
* @area_index: Index of the IO TLB memory area to be searched. @@ -972,7 +972,7 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) * * Return: Index of the first allocated slot, or -1 on error. */ -static int swiotlb_area_find_slots(struct device *dev, struct io_tlb_pool *pool, +static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool, int area_index, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask) { @@ -1066,41 +1066,50 @@ found: return slot_index; } +#ifdef CONFIG_SWIOTLB_DYNAMIC + /** - * swiotlb_pool_find_slots() - search for slots in one memory pool + * swiotlb_search_area() - search one memory area in all pools * @dev: Device which maps the buffer. - * @pool: Memory pool to be searched. + * @start_cpu: Start CPU number. + * @cpu_offset: Offset from @start_cpu. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. + * @retpool: Used memory pool, updated on return. * - * Search through one memory pool to find a sequence of slots that match the + * Search one memory area in all pools for a sequence of slots that match the * allocation constraints. * * Return: Index of the first allocated slot, or -1 on error. */ -static int swiotlb_pool_find_slots(struct device *dev, struct io_tlb_pool *pool, - phys_addr_t orig_addr, size_t alloc_size, - unsigned int alloc_align_mask) +static int swiotlb_search_area(struct device *dev, int start_cpu, + int cpu_offset, phys_addr_t orig_addr, size_t alloc_size, + unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { - int start = raw_smp_processor_id() & (pool->nareas - 1); - int i = start, index; - - do { - index = swiotlb_area_find_slots(dev, pool, i, orig_addr, - alloc_size, alloc_align_mask); - if (index >= 0) - return index; - if (++i >= pool->nareas) - i = 0; - } while (i != start); + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + struct io_tlb_pool *pool; + int area_index; + int index = -1; - return -1; + rcu_read_lock(); + list_for_each_entry_rcu(pool, &mem->pools, node) { + if (cpu_offset >= pool->nareas) + continue; + area_index = (start_cpu + cpu_offset) & (pool->nareas - 1); + index = swiotlb_search_pool_area(dev, pool, area_index, + orig_addr, alloc_size, + alloc_align_mask); + if (index >= 0) { + *retpool = pool; + break; + } + } + rcu_read_unlock(); + return index; } -#ifdef CONFIG_SWIOTLB_DYNAMIC - /** * swiotlb_find_slots() - search for slots in the whole swiotlb * @dev: Device which maps the buffer. 
@@ -1124,18 +1133,17 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, unsigned long nslabs; unsigned long flags; u64 phys_limit; + int cpu, i; int index; - rcu_read_lock(); - list_for_each_entry_rcu(pool, &mem->pools, node) { - index = swiotlb_pool_find_slots(dev, pool, orig_addr, - alloc_size, alloc_align_mask); - if (index >= 0) { - rcu_read_unlock(); + cpu = raw_smp_processor_id(); + for (i = 0; i < default_nareas; ++i) { + index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size, + alloc_align_mask, &pool); + if (index >= 0) goto found; - } } - rcu_read_unlock(); + if (!mem->can_grow) return -1; @@ -1148,8 +1156,8 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, if (!pool) return -1; - index = swiotlb_pool_find_slots(dev, pool, orig_addr, - alloc_size, alloc_align_mask); + index = swiotlb_search_pool_area(dev, pool, 0, orig_addr, + alloc_size, alloc_align_mask); if (index < 0) { swiotlb_dyn_free(&pool->rcu); return -1; @@ -1192,9 +1200,21 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { - *retpool = &dev->dma_io_tlb_mem->defpool; - return swiotlb_pool_find_slots(dev, *retpool, - orig_addr, alloc_size, alloc_align_mask); + struct io_tlb_pool *pool; + int start, i; + int index; + + *retpool = pool = &dev->dma_io_tlb_mem->defpool; + i = start = raw_smp_processor_id() & (pool->nareas - 1); + do { + index = swiotlb_search_pool_area(dev, pool, i, orig_addr, + alloc_size, alloc_align_mask); + if (index >= 0) + return index; + if (++i >= pool->nareas) + i = 0; + } while (i != start); + return -1; } #endif /* CONFIG_SWIOTLB_DYNAMIC */ -- cgit From b07bc2347672cc8c7293c64499f1488278c5ca3d Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Thu, 14 Dec 2023 16:25:26 +0800 Subject: dma-mapping: clear dev->dma_mem to NULL after freeing it Reproduced with below sequence: dma_declare_coherent_memory()->dma_release_coherent_memory() ->dma_declare_coherent_memory()->"return -EBUSY" error It will return -EBUSY from the dma_assign_coherent_memory() in dma_declare_coherent_memory(), the reason is that dev->dma_mem pointer has not been set to NULL after it's freed. Fixes: cf65a0f6f6ff ("dma-mapping: move all DMA mapping code to kernel/dma") Signed-off-by: Joakim Zhang Signed-off-by: Christoph Hellwig --- kernel/dma/coherent.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index c21abc77c53e..ff5683a57f77 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -132,8 +132,10 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, void dma_release_coherent_memory(struct device *dev) { - if (dev) + if (dev) { _dma_release_coherent_memory(dev->dma_mem); + dev->dma_mem = NULL; + } } static void *__dma_alloc_from_coherent(struct device *dev, -- cgit From 7e2c1e4b34f07d9aa8937fab88359d4a0fce468e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 15 Dec 2023 11:24:50 +0000 Subject: perf: Fix perf_event_validate_size() lockdep splat When lockdep is enabled, the for_each_sibling_event(sibling, event) macro checks that event->ctx->mutex is held. 
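For context, the guard that fires here looks roughly like this (paraphrased from include/linux/perf_event.h; the real check is a lockdep-conditional WARN_ON_ONCE rather than a plain assertion, so the exact form below is an approximation):

/* paraphrased; see include/linux/perf_event.h for the real definitions */
#define lockdep_assert_event_ctx(event)				\
	lockdep_assert_held(&(event)->ctx->mutex)	/* ctx == NULL here -> splat */

#define for_each_sibling_event(sibling, event)			\
	lockdep_assert_event_ctx(event);			\
	if ((event)->group_leader == (event))			\
		list_for_each_entry((sibling), &(event)->sibling_list, sibling_list)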
When creating a new group leader event, we call perf_event_validate_size() on a partially initialized event where event->ctx is NULL, and so when for_each_sibling_event() attempts to check event->ctx->mutex, we get a splat, as reported by Lucas De Marchi:

  WARNING: CPU: 8 PID: 1471 at kernel/events/core.c:1950 __do_sys_perf_event_open+0xf37/0x1080

This only happens for a new event which is its own group_leader, and in this case there cannot be any sibling events. Thus it's safe to skip the check for siblings, which avoids having to make invasive and ugly changes to for_each_sibling_event().

Avoid the splat by bailing out early when the new event is its own group_leader.

Fixes: 382c27f4ed28f803 ("perf: Fix perf_event_validate_size()")
Closes: https://lore.kernel.org/lkml/20231214000620.3081018-1-lucas.demarchi@intel.com/
Closes: https://lore.kernel.org/lkml/ZXpm6gQ%2Fd59jGsuW@xpf.sh.intel.com/
Reported-by: Lucas De Marchi
Reported-by: Pengfei Xu
Signed-off-by: Mark Rutland
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20231215112450.3972309-1-mark.rutland@arm.com
---
 kernel/events/core.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c9d123e13b57..9efd0d7775e7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1947,6 +1947,16 @@ static bool perf_event_validate_size(struct perf_event *event)
 			   group_leader->nr_siblings + 1) > 16*1024)
 		return false;

+	/*
+	 * When creating a new group leader, group_leader->ctx is initialized
+	 * after the size has been validated, but we cannot safely use
+	 * for_each_sibling_event() until group_leader->ctx is set. A new group
+	 * leader cannot have any siblings yet, so we can safely skip checking
+	 * the non-existent siblings.
+	 */
+	if (event == group_leader)
+		return true;
+
 	for_each_sibling_event(sibling, group_leader) {
 		if (__perf_event_read_size(sibling->attr.read_format,
 					   group_leader->nr_siblings + 1) > 16*1024)
--
cgit

From 0c4cae1bc00d31c78858c184ede351baea232bdb Mon Sep 17 00:00:00 2001
From: Chris Feng
Date: Wed, 13 Dec 2023 16:32:51 +0800
Subject: PM: hibernate: Avoid missing wakeup events during hibernation

Wakeup events that occur in the hibernation process's hibernation_platform_enter() cannot wake up the system. Although the current hibernation framework will execute part of the recovery process after a wakeup event occurs, it ultimately performs a shutdown operation because the system does not check the return value of hibernation_platform_enter(). In short, if a wakeup event occurs before putting the system into the final low-power state, it will be missed.

To solve this problem, check the return value of hibernation_platform_enter(). When it returns -EAGAIN or -EBUSY (indicating that a wakeup event occurred), execute the hibernation recovery process, discard the previously saved image, and ultimately return to the working state.

Signed-off-by: Chris Feng
[ rjw: Rephrase the message printed when going back to the working state ]
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/hibernate.c | 10 ++++++++--
 kernel/power/power.h | 2 ++
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index dee341ae4ace..4b0b7cf2e019 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -642,9 +642,9 @@ int hibernation_platform_enter(void)
  */
 static void power_down(void)
 {
-#ifdef CONFIG_SUSPEND
 	int error;

+#ifdef CONFIG_SUSPEND
 	if (hibernation_mode == HIBERNATION_SUSPEND) {
 		error = suspend_devices_and_enter(mem_sleep_current);
 		if (error) {
@@ -667,7 +667,13 @@ static void power_down(void)
 		kernel_restart(NULL);
 		break;
 	case HIBERNATION_PLATFORM:
-		hibernation_platform_enter();
+		error = hibernation_platform_enter();
+		if (error == -EAGAIN || error == -EBUSY) {
+			swsusp_unmark();
+			events_check_enabled = false;
+			pr_info("Wakeup event detected during hibernation, rolling back.\n");
+			return;
+		}
 		fallthrough;
 	case HIBERNATION_SHUTDOWN:
 		if (kernel_can_power_off())

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 17fd9aaaf084..8499a39c62f4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -175,6 +175,8 @@ extern int swsusp_write(unsigned int flags);
 void swsusp_close(void);
 #ifdef CONFIG_SUSPEND
 extern int swsusp_unmark(void);
+#else
+static inline int swsusp_unmark(void) { return 0; }
 #endif

 struct __kernel_old_timeval;
--
cgit

From 71cd7e80cfde548959952eac7063aeaea1f2e1c6 Mon Sep 17 00:00:00 2001
From: Hongchen Zhang
Date: Thu, 16 Nov 2023 08:56:09 +0800
Subject: PM: hibernate: Enforce ordering during image compression/decompression

An S4 (suspend to disk) test on the LoongArch 3A6000 platform sometimes fails with the following error message in the dmesg log:

  Invalid LZO compressed length

That happens because when compressing/decompressing the image, the synchronization between the control thread and the compress/decompress/crc thread is based on a relaxed ordering interface, which is unreliable, and the following situation may occur:

  CPU 0                                 CPU 1
  save_image_lzo                        lzo_compress_threadfn
                                          atomic_set(&d->stop, 1);
    atomic_read(&data[thr].stop)
    data[thr].cmp = data[thr].cmp_len;
                                          WRITE data[thr].cmp_len

Then CPU 0 gets a stale cmp_len and writes it to disk. During resume from S4, the wrong cmp_len is loaded.

To maintain data consistency between the two threads, use the acquire/release variants of the atomic set and read operations.

Fixes: 081a9d043c98 ("PM / Hibernate: Improve performance of LZO/plain hibernation, checksum image")
Cc: All applicable
Signed-off-by: Hongchen Zhang
Co-developed-by: Weihao Li
Signed-off-by: Weihao Li
[ rjw: Subject rewrite and changelog edits ]
Signed-off-by: Rafael J.
Wysocki --- kernel/power/swap.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 68973ca2cf07..975e7195573b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -606,11 +606,11 @@ static int crc32_threadfn(void *data) unsigned i; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -619,7 +619,7 @@ static int crc32_threadfn(void *data) for (i = 0; i < d->run_threads; i++) *d->crc32 = crc32_le(*d->crc32, d->unc[i], *d->unc_len[i]); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; @@ -649,12 +649,12 @@ static int lzo_compress_threadfn(void *data) struct cmp_data *d = data; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -663,7 +663,7 @@ static int lzo_compress_threadfn(void *data) d->ret = lzo1x_1_compress(d->unc, d->unc_len, d->cmp + LZO_HEADER, &d->cmp_len, d->wrk); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; @@ -798,7 +798,7 @@ static int save_image_lzo(struct swap_map_handle *handle, data[thr].unc_len = off; - atomic_set(&data[thr].ready, 1); + atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } @@ -806,12 +806,12 @@ static int save_image_lzo(struct swap_map_handle *handle, break; crc->run_threads = thr; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, - atomic_read(&data[thr].stop)); + atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; @@ -850,7 +850,7 @@ static int save_image_lzo(struct swap_map_handle *handle, } } - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } @@ -1132,12 +1132,12 @@ static int lzo_decompress_threadfn(void *data) struct dec_data *d = data; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -1150,7 +1150,7 @@ static int lzo_decompress_threadfn(void *data) flush_icache_range((unsigned long)d->unc, (unsigned long)d->unc + d->unc_len); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; @@ -1335,7 +1335,7 @@ static int load_image_lzo(struct swap_map_handle *handle, } if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); crc->run_threads = 0; } @@ -1371,7 +1371,7 @@ static int load_image_lzo(struct swap_map_handle *handle, pg = 0; } - atomic_set(&data[thr].ready, 1); + atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } @@ -1390,7 +1390,7 @@ static int load_image_lzo(struct swap_map_handle *handle, for (run_threads = thr, thr = 0; thr < 
run_threads; thr++) {
 		wait_event(data[thr].done,
-			   atomic_read(&data[thr].stop));
+			   atomic_read_acquire(&data[thr].stop));
 		atomic_set(&data[thr].stop, 0);

 		ret = data[thr].ret;
@@ -1421,7 +1421,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 			ret = snapshot_write_next(snapshot);
 			if (ret <= 0) {
 				crc->run_threads = thr + 1;
-				atomic_set(&crc->ready, 1);
+				atomic_set_release(&crc->ready, 1);
 				wake_up(&crc->go);
 				goto out_finish;
 			}
@@ -1429,13 +1429,13 @@ }

 		crc->run_threads = thr;
-		atomic_set(&crc->ready, 1);
+		atomic_set_release(&crc->ready, 1);
 		wake_up(&crc->go);
 	}

out_finish:
 	if (crc->run_threads) {
-		wait_event(crc->done, atomic_read(&crc->stop));
+		wait_event(crc->done, atomic_read_acquire(&crc->stop));
 		atomic_set(&crc->stop, 0);
 	}
 	stop = ktime_get();
--
cgit

From dd939425707898da992e59ab0fcfae4652546910 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Thu, 14 Dec 2023 22:29:21 -0500
Subject: ring-buffer: Do not try to put back write_stamp

If an update to an event is interrupted by another event between the time the initial event allocated its buffer and where it wrote to the write_stamp, the code tries to reset the write stamp back to what it had just overwritten. It detects the overwrite by checking the before_stamp: if the before_stamp no longer matches what it wrote there before allocating its space, it knows it was overwritten. To put back the write_stamp, it uses the before_stamp it read.

The problem here is that by writing the before_stamp to the write_stamp it makes the two equal again, which means that the write_stamp can be considered valid as the last timestamp written to the ring buffer. But this is not necessarily true; the interrupting event could itself have been interrupted, and can end up leaving with an invalid write_stamp. If that happens and control returns to this context, which uses the before_stamp to update the write_stamp again, it can incorrectly make the write_stamp look valid, causing later events to have incorrect time stamps.

As it is OK to leave this function with an invalid write_stamp (one that doesn't match the before_stamp), there's no reason to try to make it valid again in this case. If this race happens, just leave with the invalid write_stamp, and the next event to come along will simply add an absolute timestamp and validate everything again.

Bonus points: This gets rid of another cmpxchg64!
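To make the before_stamp/write_stamp handshake concrete, here is a deliberately condensed single-writer-with-nesting model. This is illustrative only: the real code uses rb_time_t/local64 operations and handles many more cases, and the names below merely echo the kernel ones.

#include <stdbool.h>
#include <stdint.h>

static uint64_t before_stamp, write_stamp;

/* steps A and D of the reserve path, heavily condensed */
static void reserve_event(uint64_t ts)
{
	before_stamp = ts;	/* A: announce the write              */
	/* a nested event may reserve and commit in this window,
	 * leaving before_stamp != write_stamp behind               */
	write_stamp = ts;	/* D: publish the committed timestamp */
}

/* what the next writer checks before trusting write_stamp */
static bool stamps_consistent(void)
{
	/* on a mismatch the next event does not trust write_stamp and
	 * emits an absolute timestamp instead -- which is why trying
	 * to "repair" write_stamp after a detected interruption buys
	 * nothing */
	return before_stamp == write_stamp;
}

int main(void)
{
	reserve_event(100);
	return stamps_consistent() ? 0 : 1;
}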
Link: https://lore.kernel.org/linux-trace-kernel/20231214222921.193037a7@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Joel Fernandes Cc: Vincent Donnefort Fixes: a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d9caee7f542..2668dde23343 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3612,14 +3612,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, } if (likely(tail == w)) { - u64 save_before; - bool s_ok; - /* Nothing interrupted us between A and C */ /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts); - barrier(); - /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before); - RB_WARN_ON(cpu_buffer, !s_ok); + /* + * If something came in between C and D, the write stamp + * may now not be in sync. But that's fine as the before_stamp + * will be different and then next event will just be forced + * to use an absolute timestamp. + */ if (likely(!(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ @@ -3627,24 +3627,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, else /* Just use full timestamp for interrupting event */ info->delta = info->ts; - barrier(); check_buffer(cpu_buffer, info, tail); - if (unlikely(info->ts != save_before)) { - /* SLOW PATH - Interrupted between C and E */ - - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - RB_WARN_ON(cpu_buffer, !a_ok); - - /* Write stamp must only go forward */ - if (save_before > info->after) { - /* - * We do not care about the result, only that - * it gets updated atomically. - */ - (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, - info->after, save_before); - } - } } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ -- cgit From 083e9f65bd215582bf8f6a920db729fadf16704f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 15 Dec 2023 08:18:10 -0500 Subject: ring-buffer: Remove useless update to write_stamp in rb_try_to_discard() When filtering is enabled, a temporary buffer is created to place the content of the trace event output so that the filter logic can decide from the trace event output if the trace event should be filtered out or not. If it is to be filtered out, the content in the temporary buffer is simply discarded, otherwise it is written into the trace buffer. But if an interrupt were to come in while a previous event was using that temporary buffer, the event written by the interrupt would actually go into the ring buffer itself to prevent corrupting the data on the temporary buffer. If the event is to be filtered out, the event in the ring buffer is discarded, or if it fails to discard because another event were to have already come in, it is turned into padding. The update to the write_stamp in the rb_try_to_discard() happens after a fix was made to force the next event after the discard to use an absolute timestamp by setting the before_stamp to zero so it does not match the write_stamp (which causes an event to use the absolute timestamp). But there's an effort in rb_try_to_discard() to put back the write_stamp to what it was before the event was added. 
But this is useless and wasteful because nothing is going to be using that write_stamp for calculations as it still will not match the before_stamp. Remove this useless update, and in doing so, we remove another cmpxchg64()! Also update the comments to reflect this change as well as remove some extra white space in another comment. Link: https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorschach.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Joel Fernandes Cc: Vincent Donnefort Fixes: b2dd797543cf ("ring-buffer: Force absolute timestamp on discard of event") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 47 +++++++++++----------------------------------- 1 file changed, 11 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2668dde23343..ad4af0cba159 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2983,25 +2983,6 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } -static u64 rb_time_delta(struct ring_buffer_event *event) -{ - switch (event->type_len) { - case RINGBUF_TYPE_PADDING: - return 0; - - case RINGBUF_TYPE_TIME_EXTEND: - return rb_event_time_stamp(event); - - case RINGBUF_TYPE_TIME_STAMP: - return 0; - - case RINGBUF_TYPE_DATA: - return event->time_delta; - default: - return 0; - } -} - static inline bool rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) @@ -3009,8 +2990,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, unsigned long new_index, old_index; struct buffer_page *bpage; unsigned long addr; - u64 write_stamp; - u64 delta; new_index = rb_event_index(event); old_index = new_index + rb_event_ts_length(event); @@ -3019,14 +2998,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, bpage = READ_ONCE(cpu_buffer->tail_page); - delta = rb_time_delta(event); - - if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp)) - return false; - - /* Make sure the write stamp is read before testing the location */ - barrier(); - + /* + * Make sure the tail_page is still the same and + * the next write location is the end of this event + */ if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; @@ -3037,20 +3012,20 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, * to make sure that the next event adds an absolute * value and does not rely on the saved write stamp, which * is now going to be bogus. + * + * By setting the before_stamp to zero, the next event + * is not going to use the write_stamp and will instead + * create an absolute timestamp. This means there's no + * reason to update the wirte_stamp! */ rb_time_set(&cpu_buffer->before_stamp, 0); - /* Something came in, can't discard */ - if (!rb_time_cmpxchg(&cpu_buffer->write_stamp, - write_stamp, write_stamp - delta)) - return false; - /* * If an event were to come in now, it would see that the * write_stamp and the before_stamp are different, and assume * that this event just added itself before updating * the write stamp. The interrupting event will fix the - * write stamp for us, and use the before stamp as its delta. + * write stamp for us, and use an absolute timestamp. 
*/ /* @@ -3487,7 +3462,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, return; /* - * If this interrupted another event, + * If this interrupted another event, */ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) goto out; -- cgit From fff88fa0fbc7067ba46dde570912d63da42c59a9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Dec 2023 11:53:01 -0500 Subject: ring-buffer: Fix a race in rb_time_cmpxchg() for 32 bit archs Mathieu Desnoyers pointed out an issue in the rb_time_cmpxchg() for 32 bit architectures. That is: static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) { unsigned long cnt, top, bottom, msb; unsigned long cnt2, top2, bottom2, msb2; u64 val; /* The cmpxchg always fails if it interrupted an update */ if (!__rb_time_read(t, &val, &cnt2)) return false; if (val != expect) return false; <<<< interrupted here! cnt = local_read(&t->cnt); The problem is that the synchronization counter in the rb_time_t is read *after* the value of the timestamp is read. That means if an interrupt were to come in between the value being read and the counter being read, it can change the value and the counter and the interrupted process would be clueless about it! The counter needs to be read first and then the value. That way it is easy to tell if the value is stale or not. If the counter hasn't been updated, then the value is still good. Link: https://lore.kernel.org/linux-trace-kernel/20231211201324.652870-1-mathieu.desnoyers@efficios.com/ Link: https://lore.kernel.org/linux-trace-kernel/20231212115301.7a9c9a64@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Fixes: 10464b4aa605e ("ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit") Reported-by: Mathieu Desnoyers Reviewed-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ad4af0cba159..b8ab0557bd1b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -706,6 +706,9 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) unsigned long cnt2, top2, bottom2, msb2; u64 val; + /* Any interruptions in this function should cause a failure */ + cnt = local_read(&t->cnt); + /* The cmpxchg always fails if it interrupted an update */ if (!__rb_time_read(t, &val, &cnt2)) return false; @@ -713,7 +716,6 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) if (val != expect) return false; - cnt = local_read(&t->cnt); if ((cnt & 3) != cnt2) return false; -- cgit From dec890089bf79a4954b61482715ee2d084364856 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 12 Dec 2023 14:30:49 -0500 Subject: ring-buffer: Fix 32-bit rb_time_read() race with rb_time_cmpxchg() The following race can cause rb_time_read() to observe a corrupted time stamp: rb_time_cmpxchg() [...] if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) return false; if (!rb_time_read_cmpxchg(&t->top, top, top2)) return false; __rb_time_read() [...] do { c = local_read(&t->cnt); top = local_read(&t->top); bottom = local_read(&t->bottom); msb = local_read(&t->msb); } while (c != local_read(&t->cnt)); *cnt = rb_time_cnt(top); /* If top and msb counts don't match, this interrupted a write */ if (*cnt != rb_time_cnt(msb)) return false; ^ this check fails to catch that "bottom" is still not updated. So the old "bottom" value is returned, which is wrong. 
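Schematically, with each of msb/top/bottom carrying a 2-bit sequence count in its upper bits, one illustrative interleaving (not the only possible one) looks like this:

  writer: rb_time_cmpxchg()  (count N)   reader: __rb_time_read()
  update msb    (count -> N)
  update top    (count -> N)
                                         read top     -> count N
                                         read msb     -> count N
                                         read bottom  -> stale value, count N-1
                                         cnt(top) == cnt(msb) -> accepted!
  update bottom (count -> N)

  /* the strengthened check also demands a matching bottom count,
   * rejecting the torn read above */
  if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
          return false;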
Fix this by checking that all three of msb, top, and bottom 2-bit cnt values match. The reason to favor checking all three fields over requiring a specific update order for both rb_time_set() and rb_time_cmpxchg() is because checking all three fields is more robust to handle partial failures of rb_time_cmpxchg() when interrupted by nested rb_time_set(). Link: https://lore.kernel.org/lkml/20231211201324.652870-1-mathieu.desnoyers@efficios.com/ Link: https://lore.kernel.org/linux-trace-kernel/20231212193049.680122-1-mathieu.desnoyers@efficios.com Fixes: f458a1453424e ("ring-buffer: Test last update in 32bit version of __rb_time_read()") Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b8ab0557bd1b..f22a849da179 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -644,8 +644,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) *cnt = rb_time_cnt(top); - /* If top and msb counts don't match, this interrupted a write */ - if (*cnt != rb_time_cnt(msb)) + /* If top, msb or bottom counts don't match, this interrupted a write */ + if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom)) return false; /* The shift to msb will lose its cnt bits */ -- cgit From 0aa0e5289cfe984a8a9fdd79ccf46ccf080151f7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 15 Dec 2023 08:41:14 -0500 Subject: ring-buffer: Have rb_time_cmpxchg() set the msb counter too The rb_time_cmpxchg() on 32-bit architectures requires setting three 32-bit words to represent the 64-bit timestamp, with some salt for synchronization. Those are: msb, top, and bottom The issue is, the rb_time_cmpxchg() did not properly salt the msb portion, and the msb that was written was stale. Link: https://lore.kernel.org/linux-trace-kernel/20231215084114.20899342@rorschach.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Fixes: f03f2abce4f39 ("ring-buffer: Have 32 bit time stamps use all 64 bits") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f22a849da179..f4679013289b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -722,10 +722,12 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) cnt2 = cnt + 1; rb_time_split(val, &top, &bottom, &msb); + msb = rb_time_val_cnt(msb, cnt); top = rb_time_val_cnt(top, cnt); bottom = rb_time_val_cnt(bottom, cnt); rb_time_split(set, &top2, &bottom2, &msb2); + msb2 = rb_time_val_cnt(msb2, cnt); top2 = rb_time_val_cnt(top2, cnt2); bottom2 = rb_time_val_cnt(bottom2, cnt2); -- cgit From 712292308af2265cd9b126aedfa987f10f452a33 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 13 Dec 2023 17:54:03 -0500 Subject: ring-buffer: Do not record in NMI if the arch does not support cmpxchg in NMI As the ring buffer recording requires cmpxchg() to work, if the architecture does not support cmpxchg in NMI, then do not do any recording within an NMI. 
Link: https://lore.kernel.org/linux-trace-kernel/20231213175403.6fc18540@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f4679013289b..5a114e752f11 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3674,6 +3674,12 @@ rb_reserve_next_event(struct trace_buffer *buffer, int nr_loops = 0; int add_ts_default; + /* ring buffer does cmpxchg, make sure it is safe in NMI context */ + if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && + (unlikely(in_nmi()))) { + return NULL; + } + rb_start_commit(cpu_buffer); /* The commit page can not change after this */ -- cgit From fe3de0102bc830e78d80dbf6e2dd29c520d68575 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 8 Dec 2023 10:33:09 +0100 Subject: kernel/cgroup: use kernfs_create_dir_ns() By passing the fsugid to kernfs_create_dir_ns(), we don't need cgroup_kn_set_ugid() any longer. That function was added for exactly this purpose by commit 49957f8e2a43 ("cgroup: newly created dirs and files should be owned by the creator"). Eliminating this piece of duplicate code means we benefit from future improvements to kernfs_create_dir_ns(); for example, both are lacking S_ISGID support currently, which my next patch will add to kernfs_create_dir_ns(). It cannot (easily) be added to cgroup_kn_set_ugid() because we can't dereference struct kernfs_iattrs from there. -- v1 -> v2: 12-digit commit id Signed-off-by: Max Kellermann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20231208093310.297233-1-max.kellermann@ionos.com Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup/cgroup.c | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4b9ff41ca603..a844b421fd83 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4169,20 +4169,6 @@ static struct kernfs_ops cgroup_kf_ops = { .seq_show = cgroup_seqfile_show, }; -/* set uid and gid of cgroup dirs and files to that of the creator */ -static int cgroup_kn_set_ugid(struct kernfs_node *kn) -{ - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = current_fsuid(), - .ia_gid = current_fsgid(), }; - - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) - return 0; - - return kernfs_setattr(kn, &iattr); -} - static void cgroup_file_notify_timer(struct timer_list *timer) { cgroup_file_notify(container_of(timer, struct cgroup_file, @@ -4195,25 +4181,18 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; - int ret; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_fsuid(), current_fsgid(), 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); - ret = cgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; @@ -5616,7 +5595,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, goto out_cancel_ref; /* create the directory */ - kn = 
kernfs_create_dir(parent->kn, name, mode, cgrp); + kn = kernfs_create_dir_ns(parent->kn, name, mode, + current_fsuid(), current_fsgid(), + cgrp, NULL); if (IS_ERR(kn)) { ret = PTR_ERR(kn); goto out_stat_exit; @@ -5761,10 +5742,6 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) */ kernfs_get(cgrp->kn); - ret = cgroup_kn_set_ugid(cgrp->kn); - if (ret) - goto out_destroy; - ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; -- cgit From ff6d413b0b59466e5acf2e42f294b1842ae130a1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Dec 2023 13:17:40 -0800 Subject: kernfs: Convert kernfs_path_from_node_locked() from strlcpy() to strscpy() One of the last remaining users of strlcpy() in the kernel is kernfs_path_from_node_locked(), which passes back the problematic "length we _would_ have copied" return value to indicate truncation. Convert the chain of all callers to use the negative return value (some of which already doing this explicitly). All callers were already also checking for negative return values, so the risk to missed checks looks very low. In this analysis, it was found that cgroup1_release_agent() actually didn't handle the "too large" condition, so this is technically also a bug fix. :) Here's the chain of callers, and resolution identifying each one as now handling the correct return value: kernfs_path_from_node_locked() kernfs_path_from_node() pr_cont_kernfs_path() returns void kernfs_path() sysfs_warn_dup() return value ignored cgroup_path() blkg_path() bfq_bic_update_cgroup() return value ignored TRACE_IOCG_PATH() return value ignored TRACE_CGROUP_PATH() return value ignored perf_event_cgroup() return value ignored task_group_path() return value ignored damon_sysfs_memcg_path_eq() return value ignored get_mm_memcg_path() return value ignored lru_gen_seq_show() return value ignored cgroup_path_from_kernfs_id() return value ignored cgroup_show_path() already converted "too large" error to negative value cgroup_path_ns_locked() cgroup_path_ns() bpf_iter_cgroup_show_fdinfo() return value ignored cgroup1_release_agent() wasn't checking "too large" error proc_cgroup_show() already converted "too large" to negative value Cc: Greg Kroah-Hartman Cc: Tejun Heo Cc: Zefan Li Cc: Johannes Weiner Cc: Waiman Long Cc: Co-developed-by: Azeem Shaikh Signed-off-by: Azeem Shaikh Link: https://lore.kernel.org/r/20231116192127.1558276-3-keescook@chromium.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20231212211741.164376-3-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 34 +++++++++++++++++----------------- kernel/cgroup/cgroup-v1.c | 2 +- kernel/cgroup/cgroup.c | 4 ++-- kernel/cgroup/cpuset.c | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index d23bf7883b08..bce1d7ac95ca 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -127,7 +127,7 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, * * [3] when @kn_to is %NULL result will be "(null)" * - * Return: the length of the full path. If the full length is equal to or + * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. 
*/ @@ -138,16 +138,17 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn, *common; const char parent_str[] = "/.."; size_t depth_from, depth_to, len = 0; + ssize_t copied; int i, j; if (!kn_to) - return strlcpy(buf, "(null)", buflen); + return strscpy(buf, "(null)", buflen); if (!kn_from) kn_from = kernfs_root(kn_to)->kn; if (kn_from == kn_to) - return strlcpy(buf, "/", buflen); + return strscpy(buf, "/", buflen); common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) @@ -158,18 +159,19 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, buf[0] = '\0'; - for (i = 0; i < depth_from; i++) - len += strlcpy(buf + len, parent_str, - len < buflen ? buflen - len : 0); + for (i = 0; i < depth_from; i++) { + copied = strscpy(buf + len, parent_str, buflen - len); + if (copied < 0) + return copied; + len += copied; + } /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { for (kn = kn_to, j = 0; j < i; j++) kn = kn->parent; - len += strlcpy(buf + len, "/", - len < buflen ? buflen - len : 0); - len += strlcpy(buf + len, kn->name, - len < buflen ? buflen - len : 0); + + len += scnprintf(buf + len, buflen - len, "/%s", kn->name); } return len; @@ -214,7 +216,7 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) * path (which includes '..'s) as needed to reach from @from to @to is * returned. * - * Return: the length of the full path. If the full length is equal to or + * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ @@ -265,12 +267,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn) sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); if (sz < 0) { - pr_cont("(error)"); - goto out; - } - - if (sz >= sizeof(kernfs_pr_cont_buf)) { - pr_cont("(name too long)"); + if (sz == -E2BIG) + pr_cont("(name too long)"); + else + pr_cont("(error)"); goto out; } diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 76db6c67e39a..9cb00ebe9ac6 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -802,7 +802,7 @@ void cgroup1_release_agent(struct work_struct *work) goto out_free; ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - if (ret < 0 || ret >= PATH_MAX) + if (ret < 0) goto out_free; argv[0] = agentbuf; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a844b421fd83..4bc8183b669f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1893,7 +1893,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); spin_unlock_irq(&css_set_lock); - if (len >= PATH_MAX) + if (len == -E2BIG) len = -ERANGE; else if (len > 0) { seq_escape(sf, buf, " \t\n\\"); @@ -6278,7 +6278,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (retval >= PATH_MAX) + if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) goto out_unlock; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 615daaf87f1f..fb29158ae825 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4941,7 +4941,7 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, retval = 
cgroup_path_ns(css->cgroup, buf, PATH_MAX, current->nsproxy->cgroup_ns); css_put(css); - if (retval >= PATH_MAX) + if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) goto out_free; -- cgit From 8b2efe51ba85ca83460941672afac6fca4199df6 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 15 Dec 2023 18:07:04 +0800 Subject: bpf: Limit the number of uprobes when attaching program to multiple uprobes An abnormally big cnt may be passed to link_create.uprobe_multi.cnt, and it will trigger the following warning in kvmalloc_node(): if (unlikely(size > INT_MAX)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN)); return NULL; } Fix the warning by limiting the maximum number of uprobes in bpf_uprobe_multi_link_attach(). If the number of uprobes is greater than MAX_UPROBE_MULTI_CNT, the attachment will fail and return -E2BIG. Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Reported-by: Xingwei Lee Signed-off-by: Hou Tao Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Acked-by: Andrii Nakryiko Closes: https://lore.kernel.org/bpf/CABOYnLwwJY=yFAGie59LFsUsBAgHfroVqbzZ5edAXbFE3YiNVA@mail.gmail.com Link: https://lore.kernel.org/bpf/20231215100708.2265609-2-houtao@huaweicloud.com --- kernel/trace/bpf_trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 774cf476a892..75c05aea9fd9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -42,6 +42,8 @@ #define bpf_event_rcu_dereference(p) \ rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) +#define MAX_UPROBE_MULTI_CNT (1U << 20) + #ifdef CONFIG_MODULES struct bpf_trace_module { struct module *module; @@ -3344,6 +3346,8 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!upath || !uoffsets || !cnt) return -EINVAL; + if (cnt > MAX_UPROBE_MULTI_CNT) + return -E2BIG; uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies); -- cgit From d6d1e6c17cab2dcb7b8530c599f00e7de906d380 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 15 Dec 2023 18:07:05 +0800 Subject: bpf: Limit the number of kprobes when attaching program to multiple kprobes An abnormally big cnt may also be assigned to kprobe_multi.cnt when attaching multiple kprobes. It will trigger the following warning in kvmalloc_node(): if (unlikely(size > INT_MAX)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN)); return NULL; } Fix the warning by limiting the maximum number of kprobes in bpf_kprobe_multi_link_attach(). If the number of kprobes is greater than MAX_KPROBE_MULTI_CNT, the attachment will fail and return -E2BIG.
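For illustration, a minimal userspace sketch of how the new limit surfaces to callers (the prog_fd is assumed to refer to an already-loaded kprobe.multi program and addrs to a populated kernel-address array; both are placeholders, not part of this patch):

    #include <errno.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    /* Sketch: request a kprobe-multi link with a caller-chosen cnt. */
    static int kprobe_multi_attach(int prog_fd, const __u64 *addrs, __u32 cnt)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.link_create.prog_fd = prog_fd;
            attr.link_create.attach_type = BPF_TRACE_KPROBE_MULTI;
            attr.link_create.kprobe_multi.addrs = (__u64)(unsigned long)addrs;
            attr.link_create.kprobe_multi.cnt = cnt;

            /* Returns -1 with errno == E2BIG once cnt exceeds the limit. */
            return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
    }

With the limit in place, any cnt above MAX_KPROBE_MULTI_CNT fails fast in the syscall instead of reaching the oversized allocation and tripping the kvmalloc_node() warning.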
Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Signed-off-by: Hou Tao Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231215100708.2265609-3-houtao@huaweicloud.com --- kernel/trace/bpf_trace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 75c05aea9fd9..97c0c49c40a0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -43,6 +43,7 @@ rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) #define MAX_UPROBE_MULTI_CNT (1U << 20) +#define MAX_KPROBE_MULTI_CNT (1U << 20) #ifdef CONFIG_MODULES struct bpf_trace_module { @@ -2972,6 +2973,8 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr cnt = attr->link_create.kprobe_multi.cnt; if (!cnt) return -EINVAL; + if (cnt > MAX_KPROBE_MULTI_CNT) + return -E2BIG; size = cnt * sizeof(*addrs); addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); -- cgit From f8fa5d76925991976b3e7076f9d1052515ec1fca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Dec 2023 13:24:10 -0700 Subject: cred: switch to using atomic_long_t There are multiple ways to grab references to credentials, and the only protection we have against overflowing the reference count is the memory required to do so. With memory sizes only moving in one direction, let's bump the reference count to 64-bit and move it outside the realm of feasible overflow. Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- include/linux/cred.h | 8 +++---- kernel/cred.c | 64 ++++++++++++++++++++++++++-------------------------- 2 files changed, 36 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/cred.h b/include/linux/cred.h index af8d353a4b86..a3383f8efb8f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -109,7 +109,7 @@ static inline int groups_search(const struct group_info *group_info, kgid_t grp) * same context as task->real_cred.
*/ struct cred { - atomic_t usage; + atomic_long_t usage; #ifdef CONFIG_DEBUG_CREDENTIALS atomic_t subscribers; /* number of processes subscribed */ void *put_addr; @@ -229,7 +229,7 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) */ static inline struct cred *get_new_cred_many(struct cred *cred, int nr) { - atomic_add(nr, &cred->usage); + atomic_long_add(nr, &cred->usage); return cred; } @@ -288,7 +288,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred) struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return NULL; - if (!atomic_inc_not_zero(&nonconst_cred->usage)) + if (!atomic_long_inc_not_zero(&nonconst_cred->usage)) return NULL; validate_creds(cred); nonconst_cred->non_rcu = 0; @@ -313,7 +313,7 @@ static inline void put_cred_many(const struct cred *_cred, int nr) if (cred) { validate_creds(cred); - if (atomic_sub_and_test(nr, &cred->usage)) + if (atomic_long_sub_and_test(nr, &cred->usage)) __put_cred(cred); } } diff --git a/kernel/cred.c b/kernel/cred.c index 3c714cb31660..4a6cd0f0fef5 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -102,17 +102,17 @@ static void put_cred_rcu(struct rcu_head *rcu) #ifdef CONFIG_DEBUG_CREDENTIALS if (cred->magic != CRED_MAGIC_DEAD || - atomic_read(&cred->usage) != 0 || + atomic_long_read(&cred->usage) != 0 || read_cred_subscribers(cred) != 0) panic("CRED: put_cred_rcu() sees %p with" - " mag %x, put %p, usage %d, subscr %d\n", + " mag %x, put %p, usage %ld, subscr %d\n", cred, cred->magic, cred->put_addr, - atomic_read(&cred->usage), + atomic_long_read(&cred->usage), read_cred_subscribers(cred)); #else - if (atomic_read(&cred->usage) != 0) - panic("CRED: put_cred_rcu() sees %p with usage %d\n", - cred, atomic_read(&cred->usage)); + if (atomic_long_read(&cred->usage) != 0) + panic("CRED: put_cred_rcu() sees %p with usage %ld\n", + cred, atomic_long_read(&cred->usage)); #endif security_cred_free(cred); @@ -137,11 +137,11 @@ static void put_cred_rcu(struct rcu_head *rcu) */ void __put_cred(struct cred *cred) { - kdebug("__put_cred(%p{%d,%d})", cred, - atomic_read(&cred->usage), + kdebug("__put_cred(%p{%ld,%d})", cred, + atomic_long_read(&cred->usage), read_cred_subscribers(cred)); - BUG_ON(atomic_read(&cred->usage) != 0); + BUG_ON(atomic_long_read(&cred->usage) != 0); #ifdef CONFIG_DEBUG_CREDENTIALS BUG_ON(read_cred_subscribers(cred) != 0); cred->magic = CRED_MAGIC_DEAD; @@ -164,8 +164,8 @@ void exit_creds(struct task_struct *tsk) { struct cred *real_cred, *cred; - kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred, - atomic_read(&tsk->cred->usage), + kdebug("exit_creds(%u,%p,%p,{%ld,%d})", tsk->pid, tsk->real_cred, tsk->cred, + atomic_long_read(&tsk->cred->usage), read_cred_subscribers(tsk->cred)); real_cred = (struct cred *) tsk->real_cred; @@ -230,7 +230,7 @@ struct cred *cred_alloc_blank(void) if (!new) return NULL; - atomic_set(&new->usage, 1); + atomic_long_set(&new->usage, 1); #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; #endif @@ -276,7 +276,7 @@ struct cred *prepare_creds(void) memcpy(new, old, sizeof(struct cred)); new->non_rcu = 0; - atomic_set(&new->usage, 1); + atomic_long_set(&new->usage, 1); set_cred_subscribers(new, 0); get_group_info(new->group_info); get_uid(new->user); @@ -363,8 +363,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) ) { p->real_cred = get_cred_many(p->cred, 2); alter_cred_subscribers(p->cred, 2); - kdebug("share_creds(%p{%d,%d})", - p->cred, atomic_read(&p->cred->usage), + 
kdebug("share_creds(%p{%ld,%d})", + p->cred, atomic_long_read(&p->cred->usage), read_cred_subscribers(p->cred)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; @@ -457,8 +457,8 @@ int commit_creds(struct cred *new) struct task_struct *task = current; const struct cred *old = task->real_cred; - kdebug("commit_creds(%p{%d,%d})", new, - atomic_read(&new->usage), + kdebug("commit_creds(%p{%ld,%d})", new, + atomic_long_read(&new->usage), read_cred_subscribers(new)); BUG_ON(task->cred != old); @@ -467,7 +467,7 @@ int commit_creds(struct cred *new) validate_creds(old); validate_creds(new); #endif - BUG_ON(atomic_read(&new->usage) < 1); + BUG_ON(atomic_long_read(&new->usage) < 1); get_cred(new); /* we will require a ref for the subj creds too */ @@ -539,14 +539,14 @@ EXPORT_SYMBOL(commit_creds); */ void abort_creds(struct cred *new) { - kdebug("abort_creds(%p{%d,%d})", new, - atomic_read(&new->usage), + kdebug("abort_creds(%p{%ld,%d})", new, + atomic_long_read(&new->usage), read_cred_subscribers(new)); #ifdef CONFIG_DEBUG_CREDENTIALS BUG_ON(read_cred_subscribers(new) != 0); #endif - BUG_ON(atomic_read(&new->usage) < 1); + BUG_ON(atomic_long_read(&new->usage) < 1); put_cred(new); } EXPORT_SYMBOL(abort_creds); @@ -562,8 +562,8 @@ const struct cred *override_creds(const struct cred *new) { const struct cred *old = current->cred; - kdebug("override_creds(%p{%d,%d})", new, - atomic_read(&new->usage), + kdebug("override_creds(%p{%ld,%d})", new, + atomic_long_read(&new->usage), read_cred_subscribers(new)); validate_creds(old); @@ -585,8 +585,8 @@ const struct cred *override_creds(const struct cred *new) rcu_assign_pointer(current->cred, new); alter_cred_subscribers(old, -1); - kdebug("override_creds() = %p{%d,%d}", old, - atomic_read(&old->usage), + kdebug("override_creds() = %p{%ld,%d}", old, + atomic_long_read(&old->usage), read_cred_subscribers(old)); return old; } @@ -603,8 +603,8 @@ void revert_creds(const struct cred *old) { const struct cred *override = current->cred; - kdebug("revert_creds(%p{%d,%d})", old, - atomic_read(&old->usage), + kdebug("revert_creds(%p{%ld,%d})", old, + atomic_long_read(&old->usage), read_cred_subscribers(old)); validate_creds(old); @@ -735,7 +735,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) *new = *old; new->non_rcu = 0; - atomic_set(&new->usage, 1); + atomic_long_set(&new->usage, 1); set_cred_subscribers(new, 0); get_uid(new->user); get_user_ns(new->user_ns); @@ -849,8 +849,8 @@ static void dump_invalid_creds(const struct cred *cred, const char *label, cred == tsk->cred ? "[eff]" : ""); pr_err("->magic=%x, put_addr=%p\n", cred->magic, cred->put_addr); - pr_err("->usage=%d, subscr=%d\n", - atomic_read(&cred->usage), + pr_err("->usage=%ld, subscr=%d\n", + atomic_long_read(&cred->usage), read_cred_subscribers(cred)); pr_err("->*uid = { %d,%d,%d,%d }\n", from_kuid_munged(&init_user_ns, cred->uid), @@ -922,9 +922,9 @@ EXPORT_SYMBOL(__validate_process_creds); */ void validate_creds_for_do_exit(struct task_struct *tsk) { - kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})", + kdebug("validate_creds_for_do_exit(%p,%p{%ld,%d})", tsk->real_cred, tsk->cred, - atomic_read(&tsk->cred->usage), + atomic_long_read(&tsk->cred->usage), read_cred_subscribers(tsk->cred)); __validate_process_creds(tsk, __FILE__, __LINE__); -- cgit From ae1914174a63a558113e80d24ccac2773f9f7b2b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Dec 2023 13:40:57 -0700 Subject: cred: get rid of CONFIG_DEBUG_CREDENTIALS This code is rarely (never?) 
enabled by distros, and it hasn't caught anything in decades. Let's kill off this legacy debug code. Suggested-by: Linus Torvalds Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- arch/powerpc/configs/skiroot_defconfig | 1 - arch/s390/configs/debug_defconfig | 1 - fs/nfsd/auth.c | 4 - fs/nfsd/nfssvc.c | 1 - fs/nfsd/vfs.c | 9 +- fs/open.c | 3 - include/linux/cred.h | 50 ------- kernel/cred.c | 231 +++--------------------------- kernel/exit.c | 3 - lib/Kconfig.debug | 15 -- net/sunrpc/auth.c | 3 - security/selinux/hooks.c | 6 - tools/objtool/noreturns.h | 1 - tools/testing/selftests/bpf/config.x86_64 | 1 - tools/testing/selftests/hid/config.common | 1 - 15 files changed, 17 insertions(+), 313 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 8d3eacb50d56..9d44e6630908 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -301,7 +301,6 @@ CONFIG_WQ_WATCHDOG=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y CONFIG_BUG_ON_DATA_CORRUPTION=y -CONFIG_DEBUG_CREDENTIALS=y # CONFIG_FTRACE is not set CONFIG_XMON=y # CONFIG_RUNTIME_TESTING_MENU is not set diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 438cd92e6080..dd0608629310 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -834,7 +834,6 @@ CONFIG_DEBUG_IRQFLAGS=y CONFIG_DEBUG_LIST=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y -CONFIG_DEBUG_CREDENTIALS=y CONFIG_RCU_TORTURE_TEST=m CONFIG_RCU_REF_SCALE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=300 diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index fdf2aad73470..e6beaaf4f170 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -26,8 +26,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) int i; int flags = nfsexp_flags(rqstp, exp); - validate_process_creds(); - /* discard any old override before preparing the new set */ revert_creds(get_cred(current_real_cred())); new = prepare_creds(); @@ -81,10 +79,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); - validate_process_creds(); put_cred(override_creds(new)); put_cred(new); - validate_process_creds(); return 0; oom: diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index fe61d9bbcc1f..5014ab87d313 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -955,7 +955,6 @@ nfsd(void *vrqstp) rqstp->rq_server->sv_maxconn = nn->max_connections; svc_recv(rqstp); - validate_process_creds(); } atomic_dec(&nfsdstats.th_cnt); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index fbbea7498f02..e01e4e2acbd9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -901,7 +901,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int host_err; bool retried = false; - validate_process_creds(); /* * If we get here, then the client has already done an "open", * and (hopefully) checked permission - so allow OWNER_OVERRIDE @@ -926,7 +925,6 @@ retry: } err = nfserrno(host_err); } - validate_process_creds(); return err; } @@ -943,12 +941,7 @@ int nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags, struct file **filp) { - int err; - - validate_process_creds(); - err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp); - validate_process_creds(); - return err; + return __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp); } /* diff --git a/fs/open.c b/fs/open.c index 02dc608d40d8..3494a9cd8046 100644 --- a/fs/open.c +++ 
b/fs/open.c @@ -1088,8 +1088,6 @@ struct file *dentry_open(const struct path *path, int flags, int error; struct file *f; - validate_creds(cred); - /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); @@ -1128,7 +1126,6 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, struct file *f; int error; - validate_creds(cred); f = alloc_empty_file(flags, cred); if (IS_ERR(f)) return f; diff --git a/include/linux/cred.h b/include/linux/cred.h index a3383f8efb8f..2976f534a7a3 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -110,13 +110,6 @@ static inline int groups_search(const struct group_info *group_info, kgid_t grp) */ struct cred { atomic_long_t usage; -#ifdef CONFIG_DEBUG_CREDENTIALS - atomic_t subscribers; /* number of processes subscribed */ - void *put_addr; - unsigned magic; -#define CRED_MAGIC 0x43736564 -#define CRED_MAGIC_DEAD 0x44656144 -#endif kuid_t uid; /* real UID of the task */ kgid_t gid; /* real GID of the task */ kuid_t suid; /* saved UID of the task */ @@ -172,46 +165,6 @@ extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); extern int set_cred_ucounts(struct cred *); -/* - * check for validity of credentials - */ -#ifdef CONFIG_DEBUG_CREDENTIALS -extern void __noreturn __invalid_creds(const struct cred *, const char *, unsigned); -extern void __validate_process_creds(struct task_struct *, - const char *, unsigned); - -extern bool creds_are_invalid(const struct cred *cred); - -static inline void __validate_creds(const struct cred *cred, - const char *file, unsigned line) -{ - if (unlikely(creds_are_invalid(cred))) - __invalid_creds(cred, file, line); -} - -#define validate_creds(cred) \ -do { \ - __validate_creds((cred), __FILE__, __LINE__); \ -} while(0) - -#define validate_process_creds() \ -do { \ - __validate_process_creds(current, __FILE__, __LINE__); \ -} while(0) - -extern void validate_creds_for_do_exit(struct task_struct *); -#else -static inline void validate_creds(const struct cred *cred) -{ -} -static inline void validate_creds_for_do_exit(struct task_struct *tsk) -{ -} -static inline void validate_process_creds(void) -{ -} -#endif - static inline bool cap_ambient_invariant_ok(const struct cred *cred) { return cap_issubset(cred->cap_ambient, @@ -264,7 +217,6 @@ static inline const struct cred *get_cred_many(const struct cred *cred, int nr) struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return cred; - validate_creds(cred); nonconst_cred->non_rcu = 0; return get_new_cred_many(nonconst_cred, nr); } @@ -290,7 +242,6 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred) return NULL; if (!atomic_long_inc_not_zero(&nonconst_cred->usage)) return NULL; - validate_creds(cred); nonconst_cred->non_rcu = 0; return cred; } @@ -312,7 +263,6 @@ static inline void put_cred_many(const struct cred *_cred, int nr) struct cred *cred = (struct cred *) _cred; if (cred) { - validate_creds(cred); if (atomic_long_sub_and_test(nr, &cred->usage)) __put_cred(cred); } diff --git a/kernel/cred.c b/kernel/cred.c index 4a6cd0f0fef5..c033a201c808 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -43,10 +43,6 @@ static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; */ struct cred init_cred = { .usage = ATOMIC_INIT(4), -#ifdef CONFIG_DEBUG_CREDENTIALS - .subscribers = ATOMIC_INIT(2), - .magic = CRED_MAGIC, -#endif .uid = GLOBAL_ROOT_UID, .gid = GLOBAL_ROOT_GID, .suid = GLOBAL_ROOT_UID, @@ -66,31 +62,6 @@ struct cred init_cred = { .ucounts 
= &init_ucounts, }; -static inline void set_cred_subscribers(struct cred *cred, int n) -{ -#ifdef CONFIG_DEBUG_CREDENTIALS - atomic_set(&cred->subscribers, n); -#endif -} - -static inline int read_cred_subscribers(const struct cred *cred) -{ -#ifdef CONFIG_DEBUG_CREDENTIALS - return atomic_read(&cred->subscribers); -#else - return 0; -#endif -} - -static inline void alter_cred_subscribers(const struct cred *_cred, int n) -{ -#ifdef CONFIG_DEBUG_CREDENTIALS - struct cred *cred = (struct cred *) _cred; - - atomic_add(n, &cred->subscribers); -#endif -} - /* * The RCU callback to actually dispose of a set of credentials */ @@ -100,20 +71,9 @@ static void put_cred_rcu(struct rcu_head *rcu) kdebug("put_cred_rcu(%p)", cred); -#ifdef CONFIG_DEBUG_CREDENTIALS - if (cred->magic != CRED_MAGIC_DEAD || - atomic_long_read(&cred->usage) != 0 || - read_cred_subscribers(cred) != 0) - panic("CRED: put_cred_rcu() sees %p with" - " mag %x, put %p, usage %ld, subscr %d\n", - cred, cred->magic, cred->put_addr, - atomic_long_read(&cred->usage), - read_cred_subscribers(cred)); -#else if (atomic_long_read(&cred->usage) != 0) panic("CRED: put_cred_rcu() sees %p with usage %ld\n", cred, atomic_long_read(&cred->usage)); -#endif security_cred_free(cred); key_put(cred->session_keyring); @@ -137,16 +97,10 @@ static void put_cred_rcu(struct rcu_head *rcu) */ void __put_cred(struct cred *cred) { - kdebug("__put_cred(%p{%ld,%d})", cred, - atomic_long_read(&cred->usage), - read_cred_subscribers(cred)); + kdebug("__put_cred(%p{%ld})", cred, + atomic_long_read(&cred->usage)); BUG_ON(atomic_long_read(&cred->usage) != 0); -#ifdef CONFIG_DEBUG_CREDENTIALS - BUG_ON(read_cred_subscribers(cred) != 0); - cred->magic = CRED_MAGIC_DEAD; - cred->put_addr = __builtin_return_address(0); -#endif BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); @@ -164,9 +118,8 @@ void exit_creds(struct task_struct *tsk) { struct cred *real_cred, *cred; - kdebug("exit_creds(%u,%p,%p,{%ld,%d})", tsk->pid, tsk->real_cred, tsk->cred, - atomic_long_read(&tsk->cred->usage), - read_cred_subscribers(tsk->cred)); + kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred, + atomic_long_read(&tsk->cred->usage)); real_cred = (struct cred *) tsk->real_cred; tsk->real_cred = NULL; @@ -174,15 +127,10 @@ void exit_creds(struct task_struct *tsk) cred = (struct cred *) tsk->cred; tsk->cred = NULL; - validate_creds(cred); if (real_cred == cred) { - alter_cred_subscribers(cred, -2); put_cred_many(cred, 2); } else { - validate_creds(real_cred); - alter_cred_subscribers(real_cred, -1); put_cred(real_cred); - alter_cred_subscribers(cred, -1); put_cred(cred); } @@ -231,9 +179,6 @@ struct cred *cred_alloc_blank(void) return NULL; atomic_long_set(&new->usage, 1); -#ifdef CONFIG_DEBUG_CREDENTIALS - new->magic = CRED_MAGIC; -#endif if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; @@ -264,8 +209,6 @@ struct cred *prepare_creds(void) const struct cred *old; struct cred *new; - validate_process_creds(); - new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; @@ -277,7 +220,6 @@ struct cred *prepare_creds(void) new->non_rcu = 0; atomic_long_set(&new->usage, 1); - set_cred_subscribers(new, 0); get_group_info(new->group_info); get_uid(new->user); get_user_ns(new->user_ns); @@ -300,7 +242,6 @@ struct cred *prepare_creds(void) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; - validate_creds(new); return new; error: @@ -362,10 +303,8 @@ int copy_creds(struct task_struct *p, unsigned long 
clone_flags) clone_flags & CLONE_THREAD ) { p->real_cred = get_cred_many(p->cred, 2); - alter_cred_subscribers(p->cred, 2); - kdebug("share_creds(%p{%ld,%d})", - p->cred, atomic_long_read(&p->cred->usage), - read_cred_subscribers(p->cred)); + kdebug("share_creds(%p{%ld})", + p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; } @@ -404,8 +343,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); - alter_cred_subscribers(new, 2); - validate_creds(new); return 0; error_put: @@ -457,16 +394,10 @@ int commit_creds(struct cred *new) struct task_struct *task = current; const struct cred *old = task->real_cred; - kdebug("commit_creds(%p{%ld,%d})", new, - atomic_long_read(&new->usage), - read_cred_subscribers(new)); + kdebug("commit_creds(%p{%ld})", new, + atomic_long_read(&new->usage)); BUG_ON(task->cred != old); -#ifdef CONFIG_DEBUG_CREDENTIALS - BUG_ON(read_cred_subscribers(old) < 2); - validate_creds(old); - validate_creds(new); -#endif BUG_ON(atomic_long_read(&new->usage) < 1); get_cred(new); /* we will require a ref for the subj creds too */ @@ -502,14 +433,12 @@ int commit_creds(struct cred *new) * RLIMIT_NPROC limits on user->processes have already been checked * in set_user(). */ - alter_cred_subscribers(new, 2); if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); - alter_cred_subscribers(old, -2); /* send notifications */ if (!uid_eq(new->uid, old->uid) || @@ -539,13 +468,9 @@ EXPORT_SYMBOL(commit_creds); */ void abort_creds(struct cred *new) { - kdebug("abort_creds(%p{%ld,%d})", new, - atomic_long_read(&new->usage), - read_cred_subscribers(new)); + kdebug("abort_creds(%p{%ld})", new, + atomic_long_read(&new->usage)); -#ifdef CONFIG_DEBUG_CREDENTIALS - BUG_ON(read_cred_subscribers(new) != 0); -#endif BUG_ON(atomic_long_read(&new->usage) < 1); put_cred(new); } @@ -562,12 +487,8 @@ const struct cred *override_creds(const struct cred *new) { const struct cred *old = current->cred; - kdebug("override_creds(%p{%ld,%d})", new, - atomic_long_read(&new->usage), - read_cred_subscribers(new)); - - validate_creds(old); - validate_creds(new); + kdebug("override_creds(%p{%ld})", new, + atomic_long_read(&new->usage)); /* * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. @@ -576,18 +497,12 @@ const struct cred *override_creds(const struct cred *new) * we are only installing the cred into the thread-synchronous * '->cred' pointer, not the '->real_cred' pointer that is * visible to other threads under RCU. - * - * Also note that we did validate_creds() manually, not depending - * on the validation in 'get_cred()'. 
*/ get_new_cred((struct cred *)new); - alter_cred_subscribers(new, 1); rcu_assign_pointer(current->cred, new); - alter_cred_subscribers(old, -1); - kdebug("override_creds() = %p{%ld,%d}", old, - atomic_long_read(&old->usage), - read_cred_subscribers(old)); + kdebug("override_creds() = %p{%ld}", old, + atomic_long_read(&old->usage)); return old; } EXPORT_SYMBOL(override_creds); @@ -603,15 +518,10 @@ void revert_creds(const struct cred *old) { const struct cred *override = current->cred; - kdebug("revert_creds(%p{%ld,%d})", old, - atomic_long_read(&old->usage), - read_cred_subscribers(old)); + kdebug("revert_creds(%p{%ld})", old, + atomic_long_read(&old->usage)); - validate_creds(old); - validate_creds(override); - alter_cred_subscribers(old, 1); rcu_assign_pointer(current->cred, old); - alter_cred_subscribers(override, -1); put_cred(override); } EXPORT_SYMBOL(revert_creds); @@ -731,12 +641,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) kdebug("prepare_kernel_cred() alloc %p", new); old = get_task_cred(daemon); - validate_creds(old); *new = *old; new->non_rcu = 0; atomic_long_set(&new->usage, 1); - set_cred_subscribers(new, 0); get_uid(new->user); get_user_ns(new->user_ns); get_group_info(new->group_info); @@ -760,7 +668,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) goto error; put_cred(old); - validate_creds(new); return new; error: @@ -825,109 +732,3 @@ int set_create_files_as(struct cred *new, struct inode *inode) return security_kernel_create_files_as(new, inode); } EXPORT_SYMBOL(set_create_files_as); - -#ifdef CONFIG_DEBUG_CREDENTIALS - -bool creds_are_invalid(const struct cred *cred) -{ - if (cred->magic != CRED_MAGIC) - return true; - return false; -} -EXPORT_SYMBOL(creds_are_invalid); - -/* - * dump invalid credentials - */ -static void dump_invalid_creds(const struct cred *cred, const char *label, - const struct task_struct *tsk) -{ - pr_err("%s credentials: %p %s%s%s\n", - label, cred, - cred == &init_cred ? "[init]" : "", - cred == tsk->real_cred ? "[real]" : "", - cred == tsk->cred ? 
"[eff]" : ""); - pr_err("->magic=%x, put_addr=%p\n", - cred->magic, cred->put_addr); - pr_err("->usage=%ld, subscr=%d\n", - atomic_long_read(&cred->usage), - read_cred_subscribers(cred)); - pr_err("->*uid = { %d,%d,%d,%d }\n", - from_kuid_munged(&init_user_ns, cred->uid), - from_kuid_munged(&init_user_ns, cred->euid), - from_kuid_munged(&init_user_ns, cred->suid), - from_kuid_munged(&init_user_ns, cred->fsuid)); - pr_err("->*gid = { %d,%d,%d,%d }\n", - from_kgid_munged(&init_user_ns, cred->gid), - from_kgid_munged(&init_user_ns, cred->egid), - from_kgid_munged(&init_user_ns, cred->sgid), - from_kgid_munged(&init_user_ns, cred->fsgid)); -#ifdef CONFIG_SECURITY - pr_err("->security is %p\n", cred->security); - if ((unsigned long) cred->security >= PAGE_SIZE && - (((unsigned long) cred->security & 0xffffff00) != - (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))) - pr_err("->security {%x, %x}\n", - ((u32*)cred->security)[0], - ((u32*)cred->security)[1]); -#endif -} - -/* - * report use of invalid credentials - */ -void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line) -{ - pr_err("Invalid credentials\n"); - pr_err("At %s:%u\n", file, line); - dump_invalid_creds(cred, "Specified", current); - BUG(); -} -EXPORT_SYMBOL(__invalid_creds); - -/* - * check the credentials on a process - */ -void __validate_process_creds(struct task_struct *tsk, - const char *file, unsigned line) -{ - if (tsk->cred == tsk->real_cred) { - if (unlikely(read_cred_subscribers(tsk->cred) < 2 || - creds_are_invalid(tsk->cred))) - goto invalid_creds; - } else { - if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 || - read_cred_subscribers(tsk->cred) < 1 || - creds_are_invalid(tsk->real_cred) || - creds_are_invalid(tsk->cred))) - goto invalid_creds; - } - return; - -invalid_creds: - pr_err("Invalid process credentials\n"); - pr_err("At %s:%u\n", file, line); - - dump_invalid_creds(tsk->real_cred, "Real", tsk); - if (tsk->cred != tsk->real_cred) - dump_invalid_creds(tsk->cred, "Effective", tsk); - else - pr_err("Effective creds == Real creds\n"); - BUG(); -} -EXPORT_SYMBOL(__validate_process_creds); - -/* - * check creds for do_exit() - */ -void validate_creds_for_do_exit(struct task_struct *tsk) -{ - kdebug("validate_creds_for_do_exit(%p,%p{%ld,%d})", - tsk->real_cred, tsk->cred, - atomic_long_read(&tsk->cred->usage), - read_cred_subscribers(tsk->cred)); - - __validate_process_creds(tsk, __FILE__, __LINE__); -} - -#endif /* CONFIG_DEBUG_CREDENTIALS */ diff --git a/kernel/exit.c b/kernel/exit.c index ee9f43bed49a..aedc0832c9f4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -824,8 +824,6 @@ void __noreturn do_exit(long code) ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); - validate_creds_for_do_exit(tsk); - io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ @@ -909,7 +907,6 @@ void __noreturn do_exit(long code) if (tsk->task_frag.page) put_page(tsk->task_frag.page); - validate_creds_for_do_exit(tsk); exit_task_stack_account(tsk); check_stack_usage(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cc7d53d9dc01..4405f81248fb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1739,21 +1739,6 @@ config DEBUG_MAPLE_TREE endmenu -config DEBUG_CREDENTIALS - bool "Debug credential management" - depends on DEBUG_KERNEL - help - Enable this to turn on some debug checking for credential - management. 
The additional code keeps track of the number of - pointers from task_structs to any given cred struct, and checks to - see that this number never exceeds the usage count of the cred - struct. - - Furthermore, if SELinux is enabled, this also checks that the - security pointer in the cred struct is never seen to be invalid. - - If unsure, say N. - source "kernel/rcu/Kconfig.debug" config DEBUG_WQ_FORCE_RR_CPU diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 7bfe7d9a32aa..04534ea537c8 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -40,9 +40,6 @@ static unsigned long number_cred_unused; static struct cred machine_cred = { .usage = ATOMIC_INIT(1), -#ifdef CONFIG_DEBUG_CREDENTIALS - .magic = CRED_MAGIC, -#endif }; /* diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index feda711c6b7b..340b2bbbb2dd 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1660,8 +1660,6 @@ static int inode_has_perm(const struct cred *cred, struct inode_security_struct *isec; u32 sid; - validate_creds(cred); - if (unlikely(IS_PRIVATE(inode))) return 0; @@ -3056,8 +3054,6 @@ static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode, struct inode_security_struct *isec; u32 sid; - validate_creds(cred); - ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; sid = cred_sid(cred); @@ -3101,8 +3097,6 @@ static int selinux_inode_permission(struct inode *inode, int mask) if (!mask) return 0; - validate_creds(cred); - if (unlikely(IS_PRIVATE(inode))) return 0; diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 649ebdef9c3f..1685d7ea6a9f 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -6,7 +6,6 @@ * * Yes, this is unfortunate. A better solution is in the works. */ -NORETURN(__invalid_creds) NORETURN(__kunit_abort) NORETURN(__module_put_and_kthread_exit) NORETURN(__reiserfs_panic) diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64 index 2e70a6048278..49a29dbc1910 100644 --- a/tools/testing/selftests/bpf/config.x86_64 +++ b/tools/testing/selftests/bpf/config.x86_64 @@ -50,7 +50,6 @@ CONFIG_CRYPTO_SEQIV=y CONFIG_CRYPTO_XXHASH=y CONFIG_DCB=y CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_CREDENTIALS=y CONFIG_DEBUG_INFO_BTF=y CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_MEMORY_INIT=y diff --git a/tools/testing/selftests/hid/config.common b/tools/testing/selftests/hid/config.common index 0617275d93cc..0f456dbab62f 100644 --- a/tools/testing/selftests/hid/config.common +++ b/tools/testing/selftests/hid/config.common @@ -46,7 +46,6 @@ CONFIG_CRYPTO_SEQIV=y CONFIG_CRYPTO_XXHASH=y CONFIG_DCB=y CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_CREDENTIALS=y CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_DEFAULT_FQ_CODEL=y -- cgit From 4f9087f16651aca4a5f32da840a53f6660f0579a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:18 +0100 Subject: x86/cfi,bpf: Fix BPF JIT call The current BPF call convention is __nocfi, except when it calls !JIT things, in which case it calls regular C functions. It so happens that with FineIBT the __nocfi and C calling conventions are incompatible. Specifically, __nocfi will call at func+0, while FineIBT will have endbr-poison there, which is not a valid indirect target, causing #CP. Notably, this only triggers on IBT-enabled hardware, which is probably why this hasn't been reported (also, most people will have JIT on anyway).
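To make the convention mismatch concrete, here is a rough plain-C toy model (all names and the hash value are invented; the real mechanism is the JIT/asm work in this patch): a kCFI-style caller validates a type hash placed in front of the callee before making the indirect call, while a __nocfi-style caller jumps straight to the entry, so both sides must be generated with matching conventions.

    #include <stdint.h>
    #include <stdio.h>

    /* Toy model: a type hash sits in a preamble in front of the entry,
     * loosely mirroring the kCFI layout documented in this patch. */
    struct toy_func {
            uint32_t hash;          /* preamble: invented type hash */
            int (*entry)(void);     /* "func+0": the actual code */
    };

    static int payload(void)
    {
            return 42;
    }

    static struct toy_func prog = { .hash = 0x12345678, .entry = payload };

    /* kCFI-style caller: check the preamble, then call indirectly. */
    static int cfi_call(const struct toy_func *f, uint32_t expect)
    {
            if (f->hash != expect)
                    return -1;      /* kernel analogue: ud2, or #CP on IBT */
            return f->entry();
    }

    int main(void)
    {
            /* A __nocfi-style caller would just do prog.entry() with no
             * check; if caller and callee were generated for different
             * conventions, the indirect call lands on bytes that are not
             * a valid entry point -- the mismatch fixed below for BPF. */
            printf("%d\n", cfi_call(&prog, 0x12345678));
            return 0;
    }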
Implement proper CFI prologues for the BPF JIT codegen and drop __nocfi for x86. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.345270396@infradead.org Signed-off-by: Alexei Starovoitov --- arch/x86/include/asm/cfi.h | 110 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/alternative.c | 47 +++++++++++++++--- arch/x86/net/bpf_jit_comp.c | 82 +++++++++++++++++++++++++++++-- include/linux/bpf.h | 12 ++++- include/linux/cfi.h | 7 +++ kernel/bpf/core.c | 25 ++++++++++ 6 files changed, 269 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 2a494643089d..7a7b0b823a98 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -9,15 +9,125 @@ */ #include +/* + * An overview of the various calling conventions... + * + * Traditional: + * + * foo: + * ... code here ... + * ret + * + * direct caller: + * call foo + * + * indirect caller: + * lea foo(%rip), %r11 + * ... + * call *%r11 + * + * + * IBT: + * + * foo: + * endbr64 + * ... code here ... + * ret + * + * direct caller: + * call foo / call foo+4 + * + * indirect caller: + * lea foo(%rip), %r11 + * ... + * call *%r11 + * + * + * kCFI: + * + * __cfi_foo: + * movl $0x12345678, %eax + * # 11 nops when CONFIG_CALL_PADDING + * foo: + * endbr64 # when IBT + * ... code here ... + * ret + * + * direct call: + * call foo # / call foo+4 when IBT + * + * indirect call: + * lea foo(%rip), %r11 + * ... + * movl $(-0x12345678), %r10d + * addl -4(%r11), %r10d # -15 when CONFIG_CALL_PADDING + * jz 1f + * ud2 + * 1:call *%r11 + * + * + * FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE and runtime patches into): + * + * __cfi_foo: + * endbr64 + * subl 0x12345678, %r10d + * jz foo + * ud2 + * nop + * foo: + * osp nop3 # was endbr64 + * ... code here ... + * ret + * + * direct caller: + * call foo / call foo+4 + * + * indirect caller: + * lea foo(%rip), %r11 + * ... 
+ * movl $0x12345678, %r10d + * subl $16, %r11 + * nop4 + * call *%r11 + * + */ +enum cfi_mode { + CFI_DEFAULT, /* FineIBT if hardware has IBT, otherwise kCFI */ + CFI_OFF, /* Traditional / IBT depending on .config */ + CFI_KCFI, /* Optionally CALL_PADDING, IBT, RETPOLINE */ + CFI_FINEIBT, /* see arch/x86/kernel/alternative.c */ +}; + +extern enum cfi_mode cfi_mode; + struct pt_regs; #ifdef CONFIG_CFI_CLANG enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); +#define __bpfcall +extern u32 cfi_bpf_hash; + +static inline int cfi_get_offset(void) +{ + switch (cfi_mode) { + case CFI_FINEIBT: + return 16; + case CFI_KCFI: + if (IS_ENABLED(CONFIG_CALL_PADDING)) + return 16; + return 5; + default: + return 0; + } +} +#define cfi_get_offset cfi_get_offset + #else static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) { return BUG_TRAP_TYPE_NONE; } +#define cfi_bpf_hash 0U #endif /* CONFIG_CFI_CLANG */ #endif /* _ASM_X86_CFI_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 73be3931e4f0..d808d3aaec7e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -30,6 +30,7 @@ #include #include #include +#include int __read_mostly alternatives_patched; @@ -832,15 +833,43 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #endif /* CONFIG_X86_KERNEL_IBT */ #ifdef CONFIG_FINEIBT +#define __CFI_DEFAULT CFI_DEFAULT +#elif defined(CONFIG_CFI_CLANG) +#define __CFI_DEFAULT CFI_KCFI +#else +#define __CFI_DEFAULT CFI_OFF +#endif -enum cfi_mode { - CFI_DEFAULT, - CFI_OFF, - CFI_KCFI, - CFI_FINEIBT, -}; +enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; + +#ifdef CONFIG_CFI_CLANG +struct bpf_insn; + +/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */ +extern unsigned int __bpf_prog_runX(const void *ctx, + const struct bpf_insn *insn); + +/* + * Force a reference to the external symbol so the compiler generates + * __kcfi_typeid.
+ */ +__ADDRESSABLE(__bpf_prog_runX); + +/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */ +asm ( +" .pushsection .data..ro_after_init,\"aw\",@progbits \n" +" .type cfi_bpf_hash,@object \n" +" .globl cfi_bpf_hash \n" +" .p2align 2, 0x0 \n" +"cfi_bpf_hash: \n" +" .long __kcfi_typeid___bpf_prog_runX \n" +" .size cfi_bpf_hash, 4 \n" +" .popsection \n" +); +#endif + +#ifdef CONFIG_FINEIBT -static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT; static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; @@ -1149,8 +1178,10 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, goto err; if (cfi_rand) { - if (builtin) + if (builtin) { cfi_seed = get_random_u32(); + cfi_bpf_hash = cfi_rehash(cfi_bpf_hash); + } ret = cfi_rand_preamble(start_cfi, end_cfi); if (ret) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index af4a5de7d93a..5d5b967b111d 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -17,6 +17,7 @@ #include #include #include +#include static bool all_callee_regs_used[4] = {true, true, true, true}; @@ -51,9 +52,11 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0) #ifdef CONFIG_X86_KERNEL_IBT -#define EMIT_ENDBR() EMIT(gen_endbr(), 4) +#define EMIT_ENDBR() EMIT(gen_endbr(), 4) +#define EMIT_ENDBR_POISON() EMIT(gen_endbr_poison(), 4) #else #define EMIT_ENDBR() +#define EMIT_ENDBR_POISON() #endif static bool is_imm8(int value) @@ -304,6 +307,69 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) *pprog = prog; } +/* + * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT + * in arch/x86/kernel/alternative.c + */ + +static void emit_fineibt(u8 **pprog) +{ + u8 *prog = *pprog; + + EMIT_ENDBR(); + EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash); /* subl $hash, %r10d */ + EMIT2(0x74, 0x07); /* jz.d8 +7 */ + EMIT2(0x0f, 0x0b); /* ud2 */ + EMIT1(0x90); /* nop */ + EMIT_ENDBR_POISON(); + + *pprog = prog; +} + +static void emit_kcfi(u8 **pprog) +{ + u8 *prog = *pprog; + + EMIT1_off32(0xb8, cfi_bpf_hash); /* movl $hash, %eax */ +#ifdef CONFIG_CALL_PADDING + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); +#endif + EMIT_ENDBR(); + + *pprog = prog; +} + +static void emit_cfi(u8 **pprog) +{ + u8 *prog = *pprog; + + switch (cfi_mode) { + case CFI_FINEIBT: + emit_fineibt(&prog); + break; + + case CFI_KCFI: + emit_kcfi(&prog); + break; + + default: + EMIT_ENDBR(); + break; + } + + *pprog = prog; +} + /* * Emit x86-64 prologue code for BPF program. * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes @@ -315,10 +381,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, { u8 *prog = *pprog; + emit_cfi(&prog); /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ - EMIT_ENDBR(); memcpy(prog, x86_nops[5], X86_PATCH_SIZE); prog += X86_PATCH_SIZE; if (!ebpf_from_cbpf) { @@ -3013,9 +3079,16 @@ out_image: jit_data->header = header; jit_data->rw_header = rw_header; } - prog->bpf_func = (void *)image; + /* + * ctx.prog_offset is used when CFI preambles put code *before* + * the function. See emit_cfi(). For FineIBT specifically this code + * can also be executed and bpf_prog_kallsyms_add() will + * generate an additional symbol to cover this, hence also + * decrement proglen. 
+ */ + prog->bpf_func = (void *)image + cfi_get_offset(); prog->jited = 1; - prog->jited_len = proglen; + prog->jited_len = proglen - cfi_get_offset(); } else { prog = orig_prog; } @@ -3070,6 +3143,7 @@ void bpf_jit_free(struct bpf_prog *prog) kvfree(jit_data->addrs); kfree(jit_data); } + prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset(); hdr = bpf_jit_binary_pack_hdr(prog); bpf_jit_binary_pack_free(hdr, NULL); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog)); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c87c608a3689..9d84c376851a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -29,6 +29,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -1211,7 +1212,11 @@ struct bpf_dispatcher { #endif }; -static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( +#ifndef __bpfcall +#define __bpfcall __nocfi +#endif + +static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, bpf_func_t bpf_func) @@ -1303,7 +1308,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func #define DEFINE_BPF_DISPATCHER(name) \ __BPF_DISPATCHER_SC(name); \ - noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ + noinline __bpfcall unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func) \ @@ -1453,6 +1458,9 @@ struct bpf_prog_aux { struct bpf_kfunc_desc_tab *kfunc_tab; struct bpf_kfunc_btf_tab *kfunc_btf_tab; u32 size_poke_tab; +#ifdef CONFIG_FINEIBT + struct bpf_ksym ksym_prefix; +#endif struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 2309d74e77e6..1ed2d96c0cfc 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,6 +11,13 @@ #include #include +#ifndef cfi_get_offset +static inline int cfi_get_offset(void) +{ + return 0; +} +#endif + #ifdef CONFIG_CFI_CLANG enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, unsigned long *target, u32 type); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c34513d645c4..5aa6863ac33b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -121,6 +121,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag #endif INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); +#ifdef CONFIG_FINEIBT + INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode); +#endif mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->dst_mutex); @@ -683,6 +686,23 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) fp->aux->ksym.prog = true; bpf_ksym_add(&fp->aux->ksym); + +#ifdef CONFIG_FINEIBT + /* + * When FineIBT, code in the __cfi_foo() symbols can get executed + * and hence unwinder needs help. 
+ */ + if (cfi_mode != CFI_FINEIBT) + return; + snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN, + "__cfi_%s", fp->aux->ksym.name); + fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16; + fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func; + bpf_ksym_add(&fp->aux->ksym_prefix); +#endif } void bpf_prog_kallsyms_del(struct bpf_prog *fp) @@ -691,6 +711,11 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp) return; bpf_ksym_del(&fp->aux->ksym); +#ifdef CONFIG_FINEIBT + if (cfi_mode != CFI_FINEIBT) + return; + bpf_ksym_del(&fp->aux->ksym_prefix); +#endif } static struct bpf_ksym *bpf_ksym_find(unsigned long addr) -- cgit From 2cd3e3772e41377f32d6eea643e0590774e9187c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:20 +0100 Subject: x86/cfi,bpf: Fix bpf_struct_ops CFI BPF struct_ops uses __arch_prepare_bpf_trampoline() to write trampolines for indirect function calls. These trampolines must have matching CFI. In order to obtain the correct CFI hash for the various methods, add a matching structure that contains stub functions; the compiler will generate correct CFI, which we can pilfer for the trampolines. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.566977112@infradead.org Signed-off-by: Alexei Starovoitov --- arch/x86/include/asm/cfi.h | 6 ++++ arch/x86/kernel/alternative.c | 22 ++++++++++++++ arch/x86/net/bpf_jit_comp.c | 66 ++++++++++++++++++++++++++-------------- include/linux/bpf.h | 13 ++++++++ kernel/bpf/bpf_struct_ops.c | 16 +++++----- net/bpf/bpf_dummy_struct_ops.c | 31 ++++++++++++++++++- net/ipv4/bpf_tcp_ca.c | 69 ++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 191 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 8779abd217b7..1a50b2cd4713 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -123,6 +123,8 @@ static inline int cfi_get_offset(void) } #define cfi_get_offset cfi_get_offset +extern u32 cfi_get_func_hash(void *func); + #else static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) { @@ -130,6 +132,10 @@ static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) } #define cfi_bpf_hash 0U #define cfi_bpf_subprog_hash 0U +static inline u32 cfi_get_func_hash(void *func) +{ + return 0; +} #endif /* CONFIG_CFI_CLANG */ #endif /* _ASM_X86_CFI_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index cb393efd5ccd..49c2a62ba5e4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -883,6 +883,28 @@ asm ( " .size cfi_bpf_subprog_hash, 4 \n" " .popsection \n" ); + +u32 cfi_get_func_hash(void *func) +{ + u32 hash; + + func -= cfi_get_offset(); + switch (cfi_mode) { + case CFI_FINEIBT: + func += 7; + break; + case CFI_KCFI: + func += 1; + break; + default: + return 0; + } + + if (get_kernel_nofault(hash, func)) + return 0; + + return hash; +} #endif #ifdef CONFIG_FINEIBT diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 43b8c08fdf8d..c89a4abdd726 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -312,9 +312,8 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) * in arch/x86/kernel/alternative.c */ -static void emit_fineibt(u8 **pprog, bool is_subprog) +static void emit_fineibt(u8 **pprog, u32 hash) { - u32 hash = is_subprog ?
cfi_bpf_subprog_hash : cfi_bpf_hash; u8 *prog = *pprog; EMIT_ENDBR(); @@ -327,9 +326,8 @@ static void emit_fineibt(u8 **pprog, bool is_subprog) *pprog = prog; } -static void emit_kcfi(u8 **pprog, bool is_subprog) +static void emit_kcfi(u8 **pprog, u32 hash) { - u32 hash = is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash; u8 *prog = *pprog; EMIT1_off32(0xb8, hash); /* movl $hash, %eax */ @@ -351,17 +349,17 @@ static void emit_kcfi(u8 **pprog, bool is_subprog) *pprog = prog; } -static void emit_cfi(u8 **pprog, bool is_subprog) +static void emit_cfi(u8 **pprog, u32 hash) { u8 *prog = *pprog; switch (cfi_mode) { case CFI_FINEIBT: - emit_fineibt(&prog, is_subprog); + emit_fineibt(&prog, hash); break; case CFI_KCFI: - emit_kcfi(&prog, is_subprog); + emit_kcfi(&prog, hash); break; default: @@ -383,7 +381,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, { u8 *prog = *pprog; - emit_cfi(&prog, is_subprog); + emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash); /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ @@ -2510,10 +2508,19 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im u8 *prog; bool save_ret; + /* + * F_INDIRECT is only compatible with F_RET_FENTRY_RET, it is + * explicitly incompatible with F_CALL_ORIG | F_SKIP_FRAME | F_IP_ARG + * because @func_addr. + */ + WARN_ON_ONCE((flags & BPF_TRAMP_F_INDIRECT) && + (flags & ~(BPF_TRAMP_F_INDIRECT | BPF_TRAMP_F_RET_FENTRY_RET))); + /* extra registers for struct arguments */ - for (i = 0; i < m->nr_args; i++) + for (i = 0; i < m->nr_args; i++) { if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) nr_regs += (m->arg_size[i] + 7) / 8 - 1; + } /* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6 * are passed through regs, the remains are through stack. @@ -2596,20 +2603,27 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im prog = rw_image; - EMIT_ENDBR(); - /* - * This is the direct-call trampoline, as such it needs accounting - * for the __fentry__ call. - */ - x86_call_depth_emit_accounting(&prog, NULL); + if (flags & BPF_TRAMP_F_INDIRECT) { + /* + * Indirect call for bpf_struct_ops + */ + emit_cfi(&prog, cfi_get_func_hash(func_addr)); + } else { + /* + * Direct-call fentry stub, as such it needs accounting for the + * __fentry__ call. 
+ */ + x86_call_depth_emit_accounting(&prog, NULL); + } EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ - if (!is_imm8(stack_size)) + if (!is_imm8(stack_size)) { /* sub rsp, stack_size */ EMIT3_off32(0x48, 0x81, 0xEC, stack_size); - else + } else { /* sub rsp, stack_size */ EMIT4(0x48, 0x83, 0xEC, stack_size); + } if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) EMIT1(0x50); /* push rax */ /* mov QWORD PTR [rbp - rbx_off], rbx */ @@ -2643,10 +2657,11 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im } } - if (fentry->nr_links) + if (fentry->nr_links) { if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image)) return -EINVAL; + } if (fmod_ret->nr_links) { branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *), @@ -2665,11 +2680,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im restore_regs(m, &prog, regs_off); save_args(m, &prog, arg_stack_off, true); - if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) { /* Before calling the original function, restore the * tail_call_cnt from stack to rax. */ RESTORE_TAIL_CALL_CNT(stack_size); + } if (flags & BPF_TRAMP_F_ORIG_STACK) { emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8); @@ -2698,17 +2714,19 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* Update the branches saved in invoke_bpf_mod_ret with the * aligned address of do_fexit. */ - for (i = 0; i < fmod_ret->nr_links; i++) + for (i = 0; i < fmod_ret->nr_links; i++) { emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image), image + (branches[i] - (u8 *)rw_image), X86_JNE); + } } - if (fexit->nr_links) + if (fexit->nr_links) { if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false, image, rw_image)) { ret = -EINVAL; goto cleanup; } + } if (flags & BPF_TRAMP_F_RESTORE_REGS) restore_regs(m, &prog, regs_off); @@ -2725,11 +2743,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im ret = -EINVAL; goto cleanup; } - } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) { /* Before running the original function, restore the * tail_call_cnt from stack to rax. */ RESTORE_TAIL_CALL_CNT(stack_size); + } /* restore return value of orig_call or fentry prog back into RAX */ if (save_ret) @@ -2737,9 +2756,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off); EMIT1(0xC9); /* leave */ - if (flags & BPF_TRAMP_F_SKIP_FRAME) + if (flags & BPF_TRAMP_F_SKIP_FRAME) { /* skip our return address and return to parent */ EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ + } emit_return(&prog, image + (prog - (u8 *)rw_image)); /* Make sure the trampoline generation logic doesn't overflow */ if (WARN_ON_ONCE(prog > (u8 *)rw_image_end - BPF_INSN_SAFETY)) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9d84c376851a..db46b3359bf5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1060,6 +1060,17 @@ struct btf_func_model { */ #define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7) +/* + * Indicate the trampoline should be suitable to receive indirect calls; + * without this indirectly calling the generated code can result in #UD/#CP, + * depending on the CFI options. + * + * Used by bpf_struct_ops. + * + * Incompatible with FENTRY usage, overloads @func_addr argument. 
+ */ +#define BPF_TRAMP_F_INDIRECT BIT(8) + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. */ @@ -1697,6 +1708,7 @@ struct bpf_struct_ops { struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; u32 type_id; u32 value_id; + void *cfi_stubs; }; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) @@ -1710,6 +1722,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, + void *stub_func, void *image, void *image_end); static inline bool bpf_try_module_get(const void *data, struct module *owner) { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 4d53c53fc5aa..02068bd0e4d9 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -352,17 +352,16 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = { int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, - void *image, void *image_end) + void *stub_func, void *image, void *image_end) { - u32 flags; + u32 flags = BPF_TRAMP_F_INDIRECT; int size; tlinks[BPF_TRAMP_FENTRY].links[0] = link; tlinks[BPF_TRAMP_FENTRY].nr_links = 1; - /* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, - * and it must be used alone. - */ - flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0; + + if (model->ret_size > 0) + flags |= BPF_TRAMP_F_RET_FENTRY_RET; size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); if (size < 0) @@ -370,7 +369,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, if (size > (unsigned long)image_end - (unsigned long)image) return -E2BIG; return arch_prepare_bpf_trampoline(NULL, image, image_end, - model, flags, tlinks, NULL); + model, flags, tlinks, stub_func); } static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, @@ -504,11 +503,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], + *(void **)(st_ops->cfi_stubs + moff), image, image_end); if (err < 0) goto reset_unlock; - *(void **)(kdata + moff) = image; + *(void **)(kdata + moff) = image + cfi_get_offset(); image += err; /* put prog_id to udata */ diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 2748f9d77b18..8906f7bdf4a9 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -12,6 +12,11 @@ extern struct bpf_struct_ops bpf_bpf_dummy_ops; /* A common type for test_N with return value in bpf_dummy_ops */ typedef int (*dummy_ops_test_ret_fn)(struct bpf_dummy_ops_state *state, ...); +static int dummy_ops_test_ret_function(struct bpf_dummy_ops_state *state, ...) 
+{ + return 0; +} + struct bpf_dummy_ops_test_args { u64 args[MAX_BPF_FUNC_ARGS]; struct bpf_dummy_ops_state state; @@ -62,7 +67,7 @@ static int dummy_ops_copy_args(struct bpf_dummy_ops_test_args *args) static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args) { - dummy_ops_test_ret_fn test = (void *)image; + dummy_ops_test_ret_fn test = (void *)image + cfi_get_offset(); struct bpf_dummy_ops_state *state = NULL; /* state needs to be NULL if args[0] is 0 */ @@ -119,6 +124,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, op_idx = prog->expected_attach_type; err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[op_idx], + &dummy_ops_test_ret_function, image, image + PAGE_SIZE); if (err < 0) goto out; @@ -219,6 +225,28 @@ static void bpf_dummy_unreg(void *kdata) { } +static int bpf_dummy_test_1(struct bpf_dummy_ops_state *cb) +{ + return 0; +} + +static int bpf_dummy_test_2(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2, + char a3, unsigned long a4) +{ + return 0; +} + +static int bpf_dummy_test_sleepable(struct bpf_dummy_ops_state *cb) +{ + return 0; +} + +static struct bpf_dummy_ops __bpf_bpf_dummy_ops = { + .test_1 = bpf_dummy_test_1, + .test_2 = bpf_dummy_test_2, + .test_sleepable = bpf_dummy_test_sleepable, +}; + struct bpf_struct_ops bpf_bpf_dummy_ops = { .verifier_ops = &bpf_dummy_verifier_ops, .init = bpf_dummy_init, @@ -227,4 +255,5 @@ struct bpf_struct_ops bpf_bpf_dummy_ops = { .reg = bpf_dummy_reg, .unreg = bpf_dummy_unreg, .name = "bpf_dummy_ops", + .cfi_stubs = &__bpf_bpf_dummy_ops, }; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index c7bbd8f3c708..634cfafa583d 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -271,6 +271,74 @@ static int bpf_tcp_ca_validate(void *kdata) return tcp_validate_congestion_control(kdata); } +static u32 bpf_tcp_ca_ssthresh(struct sock *sk) +{ + return 0; +} + +static void bpf_tcp_ca_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ +} + +static void bpf_tcp_ca_set_state(struct sock *sk, u8 new_state) +{ +} + +static void bpf_tcp_ca_cwnd_event(struct sock *sk, enum tcp_ca_event ev) +{ +} + +static void bpf_tcp_ca_in_ack_event(struct sock *sk, u32 flags) +{ +} + +static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *sample) +{ +} + +static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) +{ + return 0; +} + +static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs) +{ +} + +static u32 bpf_tcp_ca_undo_cwnd(struct sock *sk) +{ + return 0; +} + +static u32 bpf_tcp_ca_sndbuf_expand(struct sock *sk) +{ + return 0; +} + +static void __bpf_tcp_ca_init(struct sock *sk) +{ +} + +static void __bpf_tcp_ca_release(struct sock *sk) +{ +} + +static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .ssthresh = bpf_tcp_ca_ssthresh, + .cong_avoid = bpf_tcp_ca_cong_avoid, + .set_state = bpf_tcp_ca_set_state, + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, + .min_tso_segs = bpf_tcp_ca_min_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, + + .init = __bpf_tcp_ca_init, + .release = __bpf_tcp_ca_release, +}; + struct bpf_struct_ops bpf_tcp_congestion_ops = { .verifier_ops = &bpf_tcp_ca_verifier_ops, .reg = bpf_tcp_ca_reg, @@ -281,6 +349,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = { .init = bpf_tcp_ca_init, .validate = bpf_tcp_ca_validate, 
.name = "tcp_congestion_ops", + .cfi_stubs = &__bpf_ops_tcp_congestion_ops, }; static int __init bpf_tcp_ca_kfunc_init(void) -- cgit From e4c00339891c074c76f626ac82981963cbba5332 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:22 +0100 Subject: bpf: Fix dtor CFI Ensure the various dtor functions match their prototype and retain their CFI signatures, since they don't have their address taken, they are prone to not getting CFI, making them impossible to call indirectly. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.799451071@infradead.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumask.c | 8 +++++++- kernel/bpf/helpers.c | 16 ++++++++++++++-- net/bpf/test_run.c | 15 +++++++++++++-- 3 files changed, 34 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 7499b7d8c06f..2e73533a3811 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -96,6 +96,12 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask) migrate_enable(); } +__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask) +{ + bpf_cpumask_release(cpumask); +} +CFI_NOSEAL(bpf_cpumask_release_dtor); + /** * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask. * @cpumask: The cpumask being queried. @@ -453,7 +459,7 @@ static const struct btf_kfunc_id_set cpumask_kfunc_set = { BTF_ID_LIST(cpumask_dtor_ids) BTF_ID(struct, bpf_cpumask) -BTF_ID(func, bpf_cpumask_release) +BTF_ID(func, bpf_cpumask_release_dtor) static int __init cpumask_kfunc_init(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b0b485126a76..e0c0e3676df8 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2150,6 +2150,12 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p) put_task_struct_rcu_user(p); } +__bpf_kfunc void bpf_task_release_dtor(void *p) +{ + put_task_struct_rcu_user(p); +} +CFI_NOSEAL(bpf_task_release_dtor); + #ifdef CONFIG_CGROUPS /** * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by @@ -2174,6 +2180,12 @@ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp) cgroup_put(cgrp); } +__bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp) +{ + cgroup_put(cgrp); +} +CFI_NOSEAL(bpf_cgroup_release_dtor); + /** * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor * array. 
A cgroup returned by this kfunc which is not subsequently stored in a @@ -2570,10 +2582,10 @@ static const struct btf_kfunc_id_set generic_kfunc_set = { BTF_ID_LIST(generic_dtor_ids) BTF_ID(struct, task_struct) -BTF_ID(func, bpf_task_release) +BTF_ID(func, bpf_task_release_dtor) #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) -BTF_ID(func, bpf_cgroup_release) +BTF_ID(func, bpf_cgroup_release_dtor) #endif BTF_SET8_START(common_btf_ids) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 711cf5d59816..dfd919374017 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -600,10 +600,21 @@ __bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) refcount_dec(&p->cnt); } +__bpf_kfunc void bpf_kfunc_call_test_release_dtor(void *p) +{ + bpf_kfunc_call_test_release(p); +} +CFI_NOSEAL(bpf_kfunc_call_test_release_dtor); + __bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p) { } +__bpf_kfunc void bpf_kfunc_call_memb_release_dtor(void *p) +{ +} +CFI_NOSEAL(bpf_kfunc_call_memb_release_dtor); + __bpf_kfunc_end_defs(); BTF_SET8_START(bpf_test_modify_return_ids) @@ -1671,9 +1682,9 @@ static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = { BTF_ID_LIST(bpf_prog_test_dtor_kfunc_ids) BTF_ID(struct, prog_test_ref_kfunc) -BTF_ID(func, bpf_kfunc_call_test_release) +BTF_ID(func, bpf_kfunc_call_test_release_dtor) BTF_ID(struct, prog_test_member) -BTF_ID(func, bpf_kfunc_call_memb_release) +BTF_ID(func, bpf_kfunc_call_memb_release_dtor) static int __init bpf_prog_test_run_init(void) { -- cgit From 852486b35f344887786d63250946dd921a05d7e8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 15 Dec 2023 10:12:23 +0100 Subject: x86/cfi,bpf: Fix bpf_exception_cb() signature As per the earlier patches, BPF sub-programs have bpf_callback_t signature and CFI expects callers to have matching signature. This is violated by bpf_prog_aux::bpf_exception_cb(). [peterz: Changelog] Reported-by: Peter Zijlstra Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/CAADnVQ+Z7UcXXBBhMubhcMM=R-dExk-uHtfOLtoLxQ1XxEpqEA@mail.gmail.com Link: https://lore.kernel.org/r/20231215092707.910319166@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/helpers.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index db46b3359bf5..5e694934cf37 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1484,7 +1484,7 @@ struct bpf_prog_aux { int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; - unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp); + u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64); #ifdef CONFIG_SECURITY void *security; #endif diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e0c0e3676df8..07fd4b5704f3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2537,7 +2537,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) * which skips compiler generated instrumentation to do the same. 
*/ kasan_unpoison_task_stack_below((void *)(long)ctx.sp); - ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp); + ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0); WARN(1, "A call to BPF exception callback should never return\n"); } -- cgit From 9c556b7c3f520d42c435c0d78b25c719c060f8a1 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Thu, 14 Dec 2023 10:47:02 +0530 Subject: trace/kprobe: Display the actual notrace function when rejecting a probe Trying to probe update_sd_lb_stats() using perf results in the below message in the kernel log: trace_kprobe: Could not probe notrace function _text This is because 'perf probe' specifies the kprobe location as an offset from '_text': $ sudo perf probe -D update_sd_lb_stats p:probe/update_sd_lb_stats _text+1830728 However, the error message is misleading and doesn't help convey the actual notrace function that is being probed. Fix this by looking up the actual function name that is being probed. With this fix, we now get the below message in the kernel log: trace_kprobe: Could not probe notrace function update_sd_lb_stats.constprop.0 Link: https://lore.kernel.org/all/20231214051702.1687300-1-naveen@kernel.org/ Signed-off-by: Naveen N Rao Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_kprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 52f8b537dd0a..c4c6e0e0068b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -487,8 +487,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) return -EINVAL; if (within_notrace_func(tk)) { - pr_warn("Could not probe notrace function %s\n", - trace_kprobe_symbol(tk)); + pr_warn("Could not probe notrace function %ps\n", + (void *)trace_kprobe_address(tk)); return -EINVAL; } -- cgit From 3983c00281d96af2ba611254d679107b5c390627 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 17 Dec 2023 22:55:37 +0100 Subject: bpf: Fail uprobe multi link with negative offset Currently the __uprobe_register will return 0 (success) when called with negative offset. The reason is that the call to register_for_each_vma and then build_map_info won't return error for negative offset. They just won't do anything - no matching vma is found so there's no registered breakpoint for the uprobe. I don't think we can change the behaviour of __uprobe_register and fail for negative uprobe offset, because apps might depend on that already. But I think we can still make the change and check for it on bpf multi link syscall level. Also moving the __get_user call and check for the offsets to the top of loop, to fail early without extra __get_user calls for ref_ctr_offset and cookie arrays. 
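In code form, the reordered per-entry checks boil down to the following (a condensed sketch of the hunk below, not a verbatim copy; the elided tail of the loop handles ref_ctr_offset and cookie as before, unwinding through error_free on failure):

	for (i = 0; i < cnt; i++) {
		/* Copy the offset first, so a bad entry fails early, */
		if (__get_user(uprobes[i].offset, uoffsets + i)) {
			err = -EFAULT;
			goto error_free;
		}
		/* and reject negative offsets at the syscall level. */
		if (uprobes[i].offset < 0) {
			err = -EINVAL;
			goto error_free;
		}
		/* only then read ref_ctr_offset and cookie, if supplied */
		...
	}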
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20231217215538.3361991-2-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 97c0c49c40a0..492d60e9c480 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3391,15 +3391,19 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error_free; for (i = 0; i < cnt; i++) { - if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { + if (__get_user(uprobes[i].offset, uoffsets + i)) { err = -EFAULT; goto error_free; } + if (uprobes[i].offset < 0) { + err = -EINVAL; + goto error_free; + } if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) { err = -EFAULT; goto error_free; } - if (__get_user(uprobes[i].offset, uoffsets + i)) { + if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { err = -EFAULT; goto error_free; } -- cgit From d81f0d7b8b23ec79f80be602ed6129ded27862e8 Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Wed, 13 Dec 2023 19:44:17 +0000 Subject: kunit: add KUNIT_INIT_TABLE to init linker section Add KUNIT_INIT_TABLE to the INIT_DATA linker section. Alter the KUnit macros to create init tests: kunit_test_init_section_suites Update lib/kunit/executor.c to run both the suites in KUNIT_TABLE and KUNIT_INIT_TABLE. Reviewed-by: David Gow Signed-off-by: Rae Moar Signed-off-by: Shuah Khan --- include/asm-generic/vmlinux.lds.h | 9 +++++- include/kunit/test.h | 30 +++++++++++------- include/linux/module.h | 2 ++ kernel/module/main.c | 3 ++ lib/kunit/executor.c | 64 +++++++++++++++++++++++++++++++++++---- lib/kunit/test.c | 26 +++++++++++----- 6 files changed, 109 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 1107905d37fc..5dd3a61d673d 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -700,7 +700,8 @@ THERMAL_TABLE(governor) \ EARLYCON_TABLE() \ LSM_TABLE() \ - EARLY_LSM_TABLE() + EARLY_LSM_TABLE() \ + KUNIT_INIT_TABLE() #define INIT_TEXT \ *(.init.text .init.text.*) \ @@ -926,6 +927,12 @@ . = ALIGN(8); \ BOUNDED_SECTION_POST_LABEL(.kunit_test_suites, __kunit_suites, _start, _end) +/* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */ +#define KUNIT_INIT_TABLE() \ + . = ALIGN(8); \ + BOUNDED_SECTION_POST_LABEL(.kunit_init_test_suites, \ + __kunit_init_suites, _start, _end) + #ifdef CONFIG_BLK_DEV_INITRD #define INIT_RAM_FS \ . = ALIGN(4); \ diff --git a/include/kunit/test.h b/include/kunit/test.h index 20ed9f9275c9..fe79cd736e94 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -337,6 +337,9 @@ void __kunit_test_suites_exit(struct kunit_suite **suites, int num_suites); void kunit_exec_run_tests(struct kunit_suite_set *suite_set, bool builtin); void kunit_exec_list_tests(struct kunit_suite_set *suite_set, bool include_attr); +struct kunit_suite_set kunit_merge_suite_sets(struct kunit_suite_set init_suite_set, + struct kunit_suite_set suite_set); + #if IS_BUILTIN(CONFIG_KUNIT) int kunit_run_all_tests(void); #else @@ -371,6 +374,11 @@ static inline int kunit_run_all_tests(void) #define kunit_test_suite(suite) kunit_test_suites(&suite) +#define __kunit_init_test_suites(unique_array, ...) 
\ + static struct kunit_suite *unique_array[] \ + __aligned(sizeof(struct kunit_suite *)) \ + __used __section(".kunit_init_test_suites") = { __VA_ARGS__ } + /** * kunit_test_init_section_suites() - used to register one or more &struct * kunit_suite containing init functions or @@ -378,21 +386,21 @@ static inline int kunit_run_all_tests(void) * * @__suites: a statically allocated list of &struct kunit_suite. * - * This functions identically as kunit_test_suites() except that it suppresses - * modpost warnings for referencing functions marked __init or data marked - * __initdata; this is OK because currently KUnit only runs tests upon boot - * during the init phase or upon loading a module during the init phase. + * This functions similar to kunit_test_suites() except that it compiles the + * list of suites during init phase. + * + * This macro also suffixes the array and suite declarations it makes with + * _probe; so that modpost suppresses warnings about referencing init data + * for symbols named in this manner. * - * NOTE TO KUNIT DEVS: If we ever allow KUnit tests to be run after boot, these - * tests must be excluded. + * Note: these init tests are not able to be run after boot so there is no + * "run" debugfs file generated for these tests. * - * The only thing this macro does that's different from kunit_test_suites is - * that it suffixes the array and suite declarations it makes with _probe; - * modpost suppresses warnings about referencing init data for symbols named in - * this manner. + * Also, do not mark the suite or test case structs with __initdata because + * they will be used after the init phase with debugfs. */ #define kunit_test_init_section_suites(__suites...) \ - __kunit_test_suites(CONCATENATE(__UNIQUE_ID(array), _probe), \ + __kunit_init_test_suites(CONCATENATE(__UNIQUE_ID(array), _probe), \ ##__suites) #define kunit_test_init_section_suite(suite) \ diff --git a/include/linux/module.h b/include/linux/module.h index a98e188cf37b..9cd0009bd050 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -540,6 +540,8 @@ struct module { struct static_call_site *static_call_sites; #endif #if IS_ENABLED(CONFIG_KUNIT) + int num_kunit_init_suites; + struct kunit_suite **kunit_init_suites; int num_kunit_suites; struct kunit_suite **kunit_suites; #endif diff --git a/kernel/module/main.c b/kernel/module/main.c index 98fedfdb8db5..36681911c05a 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2199,6 +2199,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->kunit_suites = section_objs(info, ".kunit_test_suites", sizeof(*mod->kunit_suites), &mod->num_kunit_suites); + mod->kunit_init_suites = section_objs(info, ".kunit_init_test_suites", + sizeof(*mod->kunit_init_suites), + &mod->num_kunit_init_suites); #endif mod->extable = section_objs(info, "__ex_table", diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c index 1236b3cd2fbb..847329c51e91 100644 --- a/lib/kunit/executor.c +++ b/lib/kunit/executor.c @@ -12,6 +12,8 @@ */ extern struct kunit_suite * const __kunit_suites_start[]; extern struct kunit_suite * const __kunit_suites_end[]; +extern struct kunit_suite * const __kunit_init_suites_start[]; +extern struct kunit_suite * const __kunit_init_suites_end[]; static char *action_param; @@ -292,6 +294,33 @@ void kunit_exec_list_tests(struct kunit_suite_set *suite_set, bool include_attr) } } +struct kunit_suite_set kunit_merge_suite_sets(struct kunit_suite_set init_suite_set, + struct kunit_suite_set suite_set) +{ + struct 
kunit_suite_set total_suite_set = {NULL, NULL}; + struct kunit_suite **total_suite_start = NULL; + size_t init_num_suites, num_suites, suite_size; + + init_num_suites = init_suite_set.end - init_suite_set.start; + num_suites = suite_set.end - suite_set.start; + suite_size = sizeof(suite_set.start); + + /* Allocate memory for array of all kunit suites */ + total_suite_start = kmalloc_array(init_num_suites + num_suites, suite_size, GFP_KERNEL); + if (!total_suite_start) + return total_suite_set; + + /* Append init suites and then all other kunit suites */ + memcpy(total_suite_start, init_suite_set.start, init_num_suites * suite_size); + memcpy(total_suite_start + init_num_suites, suite_set.start, num_suites * suite_size); + + /* Set kunit suite set start and end */ + total_suite_set.start = total_suite_start; + total_suite_set.end = total_suite_start + (init_num_suites + num_suites); + + return total_suite_set; +} + #if IS_BUILTIN(CONFIG_KUNIT) static char *kunit_shutdown; @@ -313,21 +342,41 @@ static void kunit_handle_shutdown(void) int kunit_run_all_tests(void) { - struct kunit_suite_set suite_set = { + struct kunit_suite_set suite_set = {NULL, NULL}; + struct kunit_suite_set filtered_suite_set = {NULL, NULL}; + struct kunit_suite_set init_suite_set = { + __kunit_init_suites_start, __kunit_init_suites_end, + }; + struct kunit_suite_set normal_suite_set = { __kunit_suites_start, __kunit_suites_end, }; + size_t init_num_suites = init_suite_set.end - init_suite_set.start; int err = 0; + + if (init_num_suites > 0) { + suite_set = kunit_merge_suite_sets(init_suite_set, normal_suite_set); + if (!suite_set.start) + goto out; + } else + suite_set = normal_suite_set; + if (!kunit_enabled()) { pr_info("kunit: disabled\n"); - goto out; + goto free_out; } if (filter_glob_param || filter_param) { - suite_set = kunit_filter_suites(&suite_set, filter_glob_param, + filtered_suite_set = kunit_filter_suites(&suite_set, filter_glob_param, filter_param, filter_action_param, &err); + + /* Free original suite set before using filtered suite set */ + if (init_num_suites > 0) + kfree(suite_set.start); + suite_set = filtered_suite_set; + if (err) { pr_err("kunit executor: error filtering suites: %d\n", err); - goto out; + goto free_out; } } @@ -340,9 +389,12 @@ int kunit_run_all_tests(void) else pr_err("kunit executor: unknown action '%s'\n", action_param); - if (filter_glob_param || filter_param) { /* a copy was made of each suite */ +free_out: + if (filter_glob_param || filter_param) kunit_free_suite_set(suite_set); - } + else if (init_num_suites > 0) + /* Don't use kunit_free_suite_set because suites aren't individually allocated */ + kfree(suite_set.start); out: kunit_handle_shutdown(); diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 7deee3701d20..6b60d85ce108 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -742,28 +742,40 @@ EXPORT_SYMBOL_GPL(__kunit_test_suites_exit); #ifdef CONFIG_MODULES static void kunit_module_init(struct module *mod) { - struct kunit_suite_set suite_set = { + struct kunit_suite_set suite_set, filtered_set; + struct kunit_suite_set normal_suite_set = { mod->kunit_suites, mod->kunit_suites + mod->num_kunit_suites, }; + struct kunit_suite_set init_suite_set = { + mod->kunit_init_suites, mod->kunit_init_suites + mod->num_kunit_init_suites, + }; const char *action = kunit_action(); int err = 0; - suite_set = kunit_filter_suites(&suite_set, + if (mod->num_kunit_init_suites > 0) + suite_set = kunit_merge_suite_sets(init_suite_set, normal_suite_set); + else + suite_set = 
normal_suite_set; + + filtered_set = kunit_filter_suites(&suite_set, kunit_filter_glob() ?: "*.*", kunit_filter(), kunit_filter_action(), &err); if (err) pr_err("kunit module: error filtering suites: %d\n", err); - mod->kunit_suites = (struct kunit_suite **)suite_set.start; - mod->num_kunit_suites = suite_set.end - suite_set.start; + mod->kunit_suites = (struct kunit_suite **)filtered_set.start; + mod->num_kunit_suites = filtered_set.end - filtered_set.start; + + if (mod->num_kunit_init_suites > 0) + kfree(suite_set.start); if (!action) - kunit_exec_run_tests(&suite_set, false); + kunit_exec_run_tests(&filtered_set, false); else if (!strcmp(action, "list")) - kunit_exec_list_tests(&suite_set, false); + kunit_exec_list_tests(&filtered_set, false); else if (!strcmp(action, "list_attr")) - kunit_exec_list_tests(&suite_set, true); + kunit_exec_list_tests(&filtered_set, true); else pr_err("kunit: unknown action '%s'\n", action); } -- cgit From 8e432e6197cef6250dfd6fdffd41c06613c874ca Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 18 Dec 2023 09:36:01 -0800 Subject: bpf: Ensure precise is reset to false in __mark_reg_const_zero() It is safe to always start with imprecise SCALAR_VALUE register. Previously __mark_reg_const_zero() relied on caller to reset precise mark, but it's very error prone and we already missed it in a few places. So instead make __mark_reg_const_zero() reset precision always, as it's a safe default for SCALAR_VALUE. Explanation is basically the same as for why we are resetting (or rather not setting) precision in current state. If necessary, precision propagation will set it to precise correctly. As such, also remove a big comment about forward precision propagation in mark_reg_stack_read() and avoid unnecessarily setting precision to true after reading from STACK_ZERO stack. Again, precision propagation will correctly handle this, if that SCALAR_VALUE register will ever be needed to be precise. Reported-by: Maxim Mikityanskiy Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Maxim Mikityanskiy Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20231218173601.53047-1-andrii@kernel.org --- kernel/bpf/verifier.c | 29 ++++++++-------------- .../selftests/bpf/progs/verifier_spill_fill.c | 10 ++++++-- 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1863826a4ac3..9456ee0ad129 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1777,10 +1777,14 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } -static void __mark_reg_const_zero(struct bpf_reg_state *reg) +static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { __mark_reg_known(reg, 0); reg->type = SCALAR_VALUE; + /* all scalars are assumed imprecise initially (unless unprivileged, + * in which case everything is forced to be precise) + */ + reg->precise = !env->bpf_capable; } static void mark_reg_known_zero(struct bpf_verifier_env *env, @@ -4706,21 +4710,10 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env, zeros++; } if (zeros == max_off - min_off) { - /* any access_size read into register is zero extended, - * so the whole register == const_zero - */ - __mark_reg_const_zero(&state->regs[dst_regno]); - /* backtracking doesn't support STACK_ZERO yet, - * so mark it precise here, so that later - * backtracking can stop here. 
- * Backtracking may not need this if this register - * doesn't participate in pointer adjustment. - * Forward propagation of precise flag is not - * necessary either. This mark is only to stop - * backtracking. Any register that contributed - * to const 0 was marked precise before spill. + /* Any access_size read into register is zero extended, + * so the whole register == const_zero. */ - state->regs[dst_regno].precise = true; + __mark_reg_const_zero(env, &state->regs[dst_regno]); } else { /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, dst_regno); @@ -4803,11 +4796,11 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, if (spill_cnt == size && tnum_is_const(reg->var_off) && reg->var_off.value == 0) { - __mark_reg_const_zero(&state->regs[dst_regno]); + __mark_reg_const_zero(env, &state->regs[dst_regno]); /* this IS register fill, so keep insn_flags */ } else if (zero_cnt == size) { /* similarly to mark_reg_stack_read(), preserve zeroes */ - __mark_reg_const_zero(&state->regs[dst_regno]); + __mark_reg_const_zero(env, &state->regs[dst_regno]); insn_flags = 0; /* not restoring original register state */ } else { mark_reg_unknown(env, state->regs, dst_regno); @@ -7963,7 +7956,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, /* switch to DRAINED state, but keep the depth unchanged */ /* mark current iter state as drained and assume returned NULL */ cur_iter->iter.state = BPF_ITER_STATE_DRAINED; - __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]); + __mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]); return 0; } diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 508f5d6c7347..39fe3372e0e0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -499,8 +499,14 @@ __success __msg("2: (7a) *(u64 *)(r10 -8) = 0 ; R10=fp0 fp-8_w=00000000") /* but fp-16 is spilled IMPRECISE zero const reg */ __msg("4: (7b) *(u64 *)(r10 -16) = r0 ; R0_w=0 R10=fp0 fp-16_w=0") -/* and now check that precision propagation works even for such tricky case */ -__msg("10: (71) r2 = *(u8 *)(r10 -9) ; R2_w=P0 R10=fp0 fp-16_w=0") +/* validate that assigning R2 from STACK_ZERO doesn't mark register + * precise immediately; if necessary, it will be marked precise later + */ +__msg("6: (71) r2 = *(u8 *)(r10 -1) ; R2_w=0 R10=fp0 fp-8_w=00000000") +/* similarly, when R2 is assigned from spilled register, it is initially + * imprecise, but will be marked precise later once it is used in precise context + */ +__msg("10: (71) r2 = *(u8 *)(r10 -9) ; R2_w=0 R10=fp0 fp-16_w=0") __msg("11: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 11 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 10: (71) r2 = *(u8 *)(r10 -9)") -- cgit From b803d7c664d55705831729d2f2e29c874bcd62ea Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 18 Dec 2023 23:07:12 -0500 Subject: ring-buffer: Fix slowpath of interrupted event To synchronize the timestamps with the ring buffer reservation, there are two timestamps that are saved in the buffer meta data. 1. before_stamp 2. write_stamp When the two are equal, the write_stamp is considered valid, as in, it may be used to calculate the delta of the next event as the write_stamp is the timestamp of the previous reserved event on the buffer. 
This is done by the following:

 /*A*/	w = current position on the ring buffer
	before = before_stamp
	after = write_stamp
	ts = read current timestamp

	if (before != after) {
		write_stamp is not valid, force adding an absolute
		timestamp.
	}

 /*B*/	before_stamp = ts

 /*C*/	write = local_add_return(event length, position on ring buffer)

	if (w == write - event length) {
		/* Nothing interrupted between A and C */
 /*E*/		write_stamp = ts;
		delta = ts - after
		/*
		 * If nothing interrupted again,
		 * before_stamp == write_stamp and write_stamp
		 * can be used to calculate the delta for
		 * events that come in after this one.
		 */
	} else {
		/*
		 * The slow path!
		 * Was interrupted between A and C.
		 */

This is the place that there's a bug. We currently have:

		after = write_stamp
		ts = read current timestamp

 /*F*/		if (write == current position on the ring buffer &&
		    after < ts && cmpxchg(write_stamp, after, ts)) {

			delta = ts - after;

		} else {
			delta = 0;
		}

The assumption is that if the current position on the ring buffer hasn't moved between C and F, then it also was not interrupted, and that the last event written has a timestamp that matches the write_stamp. That is the write_stamp is valid.

But this may not be the case:

If a task context event was interrupted by softirq between B and C.

And the softirq wrote an event that got interrupted by a hard irq between C and E.

and the hard irq wrote an event (does not need to be interrupted)

We have:

 /*B*/ before_stamp = ts of normal context

   ---> interrupted by softirq

	 /*B*/ before_stamp = ts of softirq context

	   ---> interrupted by hardirq

		 /*B*/ before_stamp = ts of hard irq context
		 /*E*/ write_stamp = ts of hard irq context

		 /* matches and write_stamp valid */
	   <----

	 /*E*/ write_stamp = ts of softirq context

	 /* No longer matches before_stamp, write_stamp is not valid! */

   <---

 w != write - length, go to slow path

// Right now the order of events in the ring buffer is:
//
// |-- softirq event --|-- hard irq event --|-- normal context event --|
//

	after = write_stamp (this is the ts of softirq)
	ts = read current timestamp

	if (write == current position on the ring buffer [true] &&
	    after < ts [true] && cmpxchg(write_stamp, after, ts) [true]) {

		delta = ts - after  [Wrong!]

The delta is to be between the hard irq event and the normal context event, but the above logic made the delta between the softirq event and the normal context event, where the hard irq event is between the two. This will shift all the remaining event timestamps on the sub-buffer incorrectly.

The write_stamp is only valid if it matches the before_stamp. The cmpxchg does nothing to help this.

Instead, the following logic can be done to fix this:

	before = before_stamp
	ts = read current timestamp
	before_stamp = ts
	after = write_stamp

	if (write == current position on the ring buffer &&
	    after == before && after < ts) {

		delta = ts - after

	} else {
		delta = 0;
	}

The above will only use the write_stamp if it still matches before_stamp and was tested to not have changed since C.

As a bonus, with this logic we do not need any 64-bit cmpxchg() at all!

This means the 32-bit rb_time_t workaround can finally be removed. But that's for a later time.
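In C, the fixed slow path condenses to roughly the following (a sketch of where the diff below ends up; the rb_time_* helpers and the info fields are the ring buffer's own, the surrounding reservation code is elided, and rb_time_read() is shown in the void form it takes after the later 32-bit cleanup in this series -- the diff below still checks its a_ok return value):

	/* SLOW PATH - Interrupted between A and C */

	/* Save the old before_stamp */
	rb_time_read(&cpu_buffer->before_stamp, &info->before);

	/* Force the next event to add an absolute timestamp */
	ts = rb_time_stamp(cpu_buffer->buffer);
	rb_time_set(&cpu_buffer->before_stamp, ts);

	barrier();
	rb_time_read(&cpu_buffer->write_stamp, &info->after);
	barrier();

	if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
	    info->after == info->before && info->after < ts) {
		/* write_stamp still matches before_stamp, so it is valid */
		info->delta = ts - info->after;
	} else {
		/* Lost the previous event's timestamp */
		info->delta = 0;
	}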
Link: https://lore.kernel.org/linux-trace-kernel/20231218175229.58ec3daf@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Linus Torvalds Fixes: dd93942570789 ("ring-buffer: Do not try to put back write_stamp") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 79 ++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5a114e752f11..83eab547f1d1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -700,48 +700,6 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) return local_try_cmpxchg(l, &expect, set); } -static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) -{ - unsigned long cnt, top, bottom, msb; - unsigned long cnt2, top2, bottom2, msb2; - u64 val; - - /* Any interruptions in this function should cause a failure */ - cnt = local_read(&t->cnt); - - /* The cmpxchg always fails if it interrupted an update */ - if (!__rb_time_read(t, &val, &cnt2)) - return false; - - if (val != expect) - return false; - - if ((cnt & 3) != cnt2) - return false; - - cnt2 = cnt + 1; - - rb_time_split(val, &top, &bottom, &msb); - msb = rb_time_val_cnt(msb, cnt); - top = rb_time_val_cnt(top, cnt); - bottom = rb_time_val_cnt(bottom, cnt); - - rb_time_split(set, &top2, &bottom2, &msb2); - msb2 = rb_time_val_cnt(msb2, cnt); - top2 = rb_time_val_cnt(top2, cnt2); - bottom2 = rb_time_val_cnt(bottom2, cnt2); - - if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) - return false; - if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) - return false; - if (!rb_time_read_cmpxchg(&t->top, top, top2)) - return false; - if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) - return false; - return true; -} - #else /* 64 bits */ /* local64_t always succeeds */ @@ -755,11 +713,6 @@ static void rb_time_set(rb_time_t *t, u64 val) { local64_set(&t->time, val); } - -static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) -{ - return local64_try_cmpxchg(&t->time, &expect, set); -} #endif /* @@ -3610,20 +3563,36 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - /* Was interrupted before here, write_stamp must be valid */ + + /* Save the old before_stamp */ + a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); RB_WARN_ON(cpu_buffer, !a_ok); + + /* + * Read a new timestamp and update the before_stamp to make + * the next event after this one force using an absolute + * timestamp. This is in case an interrupt were to come in + * between E and F. 
+	 */
 	ts = rb_time_stamp(cpu_buffer->buffer);
+	rb_time_set(&cpu_buffer->before_stamp, ts);
+
+	barrier();
+ /*E*/	a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+	/* Was interrupted before here, write_stamp must be valid */
+	RB_WARN_ON(cpu_buffer, !a_ok);
 	barrier();
- /*E*/	if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
-	    info->after < ts &&
-	    rb_time_cmpxchg(&cpu_buffer->write_stamp,
-			    info->after, ts)) {
-		/* Nothing came after this event between C and E */
+ /*F*/	if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+	    info->after == info->before && info->after < ts) {
+		/*
+		 * Nothing came after this event between C and F, it is
+		 * safe to use info->after for the delta as it
+		 * matched info->before and is still valid.
+		 */
 		info->delta = ts - info->after;
 	} else {
 		/*
-		 * Interrupted between C and E:
+		 * Interrupted between C and F:
 		 * Lost the previous events time stamp. Just set the
 		 * delta to zero, and this will be the same time as
 		 * the event this event interrupted. And the events that
-- cgit

From d23569979ca1cd139a42c410e0c7b9e6014c3b3a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Wed, 13 Dec 2023 09:37:01 -0500
Subject: tracing: Allow creating instances with specified system events

A trace instance may only need to enable specific events. As the eventfs directory of an instance currently creates all events, which adds overhead, allow internal instances to be created with just the events in the systems that they care about. This currently only deals with systems and not individual events, but it should bring down the overhead of creating instances for specific use cases quite a bit.

The trace_array_get_by_name() now has another parameter "systems". This parameter is a const string pointer of a comma/space separated list of event systems that should be created by the trace_array. (Note that if the trace_array already exists, this parameter is ignored.)

The list of systems is saved, and if a module is loaded, its events will not be added unless the system for those events also matches the systems string.
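A short usage sketch of the new signature (kernel module code; the instance name and system list here are illustrative, in the spirit of the sample module updated at the end of this patch):

	struct trace_array *tr;

	/*
	 * Create (or look up) an instance whose eventfs directory only
	 * contains the "sched" and "timer" systems. Passing NULL as the
	 * second argument keeps the old behavior of creating all events.
	 */
	tr = trace_array_get_by_name("my_instance", "sched,timer");
	if (!tr)
		return -1;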
Link: https://lore.kernel.org/linux-trace-kernel/20231213093701.03fddec0@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Sean Paul Cc: Arun Easi Cc: Daniel Wagner Tested-by: Dmytro Maluka Signed-off-by: Steven Rostedt (Google) --- drivers/scsi/qla2xxx/qla_os.c | 2 +- include/linux/trace.h | 4 ++-- kernel/trace/trace.c | 23 ++++++++++++++---- kernel/trace/trace.h | 1 + kernel/trace/trace_boot.c | 2 +- kernel/trace/trace_events.c | 48 ++++++++++++++++++++++++++++++++++--- samples/ftrace/sample-trace-array.c | 2 +- 7 files changed, 70 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 03348f605c2e..dd674378f2f3 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -2889,7 +2889,7 @@ static void qla2x00_iocb_work_fn(struct work_struct *work) static void qla_trace_init(void) { - qla_trc_array = trace_array_get_by_name("qla2xxx"); + qla_trc_array = trace_array_get_by_name("qla2xxx", NULL); if (!qla_trc_array) { ql_log(ql_log_fatal, NULL, 0x0001, "Unable to create qla2xxx trace instance, instance logging will be disabled.\n"); diff --git a/include/linux/trace.h b/include/linux/trace.h index 2a70a447184c..fdcd76b7be83 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -51,7 +51,7 @@ int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); int trace_array_init_printk(struct trace_array *tr); void trace_array_put(struct trace_array *tr); -struct trace_array *trace_array_get_by_name(const char *name); +struct trace_array *trace_array_get_by_name(const char *name, const char *systems); int trace_array_destroy(struct trace_array *tr); /* For osnoise tracer */ @@ -84,7 +84,7 @@ static inline int trace_array_init_printk(struct trace_array *tr) static inline void trace_array_put(struct trace_array *tr) { } -static inline struct trace_array *trace_array_get_by_name(const char *name) +static inline struct trace_array *trace_array_get_by_name(const char *name, const char *systems) { return NULL; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 199df497db07..59e39b652afb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9490,7 +9490,8 @@ static int trace_array_create_dir(struct trace_array *tr) return ret; } -static struct trace_array *trace_array_create(const char *name) +static struct trace_array * +trace_array_create_systems(const char *name, const char *systems) { struct trace_array *tr; int ret; @@ -9510,6 +9511,12 @@ static struct trace_array *trace_array_create(const char *name) if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) goto out_free_tr; + if (systems) { + tr->system_names = kstrdup_const(systems, GFP_KERNEL); + if (!tr->system_names) + goto out_free_tr; + } + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -9556,12 +9563,18 @@ static struct trace_array *trace_array_create(const char *name) free_trace_buffers(tr); free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr->system_names); kfree(tr->name); kfree(tr); return ERR_PTR(ret); } +static struct trace_array *trace_array_create(const char *name) +{ + return trace_array_create_systems(name, NULL); +} + static int instance_mkdir(const char *name) { struct trace_array *tr; @@ -9587,6 +9600,7 @@ out_unlock: /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. 
* @name: The name of the trace array to be looked up/created. + * @systems: A list of systems to create event directories for (NULL for all) * * Returns pointer to trace array with given name. * NULL, if it cannot be created. @@ -9600,7 +9614,7 @@ out_unlock: * trace_array_put() is called, user space can not delete it. * */ -struct trace_array *trace_array_get_by_name(const char *name) +struct trace_array *trace_array_get_by_name(const char *name, const char *systems) { struct trace_array *tr; @@ -9612,7 +9626,7 @@ struct trace_array *trace_array_get_by_name(const char *name) goto out_unlock; } - tr = trace_array_create(name); + tr = trace_array_create_systems(name, systems); if (IS_ERR(tr)) tr = NULL; @@ -9659,6 +9673,7 @@ static int __remove_instance(struct trace_array *tr) free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr->system_names); kfree(tr->name); kfree(tr); @@ -10377,7 +10392,7 @@ __init static void enable_instances(void) if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) do_allocate_snapshot(tok); - tr = trace_array_get_by_name(tok); + tr = trace_array_get_by_name(tok, NULL); if (!tr) { pr_warn("Failed to create instance buffer %s\n", curr_str); continue; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0489e72c8169..79180aed13ee 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -377,6 +377,7 @@ struct trace_array { unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; + const char *system_names; struct list_head err_log; struct dentry *dir; struct dentry *options; diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7ccc7a8e155b..dbe29b4c6a7a 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -633,7 +633,7 @@ trace_boot_init_instances(struct xbc_node *node) if (!p || *p == '\0') continue; - tr = trace_array_get_by_name(p); + tr = trace_array_get_by_name(p, NULL); if (!tr) { pr_err("Failed to get trace instance %s\n", p); continue; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f29e815ca5b2..b70d03818038 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2896,6 +2896,27 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) up_write(&trace_event_sem); } +static bool event_in_systems(struct trace_event_call *call, + const char *systems) +{ + const char *system; + const char *p; + + if (!systems) + return true; + + system = call->class->system; + p = strstr(systems, system); + if (!p) + return false; + + if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',') + return false; + + p += strlen(system); + return !*p || isspace(*p) || *p == ','; +} + static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -2905,9 +2926,12 @@ trace_create_new_event(struct trace_event_call *call, struct trace_event_file *file; unsigned int first; + if (!event_in_systems(call, tr->system_names)) + return NULL; + file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) - return NULL; + return ERR_PTR(-ENOMEM); pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); @@ -2972,8 +2996,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) struct trace_event_file *file; file = trace_create_new_event(call, tr); + /* + * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed + * allocation, or NULL if the event is not part of the tr->system_names. 
+	 * When the event is not part of the tr->system_names, return zero, not
+	 * an error.
+	 */
 	if (!file)
-		return -ENOMEM;
+		return 0;
+
+	if (IS_ERR(file))
+		return PTR_ERR(file);
 
 	if (eventdir_initialized)
 		return event_create_dir(tr->event_dir, file);
@@ -3012,8 +3045,17 @@ __trace_early_add_new_event(struct trace_event_call *call,
 	int ret;
 
 	file = trace_create_new_event(call, tr);
+	/*
+	 * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+	 * allocation, or NULL if the event is not part of the tr->system_names.
+	 * When the event is not part of the tr->system_names, return zero, not
+	 * an error.
+	 */
 	if (!file)
-		return -ENOMEM;
+		return 0;
+
+	if (IS_ERR(file))
+		return PTR_ERR(file);
 
 	ret = event_define_fields(call);
 	if (ret)
diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c
index 6aba02a31c96..d0ee9001c7b3 100644
--- a/samples/ftrace/sample-trace-array.c
+++ b/samples/ftrace/sample-trace-array.c
@@ -105,7 +105,7 @@ static int __init sample_trace_array_init(void)
 	 * NOTE: This function increments the reference counter
 	 * associated with the trace array - "tr".
 	 */
-	tr = trace_array_get_by_name("sample-instance");
+	tr = trace_array_get_by_name("sample-instance", "sched,timer,kprobes");
 	if (!tr)
 		return -1;
-- cgit

From 0b9036efd83d4adefab197a1ec98c496c9df44f0 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Mon, 11 Dec 2023 13:16:23 -0500
Subject: ring-buffer: Add offset of events in dump on mismatch

On bugs where the ring buffer timestamp gets out of sync, the config CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS checks for it, and when a mismatch is detected it causes a dump of the bad sub-buffer. The dump shows each event and its timestamp as well as the delta in the event. But it's also good to see the offset of each event into the sub-buffer, to know how close to the end it is.

Also print where the last event actually ended compared to where it was expected to end.
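With the new format strings, each dumped event is prefixed with its offset into the sub-buffer, and a trailer compares the expected end with the actual one. A hypothetical dump fragment (every value below is made up; only the layout follows the pr_warn() calls in the diff that follows):

	 0x0f48: [105543465123] delta:58
	 0x0f60: [105543465181] delta:30
	expected end:0x1000 last event actually ended at:0x0f70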
Link: https://lore.kernel.org/linux-trace-kernel/20231211131623.59eaebd2@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 83eab547f1d1..bfe2697a92ee 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3358,29 +3358,34 @@ static void dump_buffer_page(struct buffer_data_page *bpage, case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; - pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); + pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n", + e, ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); ts = rb_fix_abs_ts(delta, ts); - pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); + pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n", + e, ts, delta); break; case RINGBUF_TYPE_PADDING: ts += event->time_delta; - pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d PADDING\n", + e, ts, event->time_delta); break; case RINGBUF_TYPE_DATA: ts += event->time_delta; - pr_warn(" [%lld] delta:%d\n", ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d\n", + e, ts, event->time_delta); break; default: break; } } + pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e); } static DEFINE_PER_CPU(atomic_t, checking); -- cgit From 8ec90be7f15fac42992ea821be929d3b06cd0fd9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Dec 2023 13:19:01 -0500 Subject: tracing: Allow for max buffer data size trace_marker writes Allow a trace write to be as big as the ring buffer tracing data will allow. Currently, it only allows writes of 1KB in size, but there's no reason that it cannot allow what the ring buffer can hold. Link: https://lore.kernel.org/linux-trace-kernel/20231212131901.5f501e72@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 + kernel/trace/ring_buffer.c | 15 +++++++++++++++ kernel/trace/trace.c | 31 ++++++++++++++++++++++++------- 3 files changed, 40 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 782e14f62201..b1b03b2c0f08 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -141,6 +141,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter); bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter); unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu); +unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer); void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bfe2697a92ee..16b640d824f9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5190,6 +5190,21 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_size); +/** + * ring_buffer_max_event_size - return the max data size of an event + * @buffer: The ring buffer. + * + * Returns the maximum size an event can be. 
+ */
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
+{
+	/* If abs timestamp is requested, events have a timestamp too */
+	if (ring_buffer_time_stamp_abs(buffer))
+		return BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND;
+	return BUF_MAX_DATA_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
+
 static void rb_clear_buffer_page(struct buffer_page *page)
 {
 	local_set(&page->write, 0);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 59e39b652afb..dba1328e454b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7278,8 +7278,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	enum event_trigger_type tt = ETT_NONE;
 	struct trace_buffer *buffer;
 	struct print_entry *entry;
+	int meta_size;
 	ssize_t written;
-	int size;
+	size_t size;
 	int len;
 
 /* Used in tracing_mark_raw_write() as well */
@@ -7292,12 +7293,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	if (!(tr->trace_flags & TRACE_ITER_MARKERS))
 		return -EINVAL;
 
-	if (cnt > TRACE_BUF_SIZE)
-		cnt = TRACE_BUF_SIZE;
-
-	BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
+	if ((ssize_t)cnt < 0)
+		return -EINVAL;
 
-	size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
+	meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
+ again:
+	size = cnt + meta_size;
 
 	/* If less than "<faulted>", then make sure we can still add that */
 	if (cnt < FAULTED_SIZE)
@@ -7306,9 +7307,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	buffer = tr->array_buffer.buffer;
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
 					    tracing_gen_ctx());
-	if (unlikely(!event))
+	if (unlikely(!event)) {
+		/*
+		 * If the size was greater than what was allowed, then
+		 * make it smaller and try again.
+		 */
+		if (size > ring_buffer_max_event_size(buffer)) {
+			/* cnt < FAULTED size should never be bigger than max */
+			if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
+				return -EBADF;
+			cnt = ring_buffer_max_event_size(buffer) - meta_size;
+			/* The above should only happen once */
+			if (WARN_ON_ONCE(cnt + meta_size == size))
+				return -EBADF;
+			goto again;
+		}
+
 		/* Ring buffer disabled, return as if not open for write */
 		return -EBADF;
+	}
 
 	entry = ring_buffer_event_data(event);
 	entry->ip = _THIS_IP_;
-- cgit

From 40fc60e36c60ba85b2974e507b67df40c94e9578 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Sat, 9 Dec 2023 17:52:20 -0500
Subject: trace_seq: Increase the buffer size to almost two pages

Now that trace_marker can hold more than a 1KB string, and can write as much as the ring buffer can hold, the trace_seq is not big enough to hold writes:

 ~# a="1234567890"
 ~# cnt=4080
 ~# s=""
 ~# while [ $cnt -gt 10 ]; do
 ~#	s="${s}${a}"
 ~#	cnt=$((cnt-10))
 ~# done
 ~# echo $s > trace_marker
 ~# cat trace
# tracer: nop
#
# entries-in-buffer/entries-written: 2/2   #P:8
#
#                                _-----=> irqs-off/BH-disabled
#                               / _----=> need-resched
#                              | / _---=> hardirq/softirq
#                              || / _--=> preempt-depth
#                              ||| / _-=> migrate-disable
#                              |||| /     delay
#           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
#              | |         |   |||||     |         |
           <...>-860     [002] .....   105.543465: tracing_mark_write[LINE TOO BIG]
           <...>-860     [002] .....   105.543496: tracing_mark_write: 789012345678901234567890

By increasing the trace_seq buffer to almost two pages, it can now print out the first line.

This also subtracts the rest of the trace_seq fields from the buffer, so that the entire trace_seq is now PAGE_SIZE aligned.
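For context on what this enlarges, a minimal trace_seq usage sketch (standard helpers only; nothing here is new in this patch, and the overflow behavior described in the comment is the pre-existing semantics as I understand them):

	struct trace_seq s;

	trace_seq_init(&s);
	trace_seq_printf(&s, "marker: %s\n", "payload");
	/*
	 * After this change a single line can grow to nearly
	 * TRACE_SEQ_BUFFER_SIZE bytes before the sequence overflows
	 * and s.full is set, instead of being capped at PAGE_SIZE.
	 */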
Link: https://lore.kernel.org/linux-trace-kernel/20231209175220.19867af4@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_seq.h | 9 ++++++--- kernel/trace/trace.c | 6 +++--- kernel/trace/trace_seq.c | 3 --- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 3691e0e76a1a..9ec229dfddaa 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -8,11 +8,14 @@ /* * Trace sequences are used to allow a function to call several other functions - * to create a string of data to use (up to a max of PAGE_SIZE). + * to create a string of data to use. */ +#define TRACE_SEQ_BUFFER_SIZE (PAGE_SIZE * 2 - \ + (sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int))) + struct trace_seq { - char buffer[PAGE_SIZE]; + char buffer[TRACE_SEQ_BUFFER_SIZE]; struct seq_buf seq; size_t readpos; int full; @@ -21,7 +24,7 @@ struct trace_seq { static inline void trace_seq_init(struct trace_seq *s) { - seq_buf_init(&s->seq, s->buffer, PAGE_SIZE); + seq_buf_init(&s->seq, s->buffer, TRACE_SEQ_BUFFER_SIZE); s->full = 0; s->readpos = 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dba1328e454b..0be30cccabb4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3753,7 +3753,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, /* OK if part of the temp seq buffer */ if ((addr >= (unsigned long)iter->tmp_seq.buffer) && - (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE)) + (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE)) return true; /* Core rodata can not be freed */ @@ -6932,8 +6932,8 @@ waitagain: goto out; } - if (cnt >= PAGE_SIZE) - cnt = PAGE_SIZE - 1; + if (cnt >= TRACE_SEQ_BUFFER_SIZE) + cnt = TRACE_SEQ_BUFFER_SIZE - 1; /* reset all but tr, trace, and overruns */ trace_iterator_reset(iter); diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 7be97229ddf8..c158d65a8a88 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -13,9 +13,6 @@ * trace_seq_init() more than once to reset the trace_seq to start * from scratch. * - * The buffer size is currently PAGE_SIZE, although it may become dynamic - * in the future. - * * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into * the buffer but it wont update the pointers). This allows users to -- cgit From 9482341d9bdaf194552673b6c1c81a824e985e7f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Dec 2023 19:04:22 -0500 Subject: tracing: Have trace_marker break up by lines by size of trace_seq If a trace_marker write is bigger than what trace_seq can hold, then it will print "LINE TOO BIG" message and not what was written. Instead, check if the write is bigger than the trace_seq and break it up by that size. Ideally, we could make the trace_seq dynamic that could hold this. But that's for another time. 
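The mechanics amount to clamping the reserved size at the trace_seq limit and retrying the reservation; sketched below with illustrative numbers (the byte values are made up, the real bound is TRACE_SEQ_BUFFER_SIZE):

	size = cnt + meta_size;		/* e.g. 9000 + 18 = 9018 */
	if (size > TRACE_SEQ_BUFFER_SIZE) {
		/* shrink cnt by the overshoot and reserve again */
		cnt -= size - TRACE_SEQ_BUFFER_SIZE;
		goto again;
	}
	/*
	 * The write() then returns short, and a well-behaved caller
	 * presumably submits the remaining bytes as a new, separate
	 * marker line.
	 */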
Link: https://lore.kernel.org/linux-trace-kernel/20231212190422.1eaf224f@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0be30cccabb4..9cf58383d2fb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7304,6 +7304,11 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt < FAULTED_SIZE) size += FAULTED_SIZE - cnt; + if (size > TRACE_SEQ_BUFFER_SIZE) { + cnt -= size - TRACE_SEQ_BUFFER_SIZE; + goto again; + } + buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); -- cgit From 76ca20c748684f63bb985166850049f242797f65 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 13 Dec 2023 10:42:18 -0500 Subject: tracing: Increase size of trace_marker_raw to max ring buffer entry There's no reason to give an arbitrary limit to the size of a raw trace marker. Just let it be as big as the size that is allowed by the ring buffer itself. And there's also no reason to artificially break up the write to TRACE_BUF_SIZE, as that's not even used. Link: https://lore.kernel.org/linux-trace-kernel/20231213104218.2efc70c1@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9cf58383d2fb..55dabee4c78b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7365,9 +7365,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return written; } -/* Limit it for now to 3K (including tag) */ -#define RAW_DATA_MAX_SIZE (1024*3) - static ssize_t tracing_mark_raw_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) @@ -7389,19 +7386,18 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, return -EINVAL; /* The marker must at least have a tag id */ - if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE) + if (cnt < sizeof(unsigned int)) return -EINVAL; - if (cnt > TRACE_BUF_SIZE) - cnt = TRACE_BUF_SIZE; - - BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - size = sizeof(*entry) + cnt; if (cnt < FAULT_SIZE_ID) size += FAULT_SIZE_ID - cnt; buffer = tr->array_buffer.buffer; + + if (size > ring_buffer_max_event_size(buffer)) + return -EINVAL; + event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, tracing_gen_ctx()); if (!event) -- cgit From c84897c0ff5925090af0c6a8bf61424b71481764 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 07:43:03 -0500 Subject: ring-buffer: Remove 32bit timestamp logic Each event has a 27 bit timestamp delta that is used to hold the delta from the last event. If the time between events is greater than 2^27, then a timestamp is added that holds a 59 bit absolute timestamp. Until a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp"), if an interrupt interrupted an event in progress, all the events delta would be zero to not deal with the races that need to be handled. The commit a389d86f7fd09 changed that to handle the races giving all events, even those that preempt other events, still have an accurate timestamp. 
To handle those races requires performing 64-bit cmpxchg on the timestamps. But doing 64-bit cmpxchg on 32-bit architectures is considered very slow. To try to deal with this the timestamp logic was broken into two and then three 32-bit cmpxchgs, with the thought that two (or three) 32-bit cmpxchgs are still faster than a single 64-bit cmpxchg on 32-bit architectures. Part of the problem with this is that I didn't have any 32-bit architectures to test on. After hitting several subtle bugs in this code, an effort was made to try and see if three 32-bit cmpxchgs are indeed faster than a single 64-bit. After a few people brushed off the dust of their old 32-bit machines, tests were done, and even though 32-bit cmpxchg was faster than a single 64-bit, it was in the order of 50% at best, not 300%. After some more refactoring of the code, all 4 64-bit cmpxchg were removed: https://lore.kernel.org/linux-trace-kernel/20231211114420.36dde01b@gandalf.local.home https://lore.kernel.org/linux-trace-kernel/20231214222921.193037a7@gandalf.local.home https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorschach.local.home https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home/ With all the 64-bit cmpxchg removed, the complex 32-bit workaround can also be removed. The 32-bit and 64-bit logic is now exactly the same. Link: https://lore.kernel.org/all/20231213214632.15047c40@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231219074303.28f9abda@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Linus Torvalds Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 176 +++------------------------------------------ 1 file changed, 9 insertions(+), 167 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 16b640d824f9..a4ada4f303c4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -27,6 +27,7 @@ #include #include +#include #include /* @@ -463,27 +464,9 @@ enum { RB_CTX_MAX }; -#if BITS_PER_LONG == 32 -#define RB_TIME_32 -#endif - -/* To test on 64 bit machines */ -//#define RB_TIME_32 - -#ifdef RB_TIME_32 - -struct rb_time_struct { - local_t cnt; - local_t top; - local_t bottom; - local_t msb; -}; -#else -#include struct rb_time_struct { local64_t time; }; -#endif typedef struct rb_time_struct rb_time_t; #define MAX_NEST 5 @@ -573,147 +556,14 @@ struct ring_buffer_iter { int missed_events; }; -#ifdef RB_TIME_32 - -/* - * On 32 bit machines, local64_t is very expensive. As the ring - * buffer doesn't need all the features of a true 64 bit atomic, - * on 32 bit, it uses these functions (64 still uses local64_t). - * - * For the ring buffer, 64 bit required operations for the time is - * the following: - * - * - Reads may fail if it interrupted a modification of the time stamp. - * It will succeed if it did not interrupt another write even if - * the read itself is interrupted by a write. - * It returns whether it was successful or not. - * - * - Writes always succeed and will overwrite other writes and writes - * that were done by events interrupting the current write. - * - * - A write followed by a read of the same time stamp will always succeed, - * but may not contain the same value. - * - * - A cmpxchg will fail if it interrupted another write or cmpxchg. - * Other than that, it acts like a normal cmpxchg. 
- * - * The 60 bit time stamp is broken up by 30 bits in a top and bottom half - * (bottom being the least significant 30 bits of the 60 bit time stamp). - * - * The two most significant bits of each half holds a 2 bit counter (0-3). - * Each update will increment this counter by one. - * When reading the top and bottom, if the two counter bits match then the - * top and bottom together make a valid 60 bit number. - */ -#define RB_TIME_SHIFT 30 -#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) -#define RB_TIME_MSB_SHIFT 60 - -static inline int rb_time_cnt(unsigned long val) -{ - return (val >> RB_TIME_SHIFT) & 3; -} - -static inline u64 rb_time_val(unsigned long top, unsigned long bottom) -{ - u64 val; - - val = top & RB_TIME_VAL_MASK; - val <<= RB_TIME_SHIFT; - val |= bottom & RB_TIME_VAL_MASK; - - return val; -} - -static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) -{ - unsigned long top, bottom, msb; - unsigned long c; - - /* - * If the read is interrupted by a write, then the cnt will - * be different. Loop until both top and bottom have been read - * without interruption. - */ - do { - c = local_read(&t->cnt); - top = local_read(&t->top); - bottom = local_read(&t->bottom); - msb = local_read(&t->msb); - } while (c != local_read(&t->cnt)); - - *cnt = rb_time_cnt(top); - - /* If top, msb or bottom counts don't match, this interrupted a write */ - if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom)) - return false; - - /* The shift to msb will lose its cnt bits */ - *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT); - return true; -} - -static bool rb_time_read(rb_time_t *t, u64 *ret) -{ - unsigned long cnt; - - return __rb_time_read(t, ret, &cnt); -} - -static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt) -{ - return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); -} - -static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom, - unsigned long *msb) -{ - *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); - *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); - *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT); -} - -static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) -{ - val = rb_time_val_cnt(val, cnt); - local_set(t, val); -} - -static void rb_time_set(rb_time_t *t, u64 val) -{ - unsigned long cnt, top, bottom, msb; - - rb_time_split(val, &top, &bottom, &msb); - - /* Writes always succeed with a valid number even if it gets interrupted. 
*/ - do { - cnt = local_inc_return(&t->cnt); - rb_time_val_set(&t->top, top, cnt); - rb_time_val_set(&t->bottom, bottom, cnt); - rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt); - } while (cnt != local_read(&t->cnt)); -} - -static inline bool -rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) -{ - return local_try_cmpxchg(l, &expect, set); -} - -#else /* 64 bits */ - -/* local64_t always succeeds */ - -static inline bool rb_time_read(rb_time_t *t, u64 *ret) +static inline void rb_time_read(rb_time_t *t, u64 *ret) { *ret = local64_read(&t->time); - return true; } static void rb_time_set(rb_time_t *t, u64 val) { local64_set(&t->time, val); } -#endif /* * Enable this to make sure that the event passed to @@ -820,10 +670,7 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, WARN_ONCE(1, "nest (%d) greater than max", nest); fail: - /* Can only fail on 32 bit */ - if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) - /* Screw it, just read the current time */ - ts = rb_time_stamp(cpu_buffer->buffer); + rb_time_read(&cpu_buffer->write_stamp, &ts); return ts; } @@ -2820,7 +2667,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, (unsigned long long)info->ts, (unsigned long long)info->before, (unsigned long long)info->after, - (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), + (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" @@ -3497,16 +3344,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write, w; - bool a_ok; - bool b_ok; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); - b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + rb_time_read(&cpu_buffer->before_stamp, &info->before); + rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); @@ -3521,7 +3366,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (!w) { /* Use the sub-buffer timestamp */ info->delta = 0; - } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) { + } else if (unlikely(info->before != info->after)) { info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } else { @@ -3570,8 +3415,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* SLOW PATH - Interrupted between A and C */ /* Save the old before_stamp */ - a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - RB_WARN_ON(cpu_buffer, !a_ok); + rb_time_read(&cpu_buffer->before_stamp, &info->before); /* * Read a new timestamp and update the before_stamp to make @@ -3583,9 +3427,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, rb_time_set(&cpu_buffer->before_stamp, ts); barrier(); - /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - /* Was interrupted before here, write_stamp must be valid */ - RB_WARN_ON(cpu_buffer, !a_ok); + /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && info->after == info->before && info->after < ts) { -- cgit 
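For reference, a minimal userspace model of the split-timestamp scheme this commit deletes (a simplification of the removed rb_time_split()/rb_time_val()/rb_time_cnt() logic; only the top and bottom words are modeled here, while the kernel version also kept msb and cnt words and used local_t for atomicity):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SHIFT		30
#define VAL_MASK	((1UL << SHIFT) - 1)

/* Tag a 30-bit half with a 2-bit update counter in its top bits. */
static unsigned long tag(unsigned long val, unsigned long cnt)
{
	return (val & VAL_MASK) | ((cnt & 3) << SHIFT);
}

static bool read_split(unsigned long top, unsigned long bottom, uint64_t *ret)
{
	/* Mismatched counters mean the read raced with a write. */
	if ((top >> SHIFT) != (bottom >> SHIFT))
		return false;
	*ret = ((uint64_t)(top & VAL_MASK) << SHIFT) | (bottom & VAL_MASK);
	return true;
}

int main(void)
{
	uint64_t ts = 0x123456789ULL;	/* some 60-bit timestamp */
	uint64_t out;
	unsigned long cnt = 1;
	unsigned long top = tag(ts >> SHIFT, cnt);
	unsigned long bottom = tag(ts, cnt);

	if (read_split(top, bottom, &out))
		printf("read ok: %llx\n", (unsigned long long)out);

	/* A torn read: bottom was updated by a newer write (cnt = 2). */
	bottom = tag(ts, 2);
	if (!read_split(top, bottom, &out))
		printf("torn read detected\n");
	return 0;
}

A torn read, where one half was updated by a later write, shows up as a counter mismatch, which is exactly the failure mode the removed rb_time_read() reported.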
From d40dbb617ae9a8fe57343b26b4ad2bd7bb05c130 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 07:45:42 -0500 Subject: ring-buffer: Add interrupt information to dump of data sub-buffer When the ring buffer timestamp verifier triggers, it dumps the content of the sub-buffer. But currently it only dumps the timestamps and the offset of the data as well as the deltas. It would be even more informative if the event data also showed the interrupt context level it was in. That is, each event would show whether it was written in normal, softirq, irq or NMI context. That gives a better idea of how the events may have interrupted each other. As the payload of the ring buffer is really a black box to the ring buffer, just assume that if the payload is at least as large as a trace entry, it is a trace entry. As trace entries have the interrupt context information saved in a flags field, look at that location and report the output of the flags. If the payload is not a trace entry, there's no way to really know, and the information will be garbage. But that's OK, because this is for debugging only (this output is not used in production, as the buffer check that calls it imposes a huge overhead on tracing). This information, when available, is crucial for debugging timestamp issues. If it is garbage, that will be pretty obvious as well. As this output usually happens in kselftests of the tracing code, the user will know what the payload is at the time. Link: https://lore.kernel.org/linux-trace-kernel/20231219074542.6f304601@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Suggested-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 79 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a4ada4f303c4..c0cc45482e1e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3185,6 +3185,76 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); #define CHECK_FULL_PAGE 1L #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS + +static const char *show_irq_str(int bits) +{ + const char *type[] = { + ".", // 0 + "s", // 1 + "h", // 2 + "Hs", // 3 + "n", // 4 + "Ns", // 5 + "Nh", // 6 + "NHs", // 7 + }; + + return type[bits]; +} + +/* Assume this is a trace event */ +static const char *show_flags(struct ring_buffer_event *event) +{ + struct trace_entry *entry; + int bits = 0; + + if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) + return "X"; + + entry = ring_buffer_event_data(event); + + if (entry->flags & TRACE_FLAG_SOFTIRQ) + bits |= 1; + + if (entry->flags & TRACE_FLAG_HARDIRQ) + bits |= 2; + + if (entry->flags & TRACE_FLAG_NMI) + bits |= 4; + + return show_irq_str(bits); + } + +static const char *show_irq(struct ring_buffer_event *event) +{ + struct trace_entry *entry; + + if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) + return ""; + + entry = ring_buffer_event_data(event); + if (entry->flags & TRACE_FLAG_IRQS_OFF) + return "d"; + return ""; +} + +static const char *show_interrupt_level(void) +{ + unsigned long pc = preempt_count(); + unsigned char level = 0; + + if (pc & SOFTIRQ_OFFSET) + level |= 1; + + if (pc & HARDIRQ_MASK) + level |= 2; + + if (pc & NMI_MASK) + level |= 4; + + return show_irq_str(level); +} + static void dump_buffer_page(struct buffer_data_page *bpage, struct
rb_event_info *info, unsigned long tail) @@ -3224,8 +3294,9 @@ static void dump_buffer_page(struct buffer_data_page *bpage, case RINGBUF_TYPE_DATA: ts += event->time_delta; - pr_warn(" 0x%x: [%lld] delta:%d\n", - e, ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d %s%s\n", + e, ts, event->time_delta, + show_flags(event), show_irq(event)); break; default: @@ -3316,11 +3387,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, atomic_inc(&cpu_buffer->record_disabled); /* There's some cases in boot up that this can happen */ WARN_ON_ONCE(system_state != SYSTEM_BOOTING); - pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n", + pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", cpu_buffer->cpu, ts + info->delta, info->ts, info->delta, info->before, info->after, - full ? " (full)" : ""); + full ? " (full)" : "", show_interrupt_level()); dump_buffer_page(bpage, info, tail); atomic_dec(&ts_dump); /* Do not re-enable checking */ -- cgit
From f50345b49b16f7fe6dbebbec065b9248c38556ff Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 07:47:32 -0500 Subject: ring-buffer: Check if absolute timestamp goes backwards When enabled, check_buffer(), which checks the timestamps of the ring buffer sub-buffer page, only checks whether adding the deltas of the events to the last absolute timestamp or the timestamp of the sub-buffer page adds up to the current event's timestamp. What it does not check is whether the absolute timestamp causes the time of the events to go backwards, as that can cause issues elsewhere. Test for the timestamp going backwards too. This also fixes a slight issue where, if the warning triggered at boot up (because of the resetting of the TSC), it would disable all further checks, even those that happen after boot. Have it continue checking if the warning was ignored during boot up. Link: https://lore.kernel.org/linux-trace-kernel/20231219074732.18b092d4@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c0cc45482e1e..f7dc74e45ebf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3309,6 +3309,23 @@ static void dump_buffer_page(struct buffer_data_page *bpage, static DEFINE_PER_CPU(atomic_t, checking); static atomic_t ts_dump; +#define buffer_warn_return(fmt, ...) \ + do { \ + /* If another report is happening, ignore this one */ \ + if (atomic_inc_return(&ts_dump) != 1) { \ + atomic_dec(&ts_dump); \ + goto out; \ + } \ + atomic_inc(&cpu_buffer->record_disabled); \ + pr_warn(fmt, ##__VA_ARGS__); \ + dump_buffer_page(bpage, info, tail); \ + atomic_dec(&ts_dump); \ + /* There's some cases in boot up that this can happen */ \ + if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \ + /* Do not re-enable checking */ \ + return; \ + } while (0) + /* * Check if the current event time stamp matches the deltas on * the buffer page.
@@ -3362,7 +3379,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = rb_fix_abs_ts(delta, ts); + delta = rb_fix_abs_ts(delta, ts); + if (delta < ts) { + buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", + cpu_buffer->cpu, ts, delta); + } + ts = delta; break; case RINGBUF_TYPE_PADDING: @@ -3379,23 +3401,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { - /* If another report is happening, ignore this one */ - if (atomic_inc_return(&ts_dump) != 1) { - atomic_dec(&ts_dump); - goto out; - } - atomic_inc(&cpu_buffer->record_disabled); - /* There's some cases in boot up that this can happen */ - WARN_ON_ONCE(system_state != SYSTEM_BOOTING); - pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", - cpu_buffer->cpu, - ts + info->delta, info->ts, info->delta, - info->before, info->after, - full ? " (full)" : "", show_interrupt_level()); - dump_buffer_page(bpage, info, tail); - atomic_dec(&ts_dump); - /* Do not re-enable checking */ - return; + buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", + cpu_buffer->cpu, + ts + info->delta, info->ts, info->delta, + info->before, info->after, + full ? " (full)" : "", show_interrupt_level()); } out: atomic_dec(this_cpu_ptr(&checking)); -- cgit
From d17aff807f845cf93926c28705216639c7279110 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Dec 2023 07:37:35 -0800 Subject: Revert BPF token-related functionality This patch includes the following reverts (one conflicting BPF FS patch and three token patch sets, represented by merge commits): - revert 0f5d5454c723 "Merge branch 'bpf-fs-mount-options-parsing-follow-ups'"; - revert 750e785796bb "bpf: Support uid and gid when mounting bpffs"; - revert 733763285acf "Merge branch 'bpf-token-support-in-libbpf-s-bpf-object'"; - revert c35919dcce28 "Merge branch 'bpf-token-and-bpf-fs-based-delegation'".
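The shape of what is removed is easiest to see in the capability checks: bpf_token_capable() fell back to plain capability checks when no usable token was present, and the revert returns every call site to those plain checks (bpf_capable(), perfmon_capable(), capable(CAP_NET_ADMIN)). A minimal userspace sketch of that fallback rule, with a stubbed capable() (purely illustrative, not kernel code):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define CAP_SYS_ADMIN	21
#define CAP_BPF		39

struct bpf_token;	/* opaque here; token handling is elided */

/* Stub for the kernel's capable(); pretend we only hold CAP_SYS_ADMIN. */
static bool capable(int cap)
{
	return cap == CAP_SYS_ADMIN;
}

/*
 * Fallback rule from the removed bpf_token_capable(): without a token,
 * a capability is granted if held directly, or if the caller holds
 * CAP_SYS_ADMIN (which implies the more specific capabilities).
 */
static bool token_capable(const struct bpf_token *token, int cap)
{
	(void)token;	/* always NULL in this sketch */
	return capable(cap) ||
	       (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
}

int main(void)
{
	printf("CAP_BPF allowed: %d\n", token_capable(NULL, CAP_BPF));
	return 0;
}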
Link: https://lore.kernel.org/bpf/CAHk-=wg7JuFYwGy=GOMbRCtOL+jwSQsdUaBsRWkDVYbxipbM5A@mail.gmail.com Signed-off-by: Andrii Nakryiko --- drivers/media/rc/bpf-lirc.c | 2 +- include/linux/bpf.h | 85 +- include/linux/filter.h | 2 +- include/linux/lsm_hook_defs.h | 15 +- include/linux/security.h | 43 +- include/uapi/linux/bpf.h | 42 - kernel/bpf/Makefile | 2 +- kernel/bpf/arraymap.c | 2 +- kernel/bpf/bpf_lsm.c | 15 +- kernel/bpf/cgroup.c | 6 +- kernel/bpf/core.c | 3 +- kernel/bpf/helpers.c | 6 +- kernel/bpf/inode.c | 326 +------ kernel/bpf/syscall.c | 215 ++-- kernel/bpf/token.c | 271 ----- kernel/bpf/verifier.c | 13 +- kernel/trace/bpf_trace.c | 2 +- net/core/filter.c | 36 +- net/ipv4/bpf_tcp_ca.c | 2 +- net/netfilter/nf_bpf_link.c | 2 +- security/security.c | 101 +- security/selinux/hooks.c | 47 +- tools/include/uapi/linux/bpf.h | 42 - tools/lib/bpf/Build | 2 +- tools/lib/bpf/bpf.c | 37 +- tools/lib/bpf/bpf.h | 35 +- tools/lib/bpf/btf.c | 7 +- tools/lib/bpf/elf.c | 2 + tools/lib/bpf/features.c | 478 --------- tools/lib/bpf/libbpf.c | 573 ++++++++--- tools/lib/bpf/libbpf.h | 37 +- tools/lib/bpf/libbpf.map | 1 - tools/lib/bpf/libbpf_internal.h | 36 +- tools/lib/bpf/libbpf_probes.c | 8 +- tools/lib/bpf/str_error.h | 3 - .../selftests/bpf/prog_tests/libbpf_probes.c | 4 - .../testing/selftests/bpf/prog_tests/libbpf_str.c | 6 - tools/testing/selftests/bpf/prog_tests/token.c | 1031 -------------------- tools/testing/selftests/bpf/progs/priv_map.c | 13 - tools/testing/selftests/bpf/progs/priv_prog.c | 13 - 40 files changed, 641 insertions(+), 2925 deletions(-) delete mode 100644 kernel/bpf/token.c delete mode 100644 tools/lib/bpf/features.c delete mode 100644 tools/testing/selftests/bpf/prog_tests/token.c delete mode 100644 tools/testing/selftests/bpf/progs/priv_map.c delete mode 100644 tools/testing/selftests/bpf/progs/priv_prog.c (limited to 'kernel') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 6d07693c6b9f..fe17c7f98e81 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_trace_printk: - if (bpf_token_capable(prog->aux->token, CAP_PERFMON)) + if (perfmon_capable()) return bpf_get_trace_printk_proto(); fallthrough; default: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2f54cc0436c4..7a8d4c81a39a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -52,10 +52,6 @@ struct module; struct bpf_func_state; struct ftrace_ops; struct cgroup; -struct bpf_token; -struct user_namespace; -struct super_block; -struct inode; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1488,7 +1484,6 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif - struct bpf_token *token; struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; @@ -1613,31 +1608,6 @@ struct bpf_link_primer { u32 id; }; -struct bpf_mount_opts { - kuid_t uid; - kgid_t gid; - umode_t mode; - - /* BPF token-related delegation options */ - u64 delegate_cmds; - u64 delegate_maps; - u64 delegate_progs; - u64 delegate_attachs; -}; - -struct bpf_token { - struct work_struct work; - atomic64_t refcnt; - struct user_namespace *userns; - u64 allowed_cmds; - u64 allowed_maps; - u64 allowed_progs; - u64 allowed_attachs; -#ifdef CONFIG_SECURITY - void *security; -#endif -}; - struct bpf_struct_ops_value; struct btf_member; @@ -2097,7 +2067,6 @@ static 
inline void bpf_enable_instrumentation(void) migrate_enable(); } -extern const struct super_operations bpf_super_ops; extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; @@ -2232,26 +2201,24 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map) extern int sysctl_unprivileged_bpf_disabled; -bool bpf_token_capable(const struct bpf_token *token, int cap); - -static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token) +static inline bool bpf_allow_ptr_leaks(void) { - return bpf_token_capable(token, CAP_PERFMON); + return perfmon_capable(); } -static inline bool bpf_allow_uninit_stack(const struct bpf_token *token) +static inline bool bpf_allow_uninit_stack(void) { - return bpf_token_capable(token, CAP_PERFMON); + return perfmon_capable(); } -static inline bool bpf_bypass_spec_v1(const struct bpf_token *token) +static inline bool bpf_bypass_spec_v1(void) { - return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); + return cpu_mitigations_off() || perfmon_capable(); } -static inline bool bpf_bypass_spec_v4(const struct bpf_token *token) +static inline bool bpf_bypass_spec_v4(void) { - return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); + return cpu_mitigations_off() || perfmon_capable(); } int bpf_map_new_fd(struct bpf_map *map, int flags); @@ -2268,21 +2235,8 @@ int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); -void bpf_token_inc(struct bpf_token *token); -void bpf_token_put(struct bpf_token *token); -int bpf_token_create(union bpf_attr *attr); -struct bpf_token *bpf_token_get_from_fd(u32 ufd); - -bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); -bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); -bool bpf_token_allow_prog_type(const struct bpf_token *token, - enum bpf_prog_type prog_type, - enum bpf_attach_type attach_type); - int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); -struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, - umode_t mode); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) 
\ @@ -2526,8 +2480,7 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); -const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id, - const struct bpf_prog *prog); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); @@ -2646,24 +2599,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } -static inline bool bpf_token_capable(const struct bpf_token *token, int cap) -{ - return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); -} - -static inline void bpf_token_inc(struct bpf_token *token) -{ -} - -static inline void bpf_token_put(struct bpf_token *token) -{ -} - -static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline void __dev_flush(void) { } @@ -2787,7 +2722,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log, } static inline const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +bpf_base_func_proto(enum bpf_func_id func_id) { return NULL; } diff --git a/include/linux/filter.h b/include/linux/filter.h index 12d907f17d36..68fb6c8142fe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF)) + if (bpf_jit_harden == 1 && bpf_capable()) return false; return true; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 3fdd00b452ac..ff217a5ce552 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -398,17 +398,10 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) -LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) -LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) -LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) -LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) -LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr, - struct path *path) -LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token) -LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd) -LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap) +LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) +LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) +LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux) +LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h b/include/linux/security.h index 00809d2d5c38..1d1df326c881 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,7 +32,6 @@ #include #include #include -#include struct 
linux_binprm; struct cred; @@ -2021,22 +2020,15 @@ static inline void securityfs_remove(struct dentry *dentry) union bpf_attr; struct bpf_map; struct bpf_prog; -struct bpf_token; +struct bpf_prog_aux; #ifdef CONFIG_SECURITY extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); -extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token); +extern int security_bpf_map_alloc(struct bpf_map *map); extern void security_bpf_map_free(struct bpf_map *map); -extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token); -extern void security_bpf_prog_free(struct bpf_prog *prog); -extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, - struct path *path); -extern void security_bpf_token_free(struct bpf_token *token); -extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd); -extern int security_bpf_token_capable(const struct bpf_token *token, int cap); +extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux); +extern void security_bpf_prog_free(struct bpf_prog_aux *aux); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2054,8 +2046,7 @@ static inline int security_bpf_prog(struct bpf_prog *prog) return 0; } -static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) +static inline int security_bpf_map_alloc(struct bpf_map *map) { return 0; } @@ -2063,33 +2054,13 @@ static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *a static inline void security_bpf_map_free(struct bpf_map *map) { } -static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) +static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux) { return 0; } -static inline void security_bpf_prog_free(struct bpf_prog *prog) +static inline void security_bpf_prog_free(struct bpf_prog_aux *aux) { } - -static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, - struct path *path) -{ - return 0; -} - -static inline void security_bpf_token_free(struct bpf_token *token) -{ } - -static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) -{ - return 0; -} - -static inline int security_bpf_token_capable(const struct bpf_token *token, int cap) -{ - return 0; -} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 42f4d3090efe..754e68ca8744 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -847,36 +847,6 @@ union bpf_iter_link_info { * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * - * BPF_TOKEN_CREATE - * Description - * Create BPF token with embedded information about what - * BPF-related functionality it allows: - * - a set of allowed bpf() syscall commands; - * - a set of allowed BPF map types to be created with - * BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed; - * - a set of allowed BPF program types and BPF program attach - * types to be loaded with BPF_PROG_LOAD command, if - * BPF_PROG_LOAD itself is allowed. - * - * BPF token is created (derived) from an instance of BPF FS, - * assuming it has necessary delegation mount options specified. 
- * This BPF token can be passed as an extra parameter to various - * bpf() syscall commands to grant BPF subsystem functionality to - * unprivileged processes. - * - * When created, BPF token is "associated" with the owning - * user namespace of BPF FS instance (super block) that it was - * derived from, and subsequent BPF operations performed with - * BPF token would be performing capabilities checks (i.e., - * CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within - * that user namespace. Without BPF token, such capabilities - * have to be granted in init user namespace, making bpf() - * syscall incompatible with user namespace, for the most part. - * - * Return - * A new file descriptor (a nonnegative integer), or -1 if an - * error occurred (in which case, *errno* is set appropriately). - * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -931,8 +901,6 @@ enum bpf_cmd { BPF_ITER_CREATE, BPF_LINK_DETACH, BPF_PROG_BIND_MAP, - BPF_TOKEN_CREATE, - __MAX_BPF_CMD, }; enum bpf_map_type { @@ -983,7 +951,6 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, - __MAX_BPF_MAP_TYPE }; /* Note that tracing related programs such as @@ -1028,7 +995,6 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ BPF_PROG_TYPE_NETFILTER, - __MAX_BPF_PROG_TYPE }; enum bpf_attach_type { @@ -1437,7 +1403,6 @@ union bpf_attr { * to using 5 hash functions). */ __u64 map_extra; - __u32 map_token_fd; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -1507,7 +1472,6 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). */ __u32 log_true_size; - __u32 prog_token_fd; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -1620,7 +1584,6 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). 
*/ __u32 btf_log_true_size; - __u32 btf_token_fd; }; struct { @@ -1751,11 +1714,6 @@ union bpf_attr { __u32 flags; /* extra flags */ } prog_bind_map; - struct { /* struct used by BPF_TOKEN_CREATE command */ - __u32 flags; - __u32 bpffs_fd; - } token_create; - } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 4ce95acfcaa7..f526b7573e97 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 13358675ff2e..0bdbbbeab155 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -82,7 +82,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL); + bool bypass_spec_v1 = bpf_bypass_spec_v1(); u64 array_size, mask64; struct bpf_array *array; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 63b4dc495125..e8e910395bf6 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -260,15 +260,9 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) BTF_SET_START(sleepable_lsm_hooks) BTF_ID(func, bpf_lsm_bpf) BTF_ID(func, bpf_lsm_bpf_map) -BTF_ID(func, bpf_lsm_bpf_map_create) -BTF_ID(func, bpf_lsm_bpf_map_free) +BTF_ID(func, bpf_lsm_bpf_map_alloc_security) +BTF_ID(func, bpf_lsm_bpf_map_free_security) BTF_ID(func, bpf_lsm_bpf_prog) -BTF_ID(func, bpf_lsm_bpf_prog_load) -BTF_ID(func, bpf_lsm_bpf_prog_free) -BTF_ID(func, bpf_lsm_bpf_token_create) -BTF_ID(func, bpf_lsm_bpf_token_free) -BTF_ID(func, bpf_lsm_bpf_token_cmd) -BTF_ID(func, bpf_lsm_bpf_token_capable) BTF_ID(func, bpf_lsm_bprm_check_security) BTF_ID(func, bpf_lsm_bprm_committed_creds) BTF_ID(func, bpf_lsm_bprm_committing_creds) @@ -363,8 +357,9 @@ BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) BTF_SET_START(untrusted_lsm_hooks) -BTF_ID(func, bpf_lsm_bpf_map_free) -BTF_ID(func, bpf_lsm_bpf_prog_free) +BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_prog_alloc_security) +BTF_ID(func, bpf_lsm_bpf_prog_free_security) BTF_ID(func, bpf_lsm_file_alloc_security) BTF_ID(func, bpf_lsm_file_free_security) #ifdef CONFIG_SECURITY_NETWORK diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 98e0e3835b28..491d20038cbe 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1630,7 +1630,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -2191,7 +2191,7 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return 
&bpf_event_output_data_proto; default: - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -2348,7 +2348,7 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 14ace23d517b..ea6843be2616 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -682,7 +682,7 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !bpf_token_capable(fp->aux->token, CAP_BPF)) + !bpf_capable()) return; bpf_prog_ksym_set_addr(fp); @@ -2779,7 +2779,6 @@ void bpf_prog_free(struct bpf_prog *fp) if (aux->dst_prog) bpf_prog_put(aux->dst_prog); - bpf_token_put(aux->token); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 07fd4b5704f3..be72824f32b2 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1679,7 +1679,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +bpf_base_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -1730,7 +1730,7 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) break; } - if (!bpf_token_capable(prog->aux->token, CAP_BPF)) + if (!bpf_capable()) return NULL; switch (func_id) { @@ -1788,7 +1788,7 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) break; } - if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) + if (!perfmon_capable()) return NULL; switch (func_id) { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4383b3d13a55..1aafb2ff2e95 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -20,7 +20,6 @@ #include #include #include -#include #include "preload/bpf_preload.h" enum bpf_type { @@ -99,9 +98,9 @@ static const struct inode_operations bpf_prog_iops = { }; static const struct inode_operations bpf_map_iops = { }; static const struct inode_operations bpf_link_iops = { }; -struct inode *bpf_get_inode(struct super_block *sb, - const struct inode *dir, - umode_t mode) +static struct inode *bpf_get_inode(struct super_block *sb, + const struct inode *dir, + umode_t mode) { struct inode *inode; @@ -595,183 +594,15 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ } EXPORT_SYMBOL(bpf_prog_get_type_path); -struct bpffs_btf_enums { - const struct btf *btf; - const struct btf_type *cmd_t; - const struct btf_type *map_t; - const struct btf_type *prog_t; - const struct btf_type *attach_t; -}; - -static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) -{ - const struct btf *btf; - const struct btf_type *t; - const char *name; - int i, n; - - memset(info, 0, sizeof(*info)); - - btf = bpf_get_btf_vmlinux(); - if (IS_ERR(btf)) - return PTR_ERR(btf); - if (!btf) - return -ENOENT; - - info->btf = btf; - - for (i = 1, n = btf_nr_types(btf); i < n; i++) { - t = btf_type_by_id(btf, i); - if (!btf_type_is_enum(t)) - continue; - - name = btf_name_by_offset(btf, t->name_off); - if (!name) - continue; - - if (strcmp(name, "bpf_cmd") == 0) - info->cmd_t = t; - else if (strcmp(name, 
"bpf_map_type") == 0) - info->map_t = t; - else if (strcmp(name, "bpf_prog_type") == 0) - info->prog_t = t; - else if (strcmp(name, "bpf_attach_type") == 0) - info->attach_t = t; - else - continue; - - if (info->cmd_t && info->map_t && info->prog_t && info->attach_t) - return 0; - } - - return -ESRCH; -} - -static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t, - const char *prefix, const char *str, int *value) -{ - const struct btf_enum *e; - const char *name; - int i, n, pfx_len = strlen(prefix); - - *value = 0; - - if (!btf || !enum_t) - return false; - - for (i = 0, n = btf_vlen(enum_t); i < n; i++) { - e = &btf_enum(enum_t)[i]; - - name = btf_name_by_offset(btf, e->name_off); - if (!name || strncasecmp(name, prefix, pfx_len) != 0) - continue; - - /* match symbolic name case insensitive and ignoring prefix */ - if (strcasecmp(name + pfx_len, str) == 0) { - *value = e->val; - return true; - } - } - - return false; -} - -static void seq_print_delegate_opts(struct seq_file *m, - const char *opt_name, - const struct btf *btf, - const struct btf_type *enum_t, - const char *prefix, - u64 delegate_msk, u64 any_msk) -{ - const struct btf_enum *e; - bool first = true; - const char *name; - u64 msk; - int i, n, pfx_len = strlen(prefix); - - delegate_msk &= any_msk; /* clear unknown bits */ - - if (delegate_msk == 0) - return; - - seq_printf(m, ",%s", opt_name); - if (delegate_msk == any_msk) { - seq_printf(m, "=any"); - return; - } - - if (btf && enum_t) { - for (i = 0, n = btf_vlen(enum_t); i < n; i++) { - e = &btf_enum(enum_t)[i]; - name = btf_name_by_offset(btf, e->name_off); - if (!name || strncasecmp(name, prefix, pfx_len) != 0) - continue; - msk = 1ULL << e->val; - if (delegate_msk & msk) { - /* emit lower-case name without prefix */ - seq_printf(m, "%c", first ? '=' : ':'); - name += pfx_len; - while (*name) { - seq_printf(m, "%c", tolower(*name)); - name++; - } - - delegate_msk &= ~msk; - first = false; - } - } - } - if (delegate_msk) - seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk); -} - /* * Display the mount options in /proc/mounts. 
*/ static int bpf_show_options(struct seq_file *m, struct dentry *root) { - struct bpf_mount_opts *opts = root->d_sb->s_fs_info; - struct inode *inode = d_inode(root); - umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX; - u64 mask; - - if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) - seq_printf(m, ",uid=%u", - from_kuid_munged(&init_user_ns, inode->i_uid)); - if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) - seq_printf(m, ",gid=%u", - from_kgid_munged(&init_user_ns, inode->i_gid)); + umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; + if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); - - if (opts->delegate_cmds || opts->delegate_maps || - opts->delegate_progs || opts->delegate_attachs) { - struct bpffs_btf_enums info; - - /* ignore errors, fallback to hex */ - (void)find_bpffs_btf_enums(&info); - - mask = (1ULL << __MAX_BPF_CMD) - 1; - seq_print_delegate_opts(m, "delegate_cmds", - info.btf, info.cmd_t, "BPF_", - opts->delegate_cmds, mask); - - mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; - seq_print_delegate_opts(m, "delegate_maps", - info.btf, info.map_t, "BPF_MAP_TYPE_", - opts->delegate_maps, mask); - - mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; - seq_print_delegate_opts(m, "delegate_progs", - info.btf, info.prog_t, "BPF_PROG_TYPE_", - opts->delegate_progs, mask); - - mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; - seq_print_delegate_opts(m, "delegate_attachs", - info.btf, info.attach_t, "BPF_", - opts->delegate_attachs, mask); - } - return 0; } @@ -786,7 +617,7 @@ static void bpf_free_inode(struct inode *inode) free_inode_nonrcu(inode); } -const struct super_operations bpf_super_ops = { +static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, @@ -794,33 +625,23 @@ const struct super_operations bpf_super_ops = { }; enum { - OPT_UID, - OPT_GID, OPT_MODE, - OPT_DELEGATE_CMDS, - OPT_DELEGATE_MAPS, - OPT_DELEGATE_PROGS, - OPT_DELEGATE_ATTACHS, }; static const struct fs_parameter_spec bpf_fs_parameters[] = { - fsparam_u32 ("uid", OPT_UID), - fsparam_u32 ("gid", OPT_GID), fsparam_u32oct ("mode", OPT_MODE), - fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS), - fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS), - fsparam_string ("delegate_progs", OPT_DELEGATE_PROGS), - fsparam_string ("delegate_attachs", OPT_DELEGATE_ATTACHS), {} }; +struct bpf_mount_opts { + umode_t mode; +}; + static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct bpf_mount_opts *opts = fc->s_fs_info; + struct bpf_mount_opts *opts = fc->fs_private; struct fs_parse_result result; - kuid_t uid; - kgid_t gid; - int opt, err; + int opt; opt = fs_parse(fc, bpf_fs_parameters, param, &result); if (opt < 0) { @@ -841,104 +662,12 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) } switch (opt) { - case OPT_UID: - uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(uid)) - goto bad_value; - - /* - * The requested uid must be representable in the - * filesystem's idmapping. - */ - if (!kuid_has_mapping(fc->user_ns, uid)) - goto bad_value; - - opts->uid = uid; - break; - case OPT_GID: - gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(gid)) - goto bad_value; - - /* - * The requested gid must be representable in the - * filesystem's idmapping. 
- */ - if (!kgid_has_mapping(fc->user_ns, gid)) - goto bad_value; - - opts->gid = gid; - break; case OPT_MODE: opts->mode = result.uint_32 & S_IALLUGO; break; - case OPT_DELEGATE_CMDS: - case OPT_DELEGATE_MAPS: - case OPT_DELEGATE_PROGS: - case OPT_DELEGATE_ATTACHS: { - struct bpffs_btf_enums info; - const struct btf_type *enum_t; - const char *enum_pfx; - u64 *delegate_msk, msk = 0; - char *p; - int val; - - /* ignore errors, fallback to hex */ - (void)find_bpffs_btf_enums(&info); - - switch (opt) { - case OPT_DELEGATE_CMDS: - delegate_msk = &opts->delegate_cmds; - enum_t = info.cmd_t; - enum_pfx = "BPF_"; - break; - case OPT_DELEGATE_MAPS: - delegate_msk = &opts->delegate_maps; - enum_t = info.map_t; - enum_pfx = "BPF_MAP_TYPE_"; - break; - case OPT_DELEGATE_PROGS: - delegate_msk = &opts->delegate_progs; - enum_t = info.prog_t; - enum_pfx = "BPF_PROG_TYPE_"; - break; - case OPT_DELEGATE_ATTACHS: - delegate_msk = &opts->delegate_attachs; - enum_t = info.attach_t; - enum_pfx = "BPF_"; - break; - default: - return -EINVAL; - } - - while ((p = strsep(&param->string, ":"))) { - if (strcmp(p, "any") == 0) { - msk |= ~0ULL; - } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { - msk |= 1ULL << val; - } else { - err = kstrtou64(p, 0, &msk); - if (err) - return err; - } - } - - /* Setting delegation mount options requires privileges */ - if (msk && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - *delegate_msk |= msk; - break; - } - default: - /* ignore unknown mount options */ - break; } return 0; - -bad_value: - return invalfc(fc, "Bad value for '%s'", param->key); } struct bpf_preload_ops *bpf_preload_ops; @@ -1010,14 +739,10 @@ out: static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; - struct bpf_mount_opts *opts = sb->s_fs_info; + struct bpf_mount_opts *opts = fc->fs_private; struct inode *inode; int ret; - /* Mounting an instance of BPF FS requires privileges */ - if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN)) - return -EPERM; - ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -1025,8 +750,6 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &bpf_super_ops; inode = sb->s_root->d_inode; - inode->i_uid = opts->uid; - inode->i_gid = opts->gid; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; populate_bpffs(sb->s_root); @@ -1041,7 +764,7 @@ static int bpf_get_tree(struct fs_context *fc) static void bpf_free_fc(struct fs_context *fc) { - kfree(fc->s_fs_info); + kfree(fc->fs_private); } static const struct fs_context_operations bpf_context_ops = { @@ -1062,35 +785,18 @@ static int bpf_init_fs_context(struct fs_context *fc) return -ENOMEM; opts->mode = S_IRWXUGO; - opts->uid = current_fsuid(); - opts->gid = current_fsgid(); - - /* start out with no BPF token delegation enabled */ - opts->delegate_cmds = 0; - opts->delegate_maps = 0; - opts->delegate_progs = 0; - opts->delegate_attachs = 0; - fc->s_fs_info = opts; + fc->fs_private = opts; fc->ops = &bpf_context_ops; return 0; } -static void bpf_kill_super(struct super_block *sb) -{ - struct bpf_mount_opts *opts = sb->s_fs_info; - - kill_litter_super(sb); - kfree(opts); -} - static struct file_system_type bpf_fs_type = { .owner = THIS_MODULE, .name = "bpf", .init_fs_context = bpf_init_fs_context, .parameters = bpf_fs_parameters, - .kill_sb = bpf_kill_super, - .fs_flags = FS_USERNS_MOUNT, + .kill_sb = kill_litter_super, }; static int __init bpf_init(void) diff --git
a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8faa1a20edf8..1bf9805ee185 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1011,8 +1011,8 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_check_btf(struct bpf_map *map, struct bpf_token *token, - const struct btf *btf, u32 btf_key_id, u32 btf_value_id) +static int map_check_btf(struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) { const struct btf_type *key_type, *value_type; u32 key_size, value_size; @@ -1040,7 +1040,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, if (!IS_ERR_OR_NULL(map->record)) { int i; - if (!bpf_token_capable(token, CAP_BPF)) { + if (!bpf_capable()) { ret = -EPERM; goto free_map_tab; } @@ -1123,17 +1123,11 @@ free_map_tab: return ret; } -static bool bpf_net_capable(void) -{ - return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); -} - -#define BPF_MAP_CREATE_LAST_FIELD map_token_fd +#define BPF_MAP_CREATE_LAST_FIELD map_extra /* called via syscall */ static int map_create(union bpf_attr *attr) { const struct bpf_map_ops *ops; - struct bpf_token *token = NULL; int numa_node = bpf_map_attr_numa_node(attr); u32 map_type = attr->map_type; struct bpf_map *map; @@ -1184,32 +1178,14 @@ static int map_create(union bpf_attr *attr) if (!ops->map_mem_usage) return -EINVAL; - if (attr->map_token_fd) { - token = bpf_token_get_from_fd(attr->map_token_fd); - if (IS_ERR(token)) - return PTR_ERR(token); - - /* if current token doesn't grant map creation permissions, - * then we can't use this token, so ignore it and rely on - * system-wide capabilities checks - */ - if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) || - !bpf_token_allow_map_type(token, attr->map_type)) { - bpf_token_put(token); - token = NULL; - } - } - - err = -EPERM; - /* Intent here is for unprivileged_bpf_disabled to block BPF map * creation for unprivileged users; other actions depend * on fd availability and access to bpffs, so are dependent on * object creation success. Even with unprivileged BPF disabled, * capability checks are still carried out. 
*/ - if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF)) - goto put_token; + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) + return -EPERM; /* check privileged map type permissions */ switch (map_type) { @@ -1242,27 +1218,25 @@ static int map_create(union bpf_attr *attr) case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: - if (!bpf_token_capable(token, CAP_BPF)) - goto put_token; + if (!bpf_capable()) + return -EPERM; break; case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: case BPF_MAP_TYPE_XSKMAP: - if (!bpf_token_capable(token, CAP_NET_ADMIN)) - goto put_token; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; break; default: WARN(1, "unsupported map type %d", map_type); - goto put_token; + return -EPERM; } map = ops->map_alloc(attr); - if (IS_ERR(map)) { - err = PTR_ERR(map); - goto put_token; - } + if (IS_ERR(map)) + return PTR_ERR(map); map->ops = ops; map->map_type = map_type; @@ -1299,7 +1273,7 @@ static int map_create(union bpf_attr *attr) map->btf = btf; if (attr->btf_value_type_id) { - err = map_check_btf(map, token, btf, attr->btf_key_type_id, + err = map_check_btf(map, btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) goto free_map; @@ -1311,16 +1285,15 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token); + err = security_bpf_map_alloc(map); if (err) - goto free_map_sec; + goto free_map; err = bpf_map_alloc_id(map); if (err) goto free_map_sec; bpf_map_save_memcg(map); - bpf_token_put(token); err = bpf_map_new_fd(map, f_flags); if (err < 0) { @@ -1341,8 +1314,6 @@ free_map_sec: free_map: btf_put(map->btf); map->ops->map_free(map); -put_token: - bpf_token_put(token); return err; } @@ -2173,7 +2144,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); free_uid(aux->user); - security_bpf_prog_free(aux->prog); + security_bpf_prog_free(aux); bpf_prog_free(aux->prog); } @@ -2619,15 +2590,13 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd +#define BPF_PROG_LOAD_LAST_FIELD log_true_size static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; struct btf *attach_btf = NULL; - struct bpf_token *token = NULL; - bool bpf_cap; int err; char license[128]; @@ -2644,31 +2613,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) BPF_F_TEST_REG_INVARIANTS)) return -EINVAL; - bpf_prog_load_fixup_attach_type(attr); - - if (attr->prog_token_fd) { - token = bpf_token_get_from_fd(attr->prog_token_fd); - if (IS_ERR(token)) - return PTR_ERR(token); - /* if current token doesn't grant prog loading permissions, - * then we can't use this token, so ignore it and rely on - * system-wide capabilities checks - */ - if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) || - !bpf_token_allow_prog_type(token, attr->prog_type, - attr->expected_attach_type)) { - bpf_token_put(token); - token = NULL; - } - } - - bpf_cap = bpf_token_capable(token, CAP_BPF); - err = -EPERM; - if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !bpf_cap) - goto put_token; + !bpf_capable()) + return -EPERM; /* Intent here is for 
unprivileged_bpf_disabled to block BPF program * creation for unprivileged users; other actions depend @@ -2677,23 +2625,21 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) * capability checks are still carried out for these * and other operations. */ - if (sysctl_unprivileged_bpf_disabled && !bpf_cap) - goto put_token; + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) + return -EPERM; if (attr->insn_cnt == 0 || - attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) { - err = -E2BIG; - goto put_token; - } + attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !bpf_cap) - goto put_token; + !bpf_capable()) + return -EPERM; - if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN)) - goto put_token; - if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) - goto put_token; + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (is_perfmon_prog_type(type) && !perfmon_capable()) + return -EPERM; /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog * or btf, we need to check which one it is @@ -2703,33 +2649,27 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (IS_ERR(dst_prog)) { dst_prog = NULL; attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); - if (IS_ERR(attach_btf)) { - err = -EINVAL; - goto put_token; - } + if (IS_ERR(attach_btf)) + return -EINVAL; if (!btf_is_kernel(attach_btf)) { /* attaching through specifying bpf_prog's BTF * objects directly might be supported eventually */ btf_put(attach_btf); - err = -ENOTSUPP; - goto put_token; + return -ENOTSUPP; } } } else if (attr->attach_btf_id) { /* fall back to vmlinux BTF, if BTF type ID is specified */ attach_btf = bpf_get_btf_vmlinux(); - if (IS_ERR(attach_btf)) { - err = PTR_ERR(attach_btf); - goto put_token; - } - if (!attach_btf) { - err = -EINVAL; - goto put_token; - } + if (IS_ERR(attach_btf)) + return PTR_ERR(attach_btf); + if (!attach_btf) + return -EINVAL; btf_get(attach_btf); } + bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, attach_btf, attr->attach_btf_id, dst_prog)) { @@ -2737,8 +2677,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) bpf_prog_put(dst_prog); if (attach_btf) btf_put(attach_btf); - err = -EINVAL; - goto put_token; + return -EINVAL; } /* plain bpf_prog allocation */ @@ -2748,8 +2687,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) bpf_prog_put(dst_prog); if (attach_btf) btf_put(attach_btf); - err = -EINVAL; - goto put_token; + return -ENOMEM; } prog->expected_attach_type = attr->expected_attach_type; @@ -2760,9 +2698,9 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; - /* move token into prog->aux, reuse taken refcnt */ - prog->aux->token = token; - token = NULL; + err = security_bpf_prog_alloc(prog->aux); + if (err) + goto free_prog; prog->aux->user = get_current_user(); prog->len = attr->insn_cnt; @@ -2771,12 +2709,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (copy_from_bpfptr(prog->insns, make_bpfptr(attr->insns, uattr.is_kernel), bpf_prog_insn_size(prog)) != 
0) - goto free_prog; + goto free_prog_sec; /* copy eBPF program license from user space */ if (strncpy_from_bpfptr(license, make_bpfptr(attr->license, uattr.is_kernel), sizeof(license) - 1) < 0) - goto free_prog; + goto free_prog_sec; license[sizeof(license) - 1] = 0; /* eBPF programs must be GPL compatible to use GPL-ed functions */ @@ -2790,29 +2728,25 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_dev_bound_init(prog, attr); if (err) - goto free_prog; + goto free_prog_sec; } if (type == BPF_PROG_TYPE_EXT && dst_prog && bpf_prog_is_dev_bound(dst_prog->aux)) { err = bpf_prog_dev_bound_inherit(prog, dst_prog); if (err) - goto free_prog; + goto free_prog_sec; } /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) - goto free_prog; + goto free_prog_sec; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) - goto free_prog; - - err = security_bpf_prog_load(prog, attr, token); - if (err) goto free_prog_sec; /* run eBPF verifier */ @@ -2858,16 +2792,13 @@ free_used_maps: */ __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; - free_prog_sec: - security_bpf_prog_free(prog); -free_prog: free_uid(prog->aux->user); + security_bpf_prog_free(prog->aux); +free_prog: if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); bpf_prog_free(prog); -put_token: - bpf_token_put(token); return err; } @@ -3857,7 +3788,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: - if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN)) /* cg-skb progs can be loaded by unpriv user. * check permissions at attach time. 
*/ @@ -4060,7 +3991,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - if (!bpf_net_capable()) + if (!capable(CAP_NET_ADMIN)) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) return -EINVAL; @@ -4828,31 +4759,15 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return err; } -#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd +#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { - struct bpf_token *token = NULL; - if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (attr->btf_token_fd) { - token = bpf_token_get_from_fd(attr->btf_token_fd); - if (IS_ERR(token)) - return PTR_ERR(token); - if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) { - bpf_token_put(token); - token = NULL; - } - } - - if (!bpf_token_capable(token, CAP_BPF)) { - bpf_token_put(token); + if (!bpf_capable()) return -EPERM; - } - - bpf_token_put(token); return btf_new_fd(attr, uattr, uattr_size); } @@ -5470,20 +5385,6 @@ out_prog_put: return ret; } -#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd - -static int token_create(union bpf_attr *attr) -{ - if (CHECK_ATTR(BPF_TOKEN_CREATE)) - return -EINVAL; - - /* no flags are supported yet */ - if (attr->token_create.flags) - return -EINVAL; - - return bpf_token_create(attr); -} - static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -5617,9 +5518,6 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) case BPF_PROG_BIND_MAP: err = bpf_prog_bind_map(&attr); break; - case BPF_TOKEN_CREATE: - err = token_create(&attr); - break; default: err = -EINVAL; break; @@ -5726,7 +5624,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = { const struct bpf_func_proto * __weak tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } BPF_CALL_1(bpf_sys_close, u32, fd) @@ -5776,8 +5674,7 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sys_bpf: - return !bpf_token_capable(prog->aux->token, CAP_PERFMON) - ? NULL : &bpf_sys_bpf_proto; + return !perfmon_capable() ? 
NULL : &bpf_sys_bpf_proto; case BPF_FUNC_btf_find_by_name_kind: return &bpf_btf_find_by_name_kind_proto; case BPF_FUNC_sys_close: diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c deleted file mode 100644 index a86fccd57e2d..000000000000 --- a/kernel/bpf/token.c +++ /dev/null @@ -1,271 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -bool bpf_token_capable(const struct bpf_token *token, int cap) -{ - /* BPF token allows ns_capable() level of capabilities, but only if - * token's userns is *exactly* the same as current user's userns - */ - if (token && current_user_ns() == token->userns) { - if (ns_capable(token->userns, cap) || - (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN))) - return security_bpf_token_capable(token, cap) == 0; - } - /* otherwise fallback to capable() checks */ - return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); -} - -void bpf_token_inc(struct bpf_token *token) -{ - atomic64_inc(&token->refcnt); -} - -static void bpf_token_free(struct bpf_token *token) -{ - security_bpf_token_free(token); - put_user_ns(token->userns); - kvfree(token); -} - -static void bpf_token_put_deferred(struct work_struct *work) -{ - struct bpf_token *token = container_of(work, struct bpf_token, work); - - bpf_token_free(token); -} - -void bpf_token_put(struct bpf_token *token) -{ - if (!token) - return; - - if (!atomic64_dec_and_test(&token->refcnt)) - return; - - INIT_WORK(&token->work, bpf_token_put_deferred); - schedule_work(&token->work); -} - -static int bpf_token_release(struct inode *inode, struct file *filp) -{ - struct bpf_token *token = filp->private_data; - - bpf_token_put(token); - return 0; -} - -static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) -{ - struct bpf_token *token = filp->private_data; - u64 mask; - - BUILD_BUG_ON(__MAX_BPF_CMD >= 64); - mask = (1ULL << __MAX_BPF_CMD) - 1; - if ((token->allowed_cmds & mask) == mask) - seq_printf(m, "allowed_cmds:\tany\n"); - else - seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds); - - BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64); - mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; - if ((token->allowed_maps & mask) == mask) - seq_printf(m, "allowed_maps:\tany\n"); - else - seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps); - - BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64); - mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; - if ((token->allowed_progs & mask) == mask) - seq_printf(m, "allowed_progs:\tany\n"); - else - seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs); - - BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64); - mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; - if ((token->allowed_attachs & mask) == mask) - seq_printf(m, "allowed_attachs:\tany\n"); - else - seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs); -} - -#define BPF_TOKEN_INODE_NAME "bpf-token" - -static const struct inode_operations bpf_token_iops = { }; - -static const struct file_operations bpf_token_fops = { - .release = bpf_token_release, - .show_fdinfo = bpf_token_show_fdinfo, -}; - -int bpf_token_create(union bpf_attr *attr) -{ - struct bpf_mount_opts *mnt_opts; - struct bpf_token *token = NULL; - struct user_namespace *userns; - struct inode *inode; - struct file *file; - struct path path; - struct fd f; - umode_t mode; - int err, fd; - - f = fdget(attr->token_create.bpffs_fd); - if (!f.file) - return -EBADF; - - path = f.file->f_path; - path_get(&path); - fdput(f); - - if (path.dentry != path.mnt->mnt_sb->s_root) { - 
err = -EINVAL; - goto out_path; - } - if (path.mnt->mnt_sb->s_op != &bpf_super_ops) { - err = -EINVAL; - goto out_path; - } - err = path_permission(&path, MAY_ACCESS); - if (err) - goto out_path; - - userns = path.dentry->d_sb->s_user_ns; - /* - * Enforce that creators of BPF tokens are in the same user - * namespace as the BPF FS instance. This makes reasoning about - * permissions a lot easier and we can always relax this later. - */ - if (current_user_ns() != userns) { - err = -EPERM; - goto out_path; - } - if (!ns_capable(userns, CAP_BPF)) { - err = -EPERM; - goto out_path; - } - - mnt_opts = path.dentry->d_sb->s_fs_info; - if (mnt_opts->delegate_cmds == 0 && - mnt_opts->delegate_maps == 0 && - mnt_opts->delegate_progs == 0 && - mnt_opts->delegate_attachs == 0) { - err = -ENOENT; /* no BPF token delegation is set up */ - goto out_path; - } - - mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); - inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto out_path; - } - - inode->i_op = &bpf_token_iops; - inode->i_fop = &bpf_token_fops; - clear_nlink(inode); /* make sure it is unlinked */ - - file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); - if (IS_ERR(file)) { - iput(inode); - err = PTR_ERR(file); - goto out_path; - } - - token = kvzalloc(sizeof(*token), GFP_USER); - if (!token) { - err = -ENOMEM; - goto out_file; - } - - atomic64_set(&token->refcnt, 1); - - /* remember bpffs owning userns for future ns_capable() checks */ - token->userns = get_user_ns(userns); - - token->allowed_cmds = mnt_opts->delegate_cmds; - token->allowed_maps = mnt_opts->delegate_maps; - token->allowed_progs = mnt_opts->delegate_progs; - token->allowed_attachs = mnt_opts->delegate_attachs; - - err = security_bpf_token_create(token, attr, &path); - if (err) - goto out_token; - - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - err = fd; - goto out_token; - } - - file->private_data = token; - fd_install(fd, file); - - path_put(&path); - return fd; - -out_token: - bpf_token_free(token); -out_file: - fput(file); -out_path: - path_put(&path); - return err; -} - -struct bpf_token *bpf_token_get_from_fd(u32 ufd) -{ - struct fd f = fdget(ufd); - struct bpf_token *token; - - if (!f.file) - return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_token_fops) { - fdput(f); - return ERR_PTR(-EINVAL); - } - - token = f.file->private_data; - bpf_token_inc(token); - fdput(f); - - return token; -} - -bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd) -{ - /* BPF token can be used only within exactly the same userns in which - * it was created - */ - if (!token || current_user_ns() != token->userns) - return false; - if (!(token->allowed_cmds & (1ULL << cmd))) - return false; - return security_bpf_token_cmd(token, cmd) == 0; -} - -bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type) -{ - if (!token || type >= __MAX_BPF_MAP_TYPE) - return false; - - return token->allowed_maps & (1ULL << type); -} - -bool bpf_token_allow_prog_type(const struct bpf_token *token, - enum bpf_prog_type prog_type, - enum bpf_attach_type attach_type) -{ - if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE) - return false; - - return (token->allowed_progs & (1ULL << prog_type)) && - (token->allowed_attachs & (1ULL << attach_type)); -} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9456ee0ad129..4ceec8c2a484 100644 --- a/kernel/bpf/verifier.c +++ 
b/kernel/bpf/verifier.c @@ -20594,12 +20594,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel); - - env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token); - env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token); - env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token); - env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token); - env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF); + is_priv = bpf_capable(); bpf_get_btf_vmlinux(); @@ -20631,6 +20626,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; + env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->allow_uninit_stack = bpf_allow_uninit_stack(); + env->bypass_spec_v1 = bpf_bypass_spec_v1(); + env->bypass_spec_v4 = bpf_bypass_spec_v4(); + env->bpf_capable = bpf_capable(); + if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 492d60e9c480..7ac6c52b25eb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1629,7 +1629,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); default: - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } diff --git a/net/core/filter.c b/net/core/filter.c index 3cc52b82bab8..24061f29c9dd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,7 +87,7 @@ #include "dev.h" static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); +bpf_sk_base_func_proto(enum bpf_func_id func_id); int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { @@ -7862,7 +7862,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -7955,7 +7955,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -7974,7 +7974,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8161,7 +8161,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8220,7 +8220,7 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) @@ -8281,7 +8281,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: - return bpf_sk_base_func_proto(func_id, prog); + 
return bpf_sk_base_func_proto(func_id); } } @@ -8323,7 +8323,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cgroup_classid_curr_proto; #endif default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8367,7 +8367,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_lookup_tcp_proto; #endif default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8378,7 +8378,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8405,7 +8405,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -8580,7 +8580,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!bpf_token_capable(prog->aux->token, CAP_BPF)) + if (!bpf_capable()) return false; break; } @@ -8592,7 +8592,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!bpf_token_capable(prog->aux->token, CAP_BPF)) + if (!bpf_capable()) return false; break; default: @@ -11236,7 +11236,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -11418,7 +11418,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: - return bpf_sk_base_func_proto(func_id, prog); + return bpf_sk_base_func_proto(func_id); } } @@ -11752,7 +11752,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = { }; static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +bpf_sk_base_func_proto(enum bpf_func_id func_id) { const struct bpf_func_proto *func; @@ -11781,10 +11781,10 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } - if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) + if (!perfmon_capable()) return NULL; return func; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 634cfafa583d..ae8b15e6896f 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -191,7 +191,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index 5257d5e7eb09..0e4beae421f8 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -314,7 +314,7 @@ static bool nf_is_valid_access(int off, int size, enum bpf_access_type type, static const struct bpf_func_proto * 
bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return bpf_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } const struct bpf_verifier_ops netfilter_verifier_ops = { diff --git a/security/security.c b/security/security.c index 088a79c35c26..dcb3e7014f9b 100644 --- a/security/security.c +++ b/security/security.c @@ -5167,87 +5167,29 @@ int security_bpf_prog(struct bpf_prog *prog) } /** - * security_bpf_map_create() - Check if BPF map creation is allowed - * @map: BPF map object - * @attr: BPF syscall attributes used to create BPF map - * @token: BPF token used to grant user access - * - * Do a check when the kernel creates a new BPF map. This is also the - * point where LSM blob is allocated for LSMs that need them. - * - * Return: Returns 0 on success, error on failure. - */ -int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) -{ - return call_int_hook(bpf_map_create, 0, map, attr, token); -} - -/** - * security_bpf_prog_load() - Check if loading of BPF program is allowed - * @prog: BPF program object - * @attr: BPF syscall attributes used to create BPF program - * @token: BPF token used to grant user access to BPF subsystem - * - * Perform an access control check when the kernel loads a BPF program and - * allocates associated BPF program object. This hook is also responsible for - * allocating any required LSM state for the BPF program. - * - * Return: Returns 0 on success, error on failure. - */ -int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) -{ - return call_int_hook(bpf_prog_load, 0, prog, attr, token); -} - -/** - * security_bpf_token_create() - Check if creating of BPF token is allowed - * @token: BPF token object - * @attr: BPF syscall attributes used to create BPF token - * @path: path pointing to BPF FS mount point from which BPF token is created - * - * Do a check when the kernel instantiates a new BPF token object from BPF FS - * instance. This is also the point where LSM blob can be allocated for LSMs. - * - * Return: Returns 0 on success, error on failure. - */ -int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, - struct path *path) -{ - return call_int_hook(bpf_token_create, 0, token, attr, path); -} - -/** - * security_bpf_token_cmd() - Check if BPF token is allowed to delegate - * requested BPF syscall command - * @token: BPF token object - * @cmd: BPF syscall command requested to be delegated by BPF token + * security_bpf_map_alloc() - Allocate a bpf map LSM blob + * @map: bpf map * - * Do a check when the kernel decides whether provided BPF token should allow - * delegation of requested BPF syscall command. + * Initialize the security field inside bpf map. * * Return: Returns 0 on success, error on failure. */ -int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) +int security_bpf_map_alloc(struct bpf_map *map) { - return call_int_hook(bpf_token_cmd, 0, token, cmd); + return call_int_hook(bpf_map_alloc_security, 0, map); } /** - * security_bpf_token_capable() - Check if BPF token is allowed to delegate - * requested BPF-related capability - * @token: BPF token object - * @cap: capabilities requested to be delegated by BPF token + * security_bpf_prog_alloc() - Allocate a bpf program LSM blob + * @aux: bpf program aux info struct * - * Do a check when the kernel decides whether provided BPF token should allow - * delegation of requested BPF-related capabilities. 
+ * Initialize the security field inside bpf program. * * Return: Returns 0 on success, error on failure. */ -int security_bpf_token_capable(const struct bpf_token *token, int cap) +int security_bpf_prog_alloc(struct bpf_prog_aux *aux) { - return call_int_hook(bpf_token_capable, 0, token, cap); + return call_int_hook(bpf_prog_alloc_security, 0, aux); } /** @@ -5258,29 +5200,18 @@ int security_bpf_token_capable(const struct bpf_token *token, int cap) */ void security_bpf_map_free(struct bpf_map *map) { - call_void_hook(bpf_map_free, map); -} - -/** - * security_bpf_prog_free() - Free a BPF program's LSM blob - * @prog: BPF program struct - * - * Clean up the security information stored inside BPF program. - */ -void security_bpf_prog_free(struct bpf_prog *prog) -{ - call_void_hook(bpf_prog_free, prog); + call_void_hook(bpf_map_free_security, map); } /** - * security_bpf_token_free() - Free a BPF token's LSM blob - * @token: BPF token struct + * security_bpf_prog_free() - Free a bpf program's LSM blob + * @aux: bpf program aux info struct * - * Clean up the security information stored inside BPF token. + * Clean up the security information stored inside bpf prog. */ -void security_bpf_token_free(struct bpf_token *token) +void security_bpf_prog_free(struct bpf_prog_aux *aux) { - call_void_hook(bpf_token_free, token); + call_void_hook(bpf_prog_free_security, aux); } #endif /* CONFIG_BPF_SYSCALL */ diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 1501e95366a1..feda711c6b7b 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6783,8 +6783,7 @@ static int selinux_bpf_prog(struct bpf_prog *prog) BPF__PROG_RUN, NULL); } -static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) +static int selinux_bpf_map_alloc(struct bpf_map *map) { struct bpf_security_struct *bpfsec; @@ -6806,8 +6805,7 @@ static void selinux_bpf_map_free(struct bpf_map *map) kfree(bpfsec); } -static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) +static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux) { struct bpf_security_struct *bpfsec; @@ -6816,39 +6814,16 @@ static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, return -ENOMEM; bpfsec->sid = current_sid(); - prog->aux->security = bpfsec; + aux->security = bpfsec; return 0; } -static void selinux_bpf_prog_free(struct bpf_prog *prog) +static void selinux_bpf_prog_free(struct bpf_prog_aux *aux) { - struct bpf_security_struct *bpfsec = prog->aux->security; + struct bpf_security_struct *bpfsec = aux->security; - prog->aux->security = NULL; - kfree(bpfsec); -} - -static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, - struct path *path) -{ - struct bpf_security_struct *bpfsec; - - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - - bpfsec->sid = current_sid(); - token->security = bpfsec; - - return 0; -} - -static void selinux_bpf_token_free(struct bpf_token *token) -{ - struct bpf_security_struct *bpfsec = token->security; - - token->security = NULL; + aux->security = NULL; kfree(bpfsec); } #endif @@ -7204,9 +7179,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(bpf, selinux_bpf), LSM_HOOK_INIT(bpf_map, selinux_bpf_map), LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog), - LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free), - LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free), - LSM_HOOK_INIT(bpf_token_free, 
selinux_bpf_token_free), + LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free), + LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free), #endif #ifdef CONFIG_PERF_EVENTS @@ -7263,9 +7237,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init), #endif #ifdef CONFIG_BPF_SYSCALL - LSM_HOOK_INIT(bpf_map_create, selinux_bpf_map_create), - LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load), - LSM_HOOK_INIT(bpf_token_create, selinux_bpf_token_create), + LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc), + LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc), #endif #ifdef CONFIG_PERF_EVENTS LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc), diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e0545201b55f..7f24d898efbb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -847,36 +847,6 @@ union bpf_iter_link_info { * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * - * BPF_TOKEN_CREATE - * Description - * Create BPF token with embedded information about what - * BPF-related functionality it allows: - * - a set of allowed bpf() syscall commands; - * - a set of allowed BPF map types to be created with - * BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed; - * - a set of allowed BPF program types and BPF program attach - * types to be loaded with BPF_PROG_LOAD command, if - * BPF_PROG_LOAD itself is allowed. - * - * BPF token is created (derived) from an instance of BPF FS, - * assuming it has necessary delegation mount options specified. - * This BPF token can be passed as an extra parameter to various - * bpf() syscall commands to grant BPF subsystem functionality to - * unprivileged processes. - * - * When created, BPF token is "associated" with the owning - * user namespace of BPF FS instance (super block) that it was - * derived from, and subsequent BPF operations performed with - * BPF token would be performing capabilities checks (i.e., - * CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within - * that user namespace. Without BPF token, such capabilities - * have to be granted in init user namespace, making bpf() - * syscall incompatible with user namespace, for the most part. - * - * Return - * A new file descriptor (a nonnegative integer), or -1 if an - * error occurred (in which case, *errno* is set appropriately). - * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -931,8 +901,6 @@ enum bpf_cmd { BPF_ITER_CREATE, BPF_LINK_DETACH, BPF_PROG_BIND_MAP, - BPF_TOKEN_CREATE, - __MAX_BPF_CMD, }; enum bpf_map_type { @@ -983,7 +951,6 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, - __MAX_BPF_MAP_TYPE }; /* Note that tracing related programs such as @@ -1028,7 +995,6 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ BPF_PROG_TYPE_NETFILTER, - __MAX_BPF_PROG_TYPE }; enum bpf_attach_type { @@ -1437,7 +1403,6 @@ union bpf_attr { * to using 5 hash functions). */ __u64 map_extra; - __u32 map_token_fd; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -1507,7 +1472,6 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). 
*/ __u32 log_true_size; - __u32 prog_token_fd; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -1620,7 +1584,6 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). */ __u32 btf_log_true_size; - __u32 btf_token_fd; }; struct { @@ -1751,11 +1714,6 @@ union bpf_attr { __u32 flags; /* extra flags */ } prog_bind_map; - struct { /* struct used by BPF_TOKEN_CREATE command */ - __u32 flags; - __u32 bpffs_fd; - } token_create; - } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index b6619199a706..2d0c282c8588 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,4 +1,4 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o hashmap.o \ btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o \ - usdt.o zip.o elf.o features.o + usdt.o zip.o elf.o diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 0ad8e532b3cf..9dc9625651dc 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -103,7 +103,7 @@ int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) * [0] https://lore.kernel.org/bpf/20201201215900.3569844-1-guro@fb.com/ * [1] d05512618056 ("bpf: Add bpf_ktime_get_coarse_ns helper") */ -int probe_memcg_account(int token_fd) +int probe_memcg_account(void) { const size_t attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd); struct bpf_insn insns[] = { @@ -120,7 +120,6 @@ int probe_memcg_account(int token_fd) attr.insns = ptr_to_u64(insns); attr.insn_cnt = insn_cnt; attr.license = ptr_to_u64("GPL"); - attr.prog_token_fd = token_fd; prog_fd = sys_bpf_fd(BPF_PROG_LOAD, &attr, attr_sz); if (prog_fd >= 0) { @@ -147,7 +146,7 @@ int bump_rlimit_memlock(void) struct rlimit rlim; /* if kernel supports memcg-based accounting, skip bumping RLIMIT_MEMLOCK */ - if (memlock_bumped || feat_supported(NULL, FEAT_MEMCG_ACCOUNT)) + if (memlock_bumped || kernel_supports(NULL, FEAT_MEMCG_ACCOUNT)) return 0; memlock_bumped = true; @@ -170,7 +169,7 @@ int bpf_map_create(enum bpf_map_type map_type, __u32 max_entries, const struct bpf_map_create_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, map_token_fd); + const size_t attr_sz = offsetofend(union bpf_attr, map_extra); union bpf_attr attr; int fd; @@ -182,7 +181,7 @@ int bpf_map_create(enum bpf_map_type map_type, return libbpf_err(-EINVAL); attr.map_type = map_type; - if (map_name && feat_supported(NULL, FEAT_PROG_NAME)) + if (map_name && kernel_supports(NULL, FEAT_PROG_NAME)) libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name)); attr.key_size = key_size; attr.value_size = value_size; @@ -199,8 +198,6 @@ int bpf_map_create(enum bpf_map_type map_type, attr.numa_node = OPTS_GET(opts, numa_node, 0); attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0); - attr.map_token_fd = OPTS_GET(opts, token_fd, 0); - fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz); return libbpf_err_errno(fd); } @@ -235,7 +232,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insn_cnt, struct bpf_prog_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); + const size_t attr_sz = offsetofend(union bpf_attr, log_true_size); void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; @@ -264,9 +261,8 @@ int bpf_prog_load(enum bpf_prog_type prog_type, 
attr.prog_flags = OPTS_GET(opts, prog_flags, 0); attr.prog_ifindex = OPTS_GET(opts, prog_ifindex, 0); attr.kern_version = OPTS_GET(opts, kern_version, 0); - attr.prog_token_fd = OPTS_GET(opts, token_fd, 0); - if (prog_name && feat_supported(NULL, FEAT_PROG_NAME)) + if (prog_name && kernel_supports(NULL, FEAT_PROG_NAME)) libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name)); attr.license = ptr_to_u64(license); @@ -1186,7 +1182,7 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd) int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, btf_token_fd); + const size_t attr_sz = offsetofend(union bpf_attr, btf_log_true_size); union bpf_attr attr; char *log_buf; size_t log_size; @@ -1211,8 +1207,6 @@ int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts attr.btf = ptr_to_u64(btf_data); attr.btf_size = btf_size; - attr.btf_token_fd = OPTS_GET(opts, token_fd, 0); - /* log_level == 0 and log_buf != NULL means "try loading without * log_buf, but retry with log_buf and log_level=1 on error", which is * consistent across low-level and high-level BTF and program loading @@ -1293,20 +1287,3 @@ int bpf_prog_bind_map(int prog_fd, int map_fd, ret = sys_bpf(BPF_PROG_BIND_MAP, &attr, attr_sz); return libbpf_err_errno(ret); } - -int bpf_token_create(int bpffs_fd, struct bpf_token_create_opts *opts) -{ - const size_t attr_sz = offsetofend(union bpf_attr, token_create); - union bpf_attr attr; - int fd; - - if (!OPTS_VALID(opts, bpf_token_create_opts)) - return libbpf_err(-EINVAL); - - memset(&attr, 0, attr_sz); - attr.token_create.bpffs_fd = bpffs_fd; - attr.token_create.flags = OPTS_GET(opts, flags, 0); - - fd = sys_bpf_fd(BPF_TOKEN_CREATE, &attr, attr_sz); - return libbpf_err_errno(fd); -} diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 991b86bfe7e4..d0f53772bdc0 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -51,11 +51,8 @@ struct bpf_map_create_opts { __u32 numa_node; __u32 map_ifindex; - - __u32 token_fd; - size_t :0; }; -#define bpf_map_create_opts__last_field token_fd +#define bpf_map_create_opts__last_field map_ifindex LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, const char *map_name, @@ -105,10 +102,9 @@ struct bpf_prog_load_opts { * If kernel doesn't support this feature, log_size is left unchanged. */ __u32 log_true_size; - __u32 token_fd; size_t :0; }; -#define bpf_prog_load_opts__last_field token_fd +#define bpf_prog_load_opts__last_field log_true_size LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type, const char *prog_name, const char *license, @@ -134,10 +130,9 @@ struct bpf_btf_load_opts { * If kernel doesn't support this feature, log_size is left unchanged. */ __u32 log_true_size; - __u32 token_fd; size_t :0; }; -#define bpf_btf_load_opts__last_field token_fd +#define bpf_btf_load_opts__last_field log_true_size LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts); @@ -645,30 +640,6 @@ struct bpf_test_run_opts { LIBBPF_API int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts); -struct bpf_token_create_opts { - size_t sz; /* size of this struct for forward/backward compatibility */ - __u32 flags; - size_t :0; -}; -#define bpf_token_create_opts__last_field flags - -/** - * @brief **bpf_token_create()** creates a new instance of BPF token derived - * from specified BPF FS mount point. 
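A minimal usage sketch of this since-removed libbpf API, for reference while reading the revert; it assumes a BPF FS instance mounted at /sys/fs/bpf with the delegate_* mount options configured, and the mount path, flags, and error handling here are illustrative only, not taken from the patches above:

	#include <fcntl.h>	/* open(), O_DIRECTORY */
	#include <unistd.h>	/* close() */
	#include "bpf.h"	/* libbpf: bpf_token_create(), LIBBPF_OPTS() */

	int bpffs_fd, token_fd;

	/* derive a token FD from the (delegating) BPF FS mount point */
	bpffs_fd = open("/sys/fs/bpf", O_DIRECTORY | O_RDONLY);
	token_fd = bpf_token_create(bpffs_fd, NULL);	/* NULL: no extra opts */
	close(bpffs_fd);

	if (token_fd >= 0) {
		/* pass the token FD through the opts struct; bpf_map_create()
		 * and bpf_btf_load() accepted the same token_fd option */
		LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = token_fd);
		/* ... bpf_prog_load(prog_type, name, "GPL", insns, cnt, &opts) ... */
	}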
- * - * BPF token created with this API can be passed to bpf() syscall for - * commands like BPF_PROG_LOAD, BPF_MAP_CREATE, etc. - * - * @param bpffs_fd FD for BPF FS instance from which to derive a BPF token - * instance. - * @param opts optional BPF token creation options, can be NULL - * - * @return BPF token FD > 0, on success; negative error code, otherwise (errno - * is also set to the error code) - */ -LIBBPF_API int bpf_token_create(int bpffs_fd, - struct bpf_token_create_opts *opts); - #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 63033c334320..ee95fd379d4d 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1317,9 +1317,7 @@ struct btf *btf__parse_split(const char *path, struct btf *base_btf) static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian); -int btf_load_into_kernel(struct btf *btf, - char *log_buf, size_t log_sz, __u32 log_level, - int token_fd) +int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level) { LIBBPF_OPTS(bpf_btf_load_opts, opts); __u32 buf_sz = 0, raw_size; @@ -1369,7 +1367,6 @@ retry_load: opts.log_level = log_level; } - opts.token_fd = token_fd; btf->fd = bpf_btf_load(raw_data, raw_size, &opts); if (btf->fd < 0) { /* time to turn on verbose mode and try again */ @@ -1397,7 +1394,7 @@ done: int btf__load_into_kernel(struct btf *btf) { - return btf_load_into_kernel(btf, NULL, 0, 0, 0); + return btf_load_into_kernel(btf, NULL, 0, 0); } int btf__fd(const struct btf *btf) diff --git a/tools/lib/bpf/elf.c b/tools/lib/bpf/elf.c index c92e02394159..b02faec748a5 100644 --- a/tools/lib/bpf/elf.c +++ b/tools/lib/bpf/elf.c @@ -11,6 +11,8 @@ #include "libbpf_internal.h" #include "str_error.h" +#define STRERR_BUFSIZE 128 + /* A SHT_GNU_versym section holds 16-bit words. This bit is set if * the symbol is hidden and can only be seen when referenced using an * explicit version number. This is a GNU extension. diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c deleted file mode 100644 index ce98a334be21..000000000000 --- a/tools/lib/bpf/features.c +++ /dev/null @@ -1,478 +0,0 @@ -// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) -/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
*/ -#include -#include -#include "bpf.h" -#include "libbpf.h" -#include "libbpf_common.h" -#include "libbpf_internal.h" -#include "str_error.h" - -static inline __u64 ptr_to_u64(const void *ptr) -{ - return (__u64)(unsigned long)ptr; -} - -static int probe_fd(int fd) -{ - if (fd >= 0) - close(fd); - return fd >= 0; -} - -static int probe_kern_prog_name(int token_fd) -{ - const size_t attr_sz = offsetofend(union bpf_attr, prog_name); - struct bpf_insn insns[] = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }; - union bpf_attr attr; - int ret; - - memset(&attr, 0, attr_sz); - attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; - attr.license = ptr_to_u64("GPL"); - attr.insns = ptr_to_u64(insns); - attr.insn_cnt = (__u32)ARRAY_SIZE(insns); - attr.prog_token_fd = token_fd; - libbpf_strlcpy(attr.prog_name, "libbpf_nametest", sizeof(attr.prog_name)); - - /* make sure loading with name works */ - ret = sys_bpf_prog_load(&attr, attr_sz, PROG_LOAD_ATTEMPTS); - return probe_fd(ret); -} - -static int probe_kern_global_data(int token_fd) -{ - char *cp, errmsg[STRERR_BUFSIZE]; - struct bpf_insn insns[] = { - BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16), - BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }; - LIBBPF_OPTS(bpf_map_create_opts, map_opts, .token_fd = token_fd); - LIBBPF_OPTS(bpf_prog_load_opts, prog_opts, .token_fd = token_fd); - int ret, map, insn_cnt = ARRAY_SIZE(insns); - - map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_global", sizeof(int), 32, 1, &map_opts); - if (map < 0) { - ret = -errno; - cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); - pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", - __func__, cp, -ret); - return ret; - } - - insns[0].imm = map; - - ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, &prog_opts); - close(map); - return probe_fd(ret); -} - -static int probe_kern_btf(int token_fd) -{ - static const char strs[] = "\0int"; - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs), token_fd)); -} - -static int probe_kern_btf_func(int token_fd) -{ - static const char strs[] = "\0int\0x\0a"; - /* void x(int a) {} */ - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ - /* FUNC_PROTO */ /* [2] */ - BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), - BTF_PARAM_ENC(7, 1), - /* FUNC x */ /* [3] */ - BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), 2), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs), token_fd)); -} - -static int probe_kern_btf_func_global(int token_fd) -{ - static const char strs[] = "\0int\0x\0a"; - /* static void x(int a) {} */ - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ - /* FUNC_PROTO */ /* [2] */ - BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), - BTF_PARAM_ENC(7, 1), - /* FUNC x BTF_FUNC_GLOBAL */ /* [3] */ - BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 2), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs), token_fd)); -} - -static int probe_kern_btf_datasec(int token_fd) -{ - static const char strs[] = "\0x\0.data"; - /* static int a; */ - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ - /* VAR x */ /* [2] */ - BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), - BTF_VAR_STATIC, 
- /* DATASEC val */ /* [3] */ - BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4), - BTF_VAR_SECINFO_ENC(2, 0, 4), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs), token_fd)); -} - -static int probe_kern_btf_float(int token_fd) -{ - static const char strs[] = "\0float"; - __u32 types[] = { - /* float */ - BTF_TYPE_FLOAT_ENC(1, 4), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs), token_fd)); -} - -static int probe_kern_btf_decl_tag(int token_fd) -{ - static const char strs[] = "\0tag"; - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ - /* VAR x */ /* [2] */ - BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), - BTF_VAR_STATIC, - /* attr */ - BTF_TYPE_DECL_TAG_ENC(1, 2, -1), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs), token_fd)); -} - -static int probe_kern_btf_type_tag(int token_fd) -{ - static const char strs[] = "\0tag"; - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ - /* attr */ - BTF_TYPE_TYPE_TAG_ENC(1, 1), /* [2] */ - /* ptr */ - BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), /* [3] */ - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs), token_fd)); -} - -static int probe_kern_array_mmap(int token_fd) -{ - LIBBPF_OPTS(bpf_map_create_opts, opts, - .map_flags = BPF_F_MMAPABLE, - .token_fd = token_fd, - ); - int fd; - - fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_mmap", sizeof(int), sizeof(int), 1, &opts); - return probe_fd(fd); -} - -static int probe_kern_exp_attach_type(int token_fd) -{ - LIBBPF_OPTS(bpf_prog_load_opts, opts, - .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE, - .token_fd = token_fd, - ); - struct bpf_insn insns[] = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }; - int fd, insn_cnt = ARRAY_SIZE(insns); - - /* use any valid combination of program type and (optional) - * non-zero expected attach type (i.e., not a BPF_CGROUP_INET_INGRESS) - * to see if kernel supports expected_attach_type field for - * BPF_PROG_LOAD command - */ - fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns, insn_cnt, &opts); - return probe_fd(fd); -} - -static int probe_kern_probe_read_kernel(int token_fd) -{ - LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = token_fd); - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), /* r1 = r10 (fp) */ - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), /* r1 += -8 */ - BPF_MOV64_IMM(BPF_REG_2, 8), /* r2 = 8 */ - BPF_MOV64_IMM(BPF_REG_3, 0), /* r3 = 0 */ - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_probe_read_kernel), - BPF_EXIT_INSN(), - }; - int fd, insn_cnt = ARRAY_SIZE(insns); - - fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts); - return probe_fd(fd); -} - -static int probe_prog_bind_map(int token_fd) -{ - char *cp, errmsg[STRERR_BUFSIZE]; - struct bpf_insn insns[] = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }; - LIBBPF_OPTS(bpf_map_create_opts, map_opts, .token_fd = token_fd); - LIBBPF_OPTS(bpf_prog_load_opts, prog_opts, .token_fd = token_fd); - int ret, map, prog, insn_cnt = ARRAY_SIZE(insns); - - map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_det_bind", sizeof(int), 32, 1, &map_opts); - if (map < 0) { - ret = -errno; - cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); - pr_warn("Error in %s():%s(%d). 
Couldn't create simple array map.\n", - __func__, cp, -ret); - return ret; - } - - prog = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, &prog_opts); - if (prog < 0) { - close(map); - return 0; - } - - ret = bpf_prog_bind_map(prog, map, NULL); - - close(map); - close(prog); - - return ret >= 0; -} - -static int probe_module_btf(int token_fd) -{ - static const char strs[] = "\0int"; - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), - }; - struct bpf_btf_info info; - __u32 len = sizeof(info); - char name[16]; - int fd, err; - - fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs), token_fd); - if (fd < 0) - return 0; /* BTF not supported at all */ - - memset(&info, 0, sizeof(info)); - info.name = ptr_to_u64(name); - info.name_len = sizeof(name); - - /* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer; - * kernel's module BTF support coincides with support for - * name/name_len fields in struct bpf_btf_info. - */ - err = bpf_btf_get_info_by_fd(fd, &info, &len); - close(fd); - return !err; -} - -static int probe_perf_link(int token_fd) -{ - struct bpf_insn insns[] = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }; - LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = token_fd); - int prog_fd, link_fd, err; - - prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", - insns, ARRAY_SIZE(insns), &opts); - if (prog_fd < 0) - return -errno; - - /* use invalid perf_event FD to get EBADF, if link is supported; - * otherwise EINVAL should be returned - */ - link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL); - err = -errno; /* close() can clobber errno */ - - if (link_fd >= 0) - close(link_fd); - close(prog_fd); - - return link_fd < 0 && err == -EBADF; -} - -static int probe_uprobe_multi_link(int token_fd) -{ - LIBBPF_OPTS(bpf_prog_load_opts, load_opts, - .expected_attach_type = BPF_TRACE_UPROBE_MULTI, - .token_fd = token_fd, - ); - LIBBPF_OPTS(bpf_link_create_opts, link_opts); - struct bpf_insn insns[] = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }; - int prog_fd, link_fd, err; - unsigned long offset = 0; - - prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", - insns, ARRAY_SIZE(insns), &load_opts); - if (prog_fd < 0) - return -errno; - - /* Creating uprobe in '/' binary should fail with -EBADF. 
*/ - link_opts.uprobe_multi.path = "/"; - link_opts.uprobe_multi.offsets = &offset; - link_opts.uprobe_multi.cnt = 1; - - link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts); - err = -errno; /* close() can clobber errno */ - - if (link_fd >= 0) - close(link_fd); - close(prog_fd); - - return link_fd < 0 && err == -EBADF; -} - -static int probe_kern_bpf_cookie(int token_fd) -{ - struct bpf_insn insns[] = { - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie), - BPF_EXIT_INSN(), - }; - LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = token_fd); - int ret, insn_cnt = ARRAY_SIZE(insns); - - ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts); - return probe_fd(ret); -} - -static int probe_kern_btf_enum64(int token_fd) -{ - static const char strs[] = "\0enum64"; - __u32 types[] = { - BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs), token_fd)); -} - -typedef int (*feature_probe_fn)(int /* token_fd */); - -static struct kern_feature_cache feature_cache; - -static struct kern_feature_desc { - const char *desc; - feature_probe_fn probe; -} feature_probes[__FEAT_CNT] = { - [FEAT_PROG_NAME] = { - "BPF program name", probe_kern_prog_name, - }, - [FEAT_GLOBAL_DATA] = { - "global variables", probe_kern_global_data, - }, - [FEAT_BTF] = { - "minimal BTF", probe_kern_btf, - }, - [FEAT_BTF_FUNC] = { - "BTF functions", probe_kern_btf_func, - }, - [FEAT_BTF_GLOBAL_FUNC] = { - "BTF global function", probe_kern_btf_func_global, - }, - [FEAT_BTF_DATASEC] = { - "BTF data section and variable", probe_kern_btf_datasec, - }, - [FEAT_ARRAY_MMAP] = { - "ARRAY map mmap()", probe_kern_array_mmap, - }, - [FEAT_EXP_ATTACH_TYPE] = { - "BPF_PROG_LOAD expected_attach_type attribute", - probe_kern_exp_attach_type, - }, - [FEAT_PROBE_READ_KERN] = { - "bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel, - }, - [FEAT_PROG_BIND_MAP] = { - "BPF_PROG_BIND_MAP support", probe_prog_bind_map, - }, - [FEAT_MODULE_BTF] = { - "module BTF support", probe_module_btf, - }, - [FEAT_BTF_FLOAT] = { - "BTF_KIND_FLOAT support", probe_kern_btf_float, - }, - [FEAT_PERF_LINK] = { - "BPF perf link support", probe_perf_link, - }, - [FEAT_BTF_DECL_TAG] = { - "BTF_KIND_DECL_TAG support", probe_kern_btf_decl_tag, - }, - [FEAT_BTF_TYPE_TAG] = { - "BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag, - }, - [FEAT_MEMCG_ACCOUNT] = { - "memcg-based memory accounting", probe_memcg_account, - }, - [FEAT_BPF_COOKIE] = { - "BPF cookie support", probe_kern_bpf_cookie, - }, - [FEAT_BTF_ENUM64] = { - "BTF_KIND_ENUM64 support", probe_kern_btf_enum64, - }, - [FEAT_SYSCALL_WRAPPER] = { - "Kernel using syscall wrapper", probe_kern_syscall_wrapper, - }, - [FEAT_UPROBE_MULTI_LINK] = { - "BPF multi-uprobe link support", probe_uprobe_multi_link, - }, -}; - -bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id) -{ - struct kern_feature_desc *feat = &feature_probes[feat_id]; - int ret; - - /* assume global feature cache, unless custom one is provided */ - if (!cache) - cache = &feature_cache; - - if (READ_ONCE(cache->res[feat_id]) == FEAT_UNKNOWN) { - ret = feat->probe(cache->token_fd); - if (ret > 0) { - WRITE_ONCE(cache->res[feat_id], FEAT_SUPPORTED); - } else if (ret == 0) { - WRITE_ONCE(cache->res[feat_id], FEAT_MISSING); - } else { - pr_warn("Detection of kernel %s support failed: %d\n", feat->desc, ret); - WRITE_ONCE(cache->res[feat_id], FEAT_MISSING); 
- } - } - - return READ_ONCE(cache->res[feat_id]) == FEAT_SUPPORTED; -} diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4b5ff9508e18..ac54ebc0629f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -59,8 +59,6 @@ #define BPF_FS_MAGIC 0xcafe4a11 #endif -#define BPF_FS_DEFAULT_PATH "/sys/fs/bpf" - #define BPF_INSN_SZ (sizeof(struct bpf_insn)) /* vsprintf() in __base_pr() uses nonliteral format string. It may break @@ -695,10 +693,6 @@ struct bpf_object { struct usdt_manager *usdt_man; - struct kern_feature_cache *feat_cache; - char *token_path; - int token_fd; - char path[]; }; @@ -2198,7 +2192,7 @@ static int build_map_pin_path(struct bpf_map *map, const char *path) int err; if (!path) - path = BPF_FS_DEFAULT_PATH; + path = "/sys/fs/bpf"; err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); if (err) @@ -3285,7 +3279,7 @@ skip_exception_cb: } else { /* currently BPF_BTF_LOAD only supports log_level 1 */ err = btf_load_into_kernel(kern_btf, obj->log_buf, obj->log_size, - obj->log_level ? 1 : 0, obj->token_fd); + obj->log_level ? 1 : 0); } if (sanitize) { if (!err) { @@ -4608,63 +4602,6 @@ int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) return 0; } -static int bpf_object_prepare_token(struct bpf_object *obj) -{ - const char *bpffs_path; - int bpffs_fd = -1, token_fd, err; - bool mandatory; - enum libbpf_print_level level; - - /* token is already set up */ - if (obj->token_fd > 0) - return 0; - /* token is explicitly prevented */ - if (obj->token_fd < 0) { - pr_debug("object '%s': token is prevented, skipping...\n", obj->name); - /* reset to zero to avoid extra checks during map_create and prog_load steps */ - obj->token_fd = 0; - return 0; - } - - mandatory = obj->token_path != NULL; - level = mandatory ? LIBBPF_WARN : LIBBPF_DEBUG; - - bpffs_path = obj->token_path ?: BPF_FS_DEFAULT_PATH; - bpffs_fd = open(bpffs_path, O_DIRECTORY, O_RDWR); - if (bpffs_fd < 0) { - err = -errno; - __pr(level, "object '%s': failed (%d) to open BPF FS mount at '%s'%s\n", - obj->name, err, bpffs_path, - mandatory ? "" : ", skipping optional step..."); - return mandatory ? err : 0; - } - - token_fd = bpf_token_create(bpffs_fd, 0); - close(bpffs_fd); - if (token_fd < 0) { - if (!mandatory && token_fd == -ENOENT) { - pr_debug("object '%s': BPF FS at '%s' doesn't have BPF token delegation set up, skipping...\n", - obj->name, bpffs_path); - return 0; - } - __pr(level, "object '%s': failed (%d) to create BPF token from '%s'%s\n", - obj->name, token_fd, bpffs_path, - mandatory ? "" : ", skipping optional step..."); - return mandatory ? 
token_fd : 0; - } - - obj->feat_cache = calloc(1, sizeof(*obj->feat_cache)); - if (!obj->feat_cache) { - close(token_fd); - return -ENOMEM; - } - - obj->token_fd = token_fd; - obj->feat_cache->token_fd = token_fd; - - return 0; -} - static int bpf_object__probe_loading(struct bpf_object *obj) { @@ -4674,7 +4611,6 @@ bpf_object__probe_loading(struct bpf_object *obj) BPF_EXIT_INSN(), }; int ret, insn_cnt = ARRAY_SIZE(insns); - LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = obj->token_fd); if (obj->gen_loader) return 0; @@ -4684,9 +4620,9 @@ bpf_object__probe_loading(struct bpf_object *obj) pr_warn("Failed to bump RLIMIT_MEMLOCK (err = %d), you might need to do it explicitly!\n", ret); /* make sure basic loading works */ - ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, &opts); + ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); if (ret < 0) - ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts); + ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); if (ret < 0) { ret = errno; cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); @@ -4701,18 +4637,462 @@ bpf_object__probe_loading(struct bpf_object *obj) return 0; } +static int probe_fd(int fd) +{ + if (fd >= 0) + close(fd); + return fd >= 0; +} + +static int probe_kern_prog_name(void) +{ + const size_t attr_sz = offsetofend(union bpf_attr, prog_name); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.license = ptr_to_u64("GPL"); + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = (__u32)ARRAY_SIZE(insns); + libbpf_strlcpy(attr.prog_name, "libbpf_nametest", sizeof(attr.prog_name)); + + /* make sure loading with name works */ + ret = sys_bpf_prog_load(&attr, attr_sz, PROG_LOAD_ATTEMPTS); + return probe_fd(ret); +} + +static int probe_kern_global_data(void) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16), + BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, map, insn_cnt = ARRAY_SIZE(insns); + + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_global", sizeof(int), 32, 1, NULL); + if (map < 0) { + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). 
Couldn't create simple array map.\n", + __func__, cp, -ret); + return ret; + } + + insns[0].imm = map; + + ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); + close(map); + return probe_fd(ret); +} + +static int probe_kern_btf(void) +{ + static const char strs[] = "\0int"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_func(void) +{ + static const char strs[] = "\0int\0x\0a"; + /* void x(int a) {} */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* FUNC_PROTO */ /* [2] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), + BTF_PARAM_ENC(7, 1), + /* FUNC x */ /* [3] */ + BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), 2), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_func_global(void) +{ + static const char strs[] = "\0int\0x\0a"; + /* static void x(int a) {} */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* FUNC_PROTO */ /* [2] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), + BTF_PARAM_ENC(7, 1), + /* FUNC x BTF_FUNC_GLOBAL */ /* [3] */ + BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 2), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_datasec(void) +{ + static const char strs[] = "\0x\0.data"; + /* static int a; */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* DATASEC val */ /* [3] */ + BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4), + BTF_VAR_SECINFO_ENC(2, 0, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_float(void) +{ + static const char strs[] = "\0float"; + __u32 types[] = { + /* float */ + BTF_TYPE_FLOAT_ENC(1, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_decl_tag(void) +{ + static const char strs[] = "\0tag"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* attr */ + BTF_TYPE_DECL_TAG_ENC(1, 2, -1), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_type_tag(void) +{ + static const char strs[] = "\0tag"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* attr */ + BTF_TYPE_TYPE_TAG_ENC(1, 1), /* [2] */ + /* ptr */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), /* [3] */ + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_array_mmap(void) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE); + int fd; + + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_mmap", sizeof(int), sizeof(int), 1, &opts); + return probe_fd(fd); +} + +static int probe_kern_exp_attach_type(void) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE); + struct 
bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int fd, insn_cnt = ARRAY_SIZE(insns); + + /* use any valid combination of program type and (optional) + * non-zero expected attach type (i.e., not a BPF_CGROUP_INET_INGRESS) + * to see if kernel supports expected_attach_type field for + * BPF_PROG_LOAD command + */ + fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns, insn_cnt, &opts); + return probe_fd(fd); +} + +static int probe_kern_probe_read_kernel(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), /* r1 = r10 (fp) */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), /* r1 += -8 */ + BPF_MOV64_IMM(BPF_REG_2, 8), /* r2 = 8 */ + BPF_MOV64_IMM(BPF_REG_3, 0), /* r3 = 0 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_probe_read_kernel), + BPF_EXIT_INSN(), + }; + int fd, insn_cnt = ARRAY_SIZE(insns); + + fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); + return probe_fd(fd); +} + +static int probe_prog_bind_map(void) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, map, prog, insn_cnt = ARRAY_SIZE(insns); + + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_det_bind", sizeof(int), 32, 1, NULL); + if (map < 0) { + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", + __func__, cp, -ret); + return ret; + } + + prog = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); + if (prog < 0) { + close(map); + return 0; + } + + ret = bpf_prog_bind_map(prog, map, NULL); + + close(map); + close(prog); + + return ret >= 0; +} + +static int probe_module_btf(void) +{ + static const char strs[] = "\0int"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), + }; + struct bpf_btf_info info; + __u32 len = sizeof(info); + char name[16]; + int fd, err; + + fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs)); + if (fd < 0) + return 0; /* BTF not supported at all */ + + memset(&info, 0, sizeof(info)); + info.name = ptr_to_u64(name); + info.name_len = sizeof(name); + + /* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer; + * kernel's module BTF support coincides with support for + * name/name_len fields in struct bpf_btf_info. 
+ */ + err = bpf_btf_get_info_by_fd(fd, &info, &len); + close(fd); + return !err; +} + +static int probe_perf_link(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd, link_fd, err; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", + insns, ARRAY_SIZE(insns), NULL); + if (prog_fd < 0) + return -errno; + + /* use invalid perf_event FD to get EBADF, if link is supported; + * otherwise EINVAL should be returned + */ + link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL); + err = -errno; /* close() can clobber errno */ + + if (link_fd >= 0) + close(link_fd); + close(prog_fd); + + return link_fd < 0 && err == -EBADF; +} + +static int probe_uprobe_multi_link(void) +{ + LIBBPF_OPTS(bpf_prog_load_opts, load_opts, + .expected_attach_type = BPF_TRACE_UPROBE_MULTI, + ); + LIBBPF_OPTS(bpf_link_create_opts, link_opts); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd, link_fd, err; + unsigned long offset = 0; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", + insns, ARRAY_SIZE(insns), &load_opts); + if (prog_fd < 0) + return -errno; + + /* Creating uprobe in '/' binary should fail with -EBADF. */ + link_opts.uprobe_multi.path = "/"; + link_opts.uprobe_multi.offsets = &offset; + link_opts.uprobe_multi.cnt = 1; + + link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts); + err = -errno; /* close() can clobber errno */ + + if (link_fd >= 0) + close(link_fd); + close(prog_fd); + + return link_fd < 0 && err == -EBADF; +} + +static int probe_kern_bpf_cookie(void) +{ + struct bpf_insn insns[] = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie), + BPF_EXIT_INSN(), + }; + int ret, insn_cnt = ARRAY_SIZE(insns); + + ret = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", insns, insn_cnt, NULL); + return probe_fd(ret); +} + +static int probe_kern_btf_enum64(void) +{ + static const char strs[] = "\0enum64"; + __u32 types[] = { + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_syscall_wrapper(void); + +enum kern_feature_result { + FEAT_UNKNOWN = 0, + FEAT_SUPPORTED = 1, + FEAT_MISSING = 2, +}; + +typedef int (*feature_probe_fn)(void); + +static struct kern_feature_desc { + const char *desc; + feature_probe_fn probe; + enum kern_feature_result res; +} feature_probes[__FEAT_CNT] = { + [FEAT_PROG_NAME] = { + "BPF program name", probe_kern_prog_name, + }, + [FEAT_GLOBAL_DATA] = { + "global variables", probe_kern_global_data, + }, + [FEAT_BTF] = { + "minimal BTF", probe_kern_btf, + }, + [FEAT_BTF_FUNC] = { + "BTF functions", probe_kern_btf_func, + }, + [FEAT_BTF_GLOBAL_FUNC] = { + "BTF global function", probe_kern_btf_func_global, + }, + [FEAT_BTF_DATASEC] = { + "BTF data section and variable", probe_kern_btf_datasec, + }, + [FEAT_ARRAY_MMAP] = { + "ARRAY map mmap()", probe_kern_array_mmap, + }, + [FEAT_EXP_ATTACH_TYPE] = { + "BPF_PROG_LOAD expected_attach_type attribute", + probe_kern_exp_attach_type, + }, + [FEAT_PROBE_READ_KERN] = { + "bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel, + }, + [FEAT_PROG_BIND_MAP] = { + "BPF_PROG_BIND_MAP support", probe_prog_bind_map, + }, + [FEAT_MODULE_BTF] = { + "module BTF support", probe_module_btf, + }, + [FEAT_BTF_FLOAT] = { + "BTF_KIND_FLOAT support", probe_kern_btf_float, + }, + [FEAT_PERF_LINK] = { + "BPF perf link support", probe_perf_link, + 
}, + [FEAT_BTF_DECL_TAG] = { + "BTF_KIND_DECL_TAG support", probe_kern_btf_decl_tag, + }, + [FEAT_BTF_TYPE_TAG] = { + "BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag, + }, + [FEAT_MEMCG_ACCOUNT] = { + "memcg-based memory accounting", probe_memcg_account, + }, + [FEAT_BPF_COOKIE] = { + "BPF cookie support", probe_kern_bpf_cookie, + }, + [FEAT_BTF_ENUM64] = { + "BTF_KIND_ENUM64 support", probe_kern_btf_enum64, + }, + [FEAT_SYSCALL_WRAPPER] = { + "Kernel using syscall wrapper", probe_kern_syscall_wrapper, + }, + [FEAT_UPROBE_MULTI_LINK] = { + "BPF multi-uprobe link support", probe_uprobe_multi_link, + }, +}; + bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) { + struct kern_feature_desc *feat = &feature_probes[feat_id]; + int ret; + if (obj && obj->gen_loader) /* To generate loader program assume the latest kernel * to avoid doing extra prog_load, map_create syscalls. */ return true; - if (obj->token_fd) - return feat_supported(obj->feat_cache, feat_id); + if (READ_ONCE(feat->res) == FEAT_UNKNOWN) { + ret = feat->probe(); + if (ret > 0) { + WRITE_ONCE(feat->res, FEAT_SUPPORTED); + } else if (ret == 0) { + WRITE_ONCE(feat->res, FEAT_MISSING); + } else { + pr_warn("Detection of kernel %s support failed: %d\n", feat->desc, ret); + WRITE_ONCE(feat->res, FEAT_MISSING); + } + } - return feat_supported(NULL, feat_id); + return READ_ONCE(feat->res) == FEAT_SUPPORTED; } static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd) @@ -4831,7 +5211,6 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b create_attr.map_flags = def->map_flags; create_attr.numa_node = map->numa_node; create_attr.map_extra = map->map_extra; - create_attr.token_fd = obj->token_fd; if (bpf_map__is_struct_ops(map)) create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; @@ -6667,7 +7046,6 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog load_attr.attach_btf_id = prog->attach_btf_id; load_attr.kern_version = kern_version; load_attr.prog_ifindex = prog->prog_ifindex; - load_attr.token_fd = obj->token_fd; /* specify func_info/line_info only if kernel supports them */ btf_fd = bpf_object__btf_fd(obj); @@ -7129,10 +7507,10 @@ static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) { - const char *obj_name, *kconfig, *btf_tmp_path, *token_path; + const char *obj_name, *kconfig, *btf_tmp_path; struct bpf_object *obj; char tmp_name[64]; - int err, token_fd; + int err; char *log_buf; size_t log_size; __u32 log_level; @@ -7166,28 +7544,6 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, if (log_size && !log_buf) return ERR_PTR(-EINVAL); - token_path = OPTS_GET(opts, bpf_token_path, NULL); - token_fd = OPTS_GET(opts, bpf_token_fd, -1); - /* non-empty token path can't be combined with invalid token FD */ - if (token_path && token_path[0] != '\0' && token_fd < 0) - return ERR_PTR(-EINVAL); - /* empty token path can't be combined with valid token FD */ - if (token_path && token_path[0] == '\0' && token_fd > 0) - return ERR_PTR(-EINVAL); - /* if user didn't specify bpf_token_path/bpf_token_fd explicitly, - * check if LIBBPF_BPF_TOKEN_PATH envvar was set and treat it as - * bpf_token_path option - */ - if (token_fd == 0 && !token_path) - token_path = getenv("LIBBPF_BPF_TOKEN_PATH"); - /* empty token_path is equivalent 
to invalid token_fd */ - if (token_path && token_path[0] == '\0') { - token_path = NULL; - token_fd = -1; - } - if (token_path && strlen(token_path) >= PATH_MAX) - return ERR_PTR(-ENAMETOOLONG); - obj = bpf_object__new(path, obj_buf, obj_buf_sz, obj_name); if (IS_ERR(obj)) return obj; @@ -7196,19 +7552,6 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, obj->log_size = log_size; obj->log_level = log_level; - obj->token_fd = token_fd <= 0 ? token_fd : dup_good_fd(token_fd); - if (token_fd > 0 && obj->token_fd < 0) { - err = -errno; - goto out; - } - if (token_path) { - obj->token_path = strdup(token_path); - if (!obj->token_path) { - err = -ENOMEM; - goto out; - } - } - btf_tmp_path = OPTS_GET(opts, btf_custom_path, NULL); if (btf_tmp_path) { if (strlen(btf_tmp_path) >= PATH_MAX) { @@ -7719,8 +8062,7 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch if (obj->gen_loader) bpf_gen__init(obj->gen_loader, extra_log_level, obj->nr_programs, obj->nr_maps); - err = bpf_object_prepare_token(obj); - err = err ? : bpf_object__probe_loading(obj); + err = bpf_object__probe_loading(obj); err = err ? : bpf_object__load_vmlinux_btf(obj, false); err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); err = err ? : bpf_object__sanitize_and_load_btf(obj); @@ -8257,11 +8599,6 @@ void bpf_object__close(struct bpf_object *obj) } zfree(&obj->programs); - zfree(&obj->feat_cache); - zfree(&obj->token_path); - if (obj->token_fd > 0) - close(obj->token_fd); - free(obj); } @@ -10275,7 +10612,7 @@ static const char *arch_specific_syscall_pfx(void) #endif } -int probe_kern_syscall_wrapper(int token_fd) +static int probe_kern_syscall_wrapper(void) { char syscall_name[64]; const char *ksys_pfx; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 916904bd2a7a..6cd9c501624f 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -177,45 +177,10 @@ struct bpf_object_open_opts { * logs through its print callback. */ __u32 kernel_log_level; - /* FD of a BPF token instantiated by user through bpf_token_create() - * API. BPF object will keep dup()'ed FD internally, so passed token - * FD can be closed after BPF object/skeleton open step. - * - * Setting bpf_token_fd to negative value disables libbpf's automatic - * attempt to create BPF token from default BPF FS mount point - * (/sys/fs/bpf), in case this default behavior is undesirable. - * - * If bpf_token_path and bpf_token_fd are not specified, libbpf will - * consult LIBBPF_BPF_TOKEN_PATH environment variable. If set, it will - * be taken as a value of bpf_token_path option and will force libbpf - * to either create BPF token from provided custom BPF FS path, or - * will disable implicit BPF token creation, if envvar value is an - * empty string. - * - * bpf_token_path and bpf_token_fd are mutually exclusive and only one - * of those options should be set. Either of them overrides - * LIBBPF_BPF_TOKEN_PATH envvar. - */ - int bpf_token_fd; - /* Path to BPF FS mount point to derive BPF token from. - * - * Created BPF token will be used for all bpf() syscall operations - * that accept BPF token (e.g., map creation, BTF and program loads, - * etc) automatically within instantiated BPF object. - * - * Setting bpf_token_path option to empty string disables libbpf's - * automatic attempt to create BPF token from default BPF FS mount - * point (/sys/fs/bpf), in case this default behavior is undesirable. 
- * - * bpf_token_path and bpf_token_fd are mutually exclusive and only one - * of those options should be set. Either of them overrides - * LIBBPF_BPF_TOKEN_PATH envvar. - */ - const char *bpf_token_path; size_t :0; }; -#define bpf_object_open_opts__last_field bpf_token_path +#define bpf_object_open_opts__last_field kernel_log_level /** * @brief **bpf_object__open()** creates a bpf_object by opening diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index df7657b65c47..91c5aef7dae7 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -401,7 +401,6 @@ LIBBPF_1.3.0 { bpf_program__attach_netkit; bpf_program__attach_tcx; bpf_program__attach_uprobe_multi; - bpf_token_create; ring__avail_data_size; ring__consume; ring__consumer_pos; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 4cda32298c49..b5d334754e5d 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -360,32 +360,15 @@ enum kern_feature_id { __FEAT_CNT, }; -enum kern_feature_result { - FEAT_UNKNOWN = 0, - FEAT_SUPPORTED = 1, - FEAT_MISSING = 2, -}; - -struct kern_feature_cache { - enum kern_feature_result res[__FEAT_CNT]; - int token_fd; -}; - -bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id); +int probe_memcg_account(void); bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id); - -int probe_kern_syscall_wrapper(int token_fd); -int probe_memcg_account(int token_fd); int bump_rlimit_memlock(void); int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz); int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); int libbpf__load_raw_btf(const char *raw_types, size_t types_len, - const char *str_sec, size_t str_len, - int token_fd); -int btf_load_into_kernel(struct btf *btf, - char *log_buf, size_t log_sz, __u32 log_level, - int token_fd); + const char *str_sec, size_t str_len); +int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level); struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf); void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, @@ -549,17 +532,6 @@ static inline bool is_ldimm64_insn(struct bpf_insn *insn) return insn->code == (BPF_LD | BPF_IMM | BPF_DW); } -/* Unconditionally dup FD, ensuring it doesn't use [0, 2] range. - * Original FD is not closed or altered in any other way. - * Preserves original FD value, if it's invalid (negative). - */ -static inline int dup_good_fd(int fd) -{ - if (fd < 0) - return fd; - return fcntl(fd, F_DUPFD_CLOEXEC, 3); -} - /* if fd is stdin, stdout, or stderr, dup to a fd greater than 2 * Takes ownership of the fd passed in, and closes it if calling * fcntl(fd, F_DUPFD_CLOEXEC, 3). 
@@ -571,7 +543,7 @@ static inline int ensure_good_fd(int fd) if (fd < 0) return fd; if (fd < 3) { - fd = dup_good_fd(fd); + fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); saved_errno = errno; close(old_fd); errno = saved_errno; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 8e7437006639..9c4db90b92b6 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -219,8 +219,7 @@ int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts) } int libbpf__load_raw_btf(const char *raw_types, size_t types_len, - const char *str_sec, size_t str_len, - int token_fd) + const char *str_sec, size_t str_len) { struct btf_header hdr = { .magic = BTF_MAGIC, @@ -230,7 +229,6 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len, .str_off = types_len, .str_len = str_len, }; - LIBBPF_OPTS(bpf_btf_load_opts, opts, .token_fd = token_fd); int btf_fd, btf_len; __u8 *raw_btf; @@ -243,7 +241,7 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len, memcpy(raw_btf + hdr.hdr_len, raw_types, hdr.type_len); memcpy(raw_btf + hdr.hdr_len + hdr.type_len, str_sec, hdr.str_len); - btf_fd = bpf_btf_load(raw_btf, btf_len, &opts); + btf_fd = bpf_btf_load(raw_btf, btf_len, NULL); free(raw_btf); return btf_fd; @@ -273,7 +271,7 @@ static int load_local_storage_btf(void) }; return libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs), 0); + strs, sizeof(strs)); } static int probe_map_create(enum bpf_map_type map_type) diff --git a/tools/lib/bpf/str_error.h b/tools/lib/bpf/str_error.h index 626d7ffb03d6..a139334d57b6 100644 --- a/tools/lib/bpf/str_error.h +++ b/tools/lib/bpf/str_error.h @@ -2,8 +2,5 @@ #ifndef __LIBBPF_STR_ERROR_H #define __LIBBPF_STR_ERROR_H -#define STRERR_BUFSIZE 128 - char *libbpf_strerror_r(int err, char *dst, int len); - #endif /* __LIBBPF_STR_ERROR_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c index 4ed46ed58a7b..9f766ddd946a 100644 --- a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c +++ b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c @@ -30,8 +30,6 @@ void test_libbpf_probe_prog_types(void) if (prog_type == BPF_PROG_TYPE_UNSPEC) continue; - if (strcmp(prog_type_name, "__MAX_BPF_PROG_TYPE") == 0) - continue; if (!test__start_subtest(prog_type_name)) continue; @@ -70,8 +68,6 @@ void test_libbpf_probe_map_types(void) if (map_type == BPF_MAP_TYPE_UNSPEC) continue; - if (strcmp(map_type_name, "__MAX_BPF_MAP_TYPE") == 0) - continue; if (!test__start_subtest(map_type_name)) continue; diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c index 62ea855ec4d0..eb34d612d6f8 100644 --- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c +++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c @@ -132,9 +132,6 @@ static void test_libbpf_bpf_map_type_str(void) const char *map_type_str; char buf[256]; - if (map_type == __MAX_BPF_MAP_TYPE) - continue; - map_type_name = btf__str_by_offset(btf, e->name_off); map_type_str = libbpf_bpf_map_type_str(map_type); ASSERT_OK_PTR(map_type_str, map_type_name); @@ -189,9 +186,6 @@ static void test_libbpf_bpf_prog_type_str(void) const char *prog_type_str; char buf[256]; - if (prog_type == __MAX_BPF_PROG_TYPE) - continue; - prog_type_name = btf__str_by_offset(btf, e->name_off); prog_type_str = libbpf_bpf_prog_type_str(prog_type); ASSERT_OK_PTR(prog_type_str, prog_type_name); diff --git 
a/tools/testing/selftests/bpf/prog_tests/token.c b/tools/testing/selftests/bpf/prog_tests/token.c deleted file mode 100644 index b5dce630e0e1..000000000000 --- a/tools/testing/selftests/bpf/prog_tests/token.c +++ /dev/null @@ -1,1031 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */ -#define _GNU_SOURCE -#include -#include -#include "cap_helpers.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "priv_map.skel.h" -#include "priv_prog.skel.h" -#include "dummy_st_ops_success.skel.h" - -static inline int sys_mount(const char *dev_name, const char *dir_name, - const char *type, unsigned long flags, - const void *data) -{ - return syscall(__NR_mount, dev_name, dir_name, type, flags, data); -} - -static inline int sys_fsopen(const char *fsname, unsigned flags) -{ - return syscall(__NR_fsopen, fsname, flags); -} - -static inline int sys_fspick(int dfd, const char *path, unsigned flags) -{ - return syscall(__NR_fspick, dfd, path, flags); -} - -static inline int sys_fsconfig(int fs_fd, unsigned cmd, const char *key, const void *val, int aux) -{ - return syscall(__NR_fsconfig, fs_fd, cmd, key, val, aux); -} - -static inline int sys_fsmount(int fs_fd, unsigned flags, unsigned ms_flags) -{ - return syscall(__NR_fsmount, fs_fd, flags, ms_flags); -} - -static inline int sys_move_mount(int from_dfd, const char *from_path, - int to_dfd, const char *to_path, - unsigned flags) -{ - return syscall(__NR_move_mount, from_dfd, from_path, to_dfd, to_path, flags); -} - -static int drop_priv_caps(__u64 *old_caps) -{ - return cap_disable_effective((1ULL << CAP_BPF) | - (1ULL << CAP_PERFMON) | - (1ULL << CAP_NET_ADMIN) | - (1ULL << CAP_SYS_ADMIN), old_caps); -} - -static int restore_priv_caps(__u64 old_caps) -{ - return cap_enable_effective(old_caps, NULL); -} - -static int set_delegate_mask(int fs_fd, const char *key, __u64 mask, const char *mask_str) -{ - char buf[32]; - int err; - - if (!mask_str) { - if (mask == ~0ULL) { - mask_str = "any"; - } else { - snprintf(buf, sizeof(buf), "0x%llx", (unsigned long long)mask); - mask_str = buf; - } - } - - err = sys_fsconfig(fs_fd, FSCONFIG_SET_STRING, key, - mask_str, 0); - if (err < 0) - err = -errno; - return err; -} - -#define zclose(fd) do { if (fd >= 0) close(fd); fd = -1; } while (0) - -struct bpffs_opts { - __u64 cmds; - __u64 maps; - __u64 progs; - __u64 attachs; - const char *cmds_str; - const char *maps_str; - const char *progs_str; - const char *attachs_str; -}; - -static int create_bpffs_fd(void) -{ - int fs_fd; - - /* create VFS context */ - fs_fd = sys_fsopen("bpf", 0); - ASSERT_GE(fs_fd, 0, "fs_fd"); - - return fs_fd; -} - -static int materialize_bpffs_fd(int fs_fd, struct bpffs_opts *opts) -{ - int mnt_fd, err; - - /* set up token delegation mount options */ - err = set_delegate_mask(fs_fd, "delegate_cmds", opts->cmds, opts->cmds_str); - if (!ASSERT_OK(err, "fs_cfg_cmds")) - return err; - err = set_delegate_mask(fs_fd, "delegate_maps", opts->maps, opts->maps_str); - if (!ASSERT_OK(err, "fs_cfg_maps")) - return err; - err = set_delegate_mask(fs_fd, "delegate_progs", opts->progs, opts->progs_str); - if (!ASSERT_OK(err, "fs_cfg_progs")) - return err; - err = set_delegate_mask(fs_fd, "delegate_attachs", opts->attachs, opts->attachs_str); - if (!ASSERT_OK(err, "fs_cfg_attachs")) - return err; - - /* instantiate FS object */ - err = sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); - if (err < 0) - return -errno; - - /* create 
O_PATH fd for detached mount */ - mnt_fd = sys_fsmount(fs_fd, 0, 0); - if (err < 0) - return -errno; - - return mnt_fd; -} - -/* send FD over Unix domain (AF_UNIX) socket */ -static int sendfd(int sockfd, int fd) -{ - struct msghdr msg = {}; - struct cmsghdr *cmsg; - int fds[1] = { fd }, err; - char iobuf[1]; - struct iovec io = { - .iov_base = iobuf, - .iov_len = sizeof(iobuf), - }; - union { - char buf[CMSG_SPACE(sizeof(fds))]; - struct cmsghdr align; - } u; - - msg.msg_iov = &io; - msg.msg_iovlen = 1; - msg.msg_control = u.buf; - msg.msg_controllen = sizeof(u.buf); - cmsg = CMSG_FIRSTHDR(&msg); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(sizeof(fds)); - memcpy(CMSG_DATA(cmsg), fds, sizeof(fds)); - - err = sendmsg(sockfd, &msg, 0); - if (err < 0) - err = -errno; - if (!ASSERT_EQ(err, 1, "sendmsg")) - return -EINVAL; - - return 0; -} - -/* receive FD over Unix domain (AF_UNIX) socket */ -static int recvfd(int sockfd, int *fd) -{ - struct msghdr msg = {}; - struct cmsghdr *cmsg; - int fds[1], err; - char iobuf[1]; - struct iovec io = { - .iov_base = iobuf, - .iov_len = sizeof(iobuf), - }; - union { - char buf[CMSG_SPACE(sizeof(fds))]; - struct cmsghdr align; - } u; - - msg.msg_iov = &io; - msg.msg_iovlen = 1; - msg.msg_control = u.buf; - msg.msg_controllen = sizeof(u.buf); - - err = recvmsg(sockfd, &msg, 0); - if (err < 0) - err = -errno; - if (!ASSERT_EQ(err, 1, "recvmsg")) - return -EINVAL; - - cmsg = CMSG_FIRSTHDR(&msg); - if (!ASSERT_OK_PTR(cmsg, "cmsg_null") || - !ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(fds)), "cmsg_len") || - !ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET, "cmsg_level") || - !ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS, "cmsg_type")) - return -EINVAL; - - memcpy(fds, CMSG_DATA(cmsg), sizeof(fds)); - *fd = fds[0]; - - return 0; -} - -static ssize_t write_nointr(int fd, const void *buf, size_t count) -{ - ssize_t ret; - - do { - ret = write(fd, buf, count); - } while (ret < 0 && errno == EINTR); - - return ret; -} - -static int write_file(const char *path, const void *buf, size_t count) -{ - int fd; - ssize_t ret; - - fd = open(path, O_WRONLY | O_CLOEXEC | O_NOCTTY | O_NOFOLLOW); - if (fd < 0) - return -1; - - ret = write_nointr(fd, buf, count); - close(fd); - if (ret < 0 || (size_t)ret != count) - return -1; - - return 0; -} - -static int create_and_enter_userns(void) -{ - uid_t uid; - gid_t gid; - char map[100]; - - uid = getuid(); - gid = getgid(); - - if (unshare(CLONE_NEWUSER)) - return -1; - - if (write_file("/proc/self/setgroups", "deny", sizeof("deny") - 1) && - errno != ENOENT) - return -1; - - snprintf(map, sizeof(map), "0 %d 1", uid); - if (write_file("/proc/self/uid_map", map, strlen(map))) - return -1; - - - snprintf(map, sizeof(map), "0 %d 1", gid); - if (write_file("/proc/self/gid_map", map, strlen(map))) - return -1; - - if (setgid(0)) - return -1; - - if (setuid(0)) - return -1; - - return 0; -} - -typedef int (*child_callback_fn)(int); - -static void child(int sock_fd, struct bpffs_opts *opts, child_callback_fn callback) -{ - LIBBPF_OPTS(bpf_map_create_opts, map_opts); - int mnt_fd = -1, fs_fd = -1, err = 0, bpffs_fd = -1; - - /* setup userns with root mappings */ - err = create_and_enter_userns(); - if (!ASSERT_OK(err, "create_and_enter_userns")) - goto cleanup; - - /* setup mountns to allow creating BPF FS (fsopen("bpf")) from unpriv process */ - err = unshare(CLONE_NEWNS); - if (!ASSERT_OK(err, "create_mountns")) - goto cleanup; - - err = sys_mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0); - if (!ASSERT_OK(err, 
"remount_root")) - goto cleanup; - - fs_fd = create_bpffs_fd(); - if (!ASSERT_GE(fs_fd, 0, "create_bpffs_fd")) { - err = -EINVAL; - goto cleanup; - } - - /* ensure unprivileged child cannot set delegation options */ - err = set_delegate_mask(fs_fd, "delegate_cmds", 0x1, NULL); - ASSERT_EQ(err, -EPERM, "delegate_cmd_eperm"); - err = set_delegate_mask(fs_fd, "delegate_maps", 0x1, NULL); - ASSERT_EQ(err, -EPERM, "delegate_maps_eperm"); - err = set_delegate_mask(fs_fd, "delegate_progs", 0x1, NULL); - ASSERT_EQ(err, -EPERM, "delegate_progs_eperm"); - err = set_delegate_mask(fs_fd, "delegate_attachs", 0x1, NULL); - ASSERT_EQ(err, -EPERM, "delegate_attachs_eperm"); - - /* pass BPF FS context object to parent */ - err = sendfd(sock_fd, fs_fd); - if (!ASSERT_OK(err, "send_fs_fd")) - goto cleanup; - zclose(fs_fd); - - /* avoid mucking around with mount namespaces and mounting at - * well-known path, just get detach-mounted BPF FS fd back from parent - */ - err = recvfd(sock_fd, &mnt_fd); - if (!ASSERT_OK(err, "recv_mnt_fd")) - goto cleanup; - - /* try to fspick() BPF FS and try to add some delegation options */ - fs_fd = sys_fspick(mnt_fd, "", FSPICK_EMPTY_PATH); - if (!ASSERT_GE(fs_fd, 0, "bpffs_fspick")) { - err = -EINVAL; - goto cleanup; - } - - /* ensure unprivileged child cannot reconfigure to set delegation options */ - err = set_delegate_mask(fs_fd, "delegate_cmds", 0, "any"); - if (!ASSERT_EQ(err, -EPERM, "delegate_cmd_eperm_reconfig")) { - err = -EINVAL; - goto cleanup; - } - err = set_delegate_mask(fs_fd, "delegate_maps", 0, "any"); - if (!ASSERT_EQ(err, -EPERM, "delegate_maps_eperm_reconfig")) { - err = -EINVAL; - goto cleanup; - } - err = set_delegate_mask(fs_fd, "delegate_progs", 0, "any"); - if (!ASSERT_EQ(err, -EPERM, "delegate_progs_eperm_reconfig")) { - err = -EINVAL; - goto cleanup; - } - err = set_delegate_mask(fs_fd, "delegate_attachs", 0, "any"); - if (!ASSERT_EQ(err, -EPERM, "delegate_attachs_eperm_reconfig")) { - err = -EINVAL; - goto cleanup; - } - zclose(fs_fd); - - bpffs_fd = openat(mnt_fd, ".", 0, O_RDWR); - if (!ASSERT_GE(bpffs_fd, 0, "bpffs_open")) { - err = -EINVAL; - goto cleanup; - } - - /* do custom test logic with customly set up BPF FS instance */ - err = callback(bpffs_fd); - if (!ASSERT_OK(err, "test_callback")) - goto cleanup; - - err = 0; -cleanup: - zclose(sock_fd); - zclose(mnt_fd); - zclose(fs_fd); - zclose(bpffs_fd); - - exit(-err); -} - -static int wait_for_pid(pid_t pid) -{ - int status, ret; - -again: - ret = waitpid(pid, &status, 0); - if (ret == -1) { - if (errno == EINTR) - goto again; - - return -1; - } - - if (!WIFEXITED(status)) - return -1; - - return WEXITSTATUS(status); -} - -static void parent(int child_pid, struct bpffs_opts *bpffs_opts, int sock_fd) -{ - int fs_fd = -1, mnt_fd = -1, err; - - err = recvfd(sock_fd, &fs_fd); - if (!ASSERT_OK(err, "recv_bpffs_fd")) - goto cleanup; - - mnt_fd = materialize_bpffs_fd(fs_fd, bpffs_opts); - if (!ASSERT_GE(mnt_fd, 0, "materialize_bpffs_fd")) { - err = -EINVAL; - goto cleanup; - } - zclose(fs_fd); - - /* pass BPF FS context object to parent */ - err = sendfd(sock_fd, mnt_fd); - if (!ASSERT_OK(err, "send_mnt_fd")) - goto cleanup; - zclose(mnt_fd); - - err = wait_for_pid(child_pid); - ASSERT_OK(err, "waitpid_child"); - -cleanup: - zclose(sock_fd); - zclose(fs_fd); - zclose(mnt_fd); - - if (child_pid > 0) - (void)kill(child_pid, SIGKILL); -} - -static void subtest_userns(struct bpffs_opts *bpffs_opts, child_callback_fn cb) -{ - int sock_fds[2] = { -1, -1 }; - int child_pid = 0, err; - - err = 
socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds); - if (!ASSERT_OK(err, "socketpair")) - goto cleanup; - - child_pid = fork(); - if (!ASSERT_GE(child_pid, 0, "fork")) - goto cleanup; - - if (child_pid == 0) { - zclose(sock_fds[0]); - return child(sock_fds[1], bpffs_opts, cb); - - } else { - zclose(sock_fds[1]); - return parent(child_pid, bpffs_opts, sock_fds[0]); - } - -cleanup: - zclose(sock_fds[0]); - zclose(sock_fds[1]); - if (child_pid > 0) - (void)kill(child_pid, SIGKILL); -} - -static int userns_map_create(int mnt_fd) -{ - LIBBPF_OPTS(bpf_map_create_opts, map_opts); - int err, token_fd = -1, map_fd = -1; - __u64 old_caps = 0; - - /* create BPF token from BPF FS mount */ - token_fd = bpf_token_create(mnt_fd, NULL); - if (!ASSERT_GT(token_fd, 0, "token_create")) { - err = -EINVAL; - goto cleanup; - } - - /* while inside non-init userns, we need both a BPF token *and* - * CAP_BPF inside current userns to create privileged map; let's test - * that neither BPF token alone nor namespaced CAP_BPF is sufficient - */ - err = drop_priv_caps(&old_caps); - if (!ASSERT_OK(err, "drop_caps")) - goto cleanup; - - /* no token, no CAP_BPF -> fail */ - map_opts.token_fd = 0; - map_fd = bpf_map_create(BPF_MAP_TYPE_STACK, "wo_token_wo_bpf", 0, 8, 1, &map_opts); - if (!ASSERT_LT(map_fd, 0, "stack_map_wo_token_wo_cap_bpf_should_fail")) { - err = -EINVAL; - goto cleanup; - } - - /* token without CAP_BPF -> fail */ - map_opts.token_fd = token_fd; - map_fd = bpf_map_create(BPF_MAP_TYPE_STACK, "w_token_wo_bpf", 0, 8, 1, &map_opts); - if (!ASSERT_LT(map_fd, 0, "stack_map_w_token_wo_cap_bpf_should_fail")) { - err = -EINVAL; - goto cleanup; - } - - /* get back effective local CAP_BPF (and CAP_SYS_ADMIN) */ - err = restore_priv_caps(old_caps); - if (!ASSERT_OK(err, "restore_caps")) - goto cleanup; - - /* CAP_BPF without token -> fail */ - map_opts.token_fd = 0; - map_fd = bpf_map_create(BPF_MAP_TYPE_STACK, "wo_token_w_bpf", 0, 8, 1, &map_opts); - if (!ASSERT_LT(map_fd, 0, "stack_map_wo_token_w_cap_bpf_should_fail")) { - err = -EINVAL; - goto cleanup; - } - - /* finally, namespaced CAP_BPF + token -> success */ - map_opts.token_fd = token_fd; - map_fd = bpf_map_create(BPF_MAP_TYPE_STACK, "w_token_w_bpf", 0, 8, 1, &map_opts); - if (!ASSERT_GT(map_fd, 0, "stack_map_w_token_w_cap_bpf")) { - err = -EINVAL; - goto cleanup; - } - -cleanup: - zclose(token_fd); - zclose(map_fd); - return err; -} - -static int userns_btf_load(int mnt_fd) -{ - LIBBPF_OPTS(bpf_btf_load_opts, btf_opts); - int err, token_fd = -1, btf_fd = -1; - const void *raw_btf_data; - struct btf *btf = NULL; - __u32 raw_btf_size; - __u64 old_caps = 0; - - /* create BPF token from BPF FS mount */ - token_fd = bpf_token_create(mnt_fd, NULL); - if (!ASSERT_GT(token_fd, 0, "token_create")) { - err = -EINVAL; - goto cleanup; - } - - /* while inside non-init userns, we need both a BPF token *and* - * CAP_BPF inside current userns to create privileged map; let's test - * that neither BPF token alone nor namespaced CAP_BPF is sufficient - */ - err = drop_priv_caps(&old_caps); - if (!ASSERT_OK(err, "drop_caps")) - goto cleanup; - - /* setup a trivial BTF data to load to the kernel */ - btf = btf__new_empty(); - if (!ASSERT_OK_PTR(btf, "empty_btf")) - goto cleanup; - - ASSERT_GT(btf__add_int(btf, "int", 4, 0), 0, "int_type"); - - raw_btf_data = btf__raw_data(btf, &raw_btf_size); - if (!ASSERT_OK_PTR(raw_btf_data, "raw_btf_data")) - goto cleanup; - - /* no token + no CAP_BPF -> failure */ - btf_opts.token_fd = 0; - btf_fd = bpf_btf_load(raw_btf_data, raw_btf_size, 
&btf_opts); - if (!ASSERT_LT(btf_fd, 0, "no_token_no_cap_should_fail")) - goto cleanup; - - /* token + no CAP_BPF -> failure */ - btf_opts.token_fd = token_fd; - btf_fd = bpf_btf_load(raw_btf_data, raw_btf_size, &btf_opts); - if (!ASSERT_LT(btf_fd, 0, "token_no_cap_should_fail")) - goto cleanup; - - /* get back effective local CAP_BPF (and CAP_SYS_ADMIN) */ - err = restore_priv_caps(old_caps); - if (!ASSERT_OK(err, "restore_caps")) - goto cleanup; - - /* token + CAP_BPF -> success */ - btf_opts.token_fd = token_fd; - btf_fd = bpf_btf_load(raw_btf_data, raw_btf_size, &btf_opts); - if (!ASSERT_GT(btf_fd, 0, "token_and_cap_success")) - goto cleanup; - - err = 0; -cleanup: - btf__free(btf); - zclose(btf_fd); - zclose(token_fd); - return err; -} - -static int userns_prog_load(int mnt_fd) -{ - LIBBPF_OPTS(bpf_prog_load_opts, prog_opts); - int err, token_fd = -1, prog_fd = -1; - struct bpf_insn insns[] = { - /* bpf_jiffies64() requires CAP_BPF */ - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), - /* bpf_get_current_task() requires CAP_PERFMON */ - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_current_task), - /* r0 = 0; exit; */ - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }; - size_t insn_cnt = ARRAY_SIZE(insns); - __u64 old_caps = 0; - - /* create BPF token from BPF FS mount */ - token_fd = bpf_token_create(mnt_fd, NULL); - if (!ASSERT_GT(token_fd, 0, "token_create")) { - err = -EINVAL; - goto cleanup; - } - - /* validate we can successfully load BPF program with token; this - * being XDP program (CAP_NET_ADMIN) using bpf_jiffies64() (CAP_BPF) - * and bpf_get_current_task() (CAP_PERFMON) helpers validates we have - * BPF token wired properly in a bunch of places in the kernel - */ - prog_opts.token_fd = token_fd; - prog_opts.expected_attach_type = BPF_XDP; - prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "token_prog", "GPL", - insns, insn_cnt, &prog_opts); - if (!ASSERT_GT(prog_fd, 0, "prog_fd")) { - err = -EPERM; - goto cleanup; - } - - /* no token + caps -> failure */ - prog_opts.token_fd = 0; - prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "token_prog", "GPL", - insns, insn_cnt, &prog_opts); - if (!ASSERT_EQ(prog_fd, -EPERM, "prog_fd_eperm")) { - err = -EPERM; - goto cleanup; - } - - err = drop_priv_caps(&old_caps); - if (!ASSERT_OK(err, "drop_caps")) - goto cleanup; - - /* no caps + token -> failure */ - prog_opts.token_fd = token_fd; - prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "token_prog", "GPL", - insns, insn_cnt, &prog_opts); - if (!ASSERT_EQ(prog_fd, -EPERM, "prog_fd_eperm")) { - err = -EPERM; - goto cleanup; - } - - /* no caps + no token -> definitely a failure */ - prog_opts.token_fd = 0; - prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "token_prog", "GPL", - insns, insn_cnt, &prog_opts); - if (!ASSERT_EQ(prog_fd, -EPERM, "prog_fd_eperm")) { - err = -EPERM; - goto cleanup; - } - - err = 0; -cleanup: - zclose(prog_fd); - zclose(token_fd); - return err; -} - -static int userns_obj_priv_map(int mnt_fd) -{ - LIBBPF_OPTS(bpf_object_open_opts, opts); - char buf[256]; - struct priv_map *skel; - int err, token_fd; - - skel = priv_map__open_and_load(); - if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load")) { - priv_map__destroy(skel); - return -EINVAL; - } - - /* use bpf_token_path to provide BPF FS path */ - snprintf(buf, sizeof(buf), "/proc/self/fd/%d", mnt_fd); - opts.bpf_token_path = buf; - skel = priv_map__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "obj_token_path_open")) - return -EINVAL; - - err = priv_map__load(skel); - priv_map__destroy(skel); - if 
(!ASSERT_OK(err, "obj_token_path_load")) - return -EINVAL; - - /* create token and pass it through bpf_token_fd */ - token_fd = bpf_token_create(mnt_fd, NULL); - if (!ASSERT_GT(token_fd, 0, "create_token")) - return -EINVAL; - - opts.bpf_token_path = NULL; - opts.bpf_token_fd = token_fd; - skel = priv_map__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "obj_token_fd_open")) - return -EINVAL; - - /* we can close our token FD, bpf_object owns dup()'ed FD now */ - close(token_fd); - - err = priv_map__load(skel); - priv_map__destroy(skel); - if (!ASSERT_OK(err, "obj_token_fd_load")) - return -EINVAL; - - return 0; -} - -static int userns_obj_priv_prog(int mnt_fd) -{ - LIBBPF_OPTS(bpf_object_open_opts, opts); - char buf[256]; - struct priv_prog *skel; - int err; - - skel = priv_prog__open_and_load(); - if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load")) { - priv_prog__destroy(skel); - return -EINVAL; - } - - /* use bpf_token_path to provide BPF FS path */ - snprintf(buf, sizeof(buf), "/proc/self/fd/%d", mnt_fd); - opts.bpf_token_path = buf; - skel = priv_prog__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "obj_token_path_open")) - return -EINVAL; - - err = priv_prog__load(skel); - priv_prog__destroy(skel); - if (!ASSERT_OK(err, "obj_token_path_load")) - return -EINVAL; - - return 0; -} - -/* this test is called with BPF FS that doesn't delegate BPF_BTF_LOAD command, - * which should cause struct_ops application to fail, as BTF won't be uploaded - * into the kernel, even if STRUCT_OPS programs themselves are allowed - */ -static int validate_struct_ops_load(int mnt_fd, bool expect_success) -{ - LIBBPF_OPTS(bpf_object_open_opts, opts); - char buf[256]; - struct dummy_st_ops_success *skel; - int err; - - snprintf(buf, sizeof(buf), "/proc/self/fd/%d", mnt_fd); - opts.bpf_token_path = buf; - skel = dummy_st_ops_success__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "obj_token_path_open")) - return -EINVAL; - - err = dummy_st_ops_success__load(skel); - dummy_st_ops_success__destroy(skel); - if (expect_success) { - if (!ASSERT_OK(err, "obj_token_path_load")) - return -EINVAL; - } else /* expect failure */ { - if (!ASSERT_ERR(err, "obj_token_path_load")) - return -EINVAL; - } - - return 0; -} - -static int userns_obj_priv_btf_fail(int mnt_fd) -{ - return validate_struct_ops_load(mnt_fd, false /* should fail */); -} - -static int userns_obj_priv_btf_success(int mnt_fd) -{ - return validate_struct_ops_load(mnt_fd, true /* should succeed */); -} - -#define TOKEN_ENVVAR "LIBBPF_BPF_TOKEN_PATH" -#define TOKEN_BPFFS_CUSTOM "/bpf-token-fs" - -static int userns_obj_priv_implicit_token(int mnt_fd) -{ - LIBBPF_OPTS(bpf_object_open_opts, opts); - struct dummy_st_ops_success *skel; - int err; - - /* before we mount BPF FS with token delegation, struct_ops skeleton - * should fail to load - */ - skel = dummy_st_ops_success__open_and_load(); - if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load")) { - dummy_st_ops_success__destroy(skel); - return -EINVAL; - } - - /* mount custom BPF FS over /sys/fs/bpf so that libbpf can create BPF - * token automatically and implicitly - */ - err = sys_move_mount(mnt_fd, "", AT_FDCWD, "/sys/fs/bpf", MOVE_MOUNT_F_EMPTY_PATH); - if (!ASSERT_OK(err, "move_mount_bpffs")) - return -EINVAL; - - /* disable implicit BPF token creation by setting - * LIBBPF_BPF_TOKEN_PATH envvar to empty value, load should fail - */ - err = setenv(TOKEN_ENVVAR, "", 1 /*overwrite*/); - if (!ASSERT_OK(err, "setenv_token_path")) - return -EINVAL; - skel = dummy_st_ops_success__open_and_load(); - if (!ASSERT_ERR_PTR(skel, 
"obj_token_envvar_disabled_load")) { - unsetenv(TOKEN_ENVVAR); - dummy_st_ops_success__destroy(skel); - return -EINVAL; - } - unsetenv(TOKEN_ENVVAR); - - /* now the same struct_ops skeleton should succeed thanks to libppf - * creating BPF token from /sys/fs/bpf mount point - */ - skel = dummy_st_ops_success__open_and_load(); - if (!ASSERT_OK_PTR(skel, "obj_implicit_token_load")) - return -EINVAL; - - dummy_st_ops_success__destroy(skel); - - /* now disable implicit token through empty bpf_token_path, should fail */ - opts.bpf_token_path = ""; - skel = dummy_st_ops_success__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "obj_empty_token_path_open")) - return -EINVAL; - - err = dummy_st_ops_success__load(skel); - dummy_st_ops_success__destroy(skel); - if (!ASSERT_ERR(err, "obj_empty_token_path_load")) - return -EINVAL; - - /* now disable implicit token through negative bpf_token_fd, should fail */ - opts.bpf_token_path = NULL; - opts.bpf_token_fd = -1; - skel = dummy_st_ops_success__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "obj_neg_token_fd_open")) - return -EINVAL; - - err = dummy_st_ops_success__load(skel); - dummy_st_ops_success__destroy(skel); - if (!ASSERT_ERR(err, "obj_neg_token_fd_load")) - return -EINVAL; - - return 0; -} - -static int userns_obj_priv_implicit_token_envvar(int mnt_fd) -{ - LIBBPF_OPTS(bpf_object_open_opts, opts); - struct dummy_st_ops_success *skel; - int err; - - /* before we mount BPF FS with token delegation, struct_ops skeleton - * should fail to load - */ - skel = dummy_st_ops_success__open_and_load(); - if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load")) { - dummy_st_ops_success__destroy(skel); - return -EINVAL; - } - - /* mount custom BPF FS over custom location, so libbpf can't create - * BPF token implicitly, unless pointed to it through - * LIBBPF_BPF_TOKEN_PATH envvar - */ - rmdir(TOKEN_BPFFS_CUSTOM); - if (!ASSERT_OK(mkdir(TOKEN_BPFFS_CUSTOM, 0777), "mkdir_bpffs_custom")) - goto err_out; - err = sys_move_mount(mnt_fd, "", AT_FDCWD, TOKEN_BPFFS_CUSTOM, MOVE_MOUNT_F_EMPTY_PATH); - if (!ASSERT_OK(err, "move_mount_bpffs")) - goto err_out; - - /* even though we have BPF FS with delegation, it's not at default - * /sys/fs/bpf location, so we still fail to load until envvar is set up - */ - skel = dummy_st_ops_success__open_and_load(); - if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load2")) { - dummy_st_ops_success__destroy(skel); - goto err_out; - } - - err = setenv(TOKEN_ENVVAR, TOKEN_BPFFS_CUSTOM, 1 /*overwrite*/); - if (!ASSERT_OK(err, "setenv_token_path")) - goto err_out; - - /* now the same struct_ops skeleton should succeed thanks to libppf - * creating BPF token from custom mount point - */ - skel = dummy_st_ops_success__open_and_load(); - if (!ASSERT_OK_PTR(skel, "obj_implicit_token_load")) - goto err_out; - - dummy_st_ops_success__destroy(skel); - - /* now disable implicit token through empty bpf_token_path, envvar - * will be ignored, should fail - */ - opts.bpf_token_path = ""; - skel = dummy_st_ops_success__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "obj_empty_token_path_open")) - goto err_out; - - err = dummy_st_ops_success__load(skel); - dummy_st_ops_success__destroy(skel); - if (!ASSERT_ERR(err, "obj_empty_token_path_load")) - goto err_out; - - /* now disable implicit token through negative bpf_token_fd, envvar - * will be ignored, should fail - */ - opts.bpf_token_path = NULL; - opts.bpf_token_fd = -1; - skel = dummy_st_ops_success__open_opts(&opts); - if (!ASSERT_OK_PTR(skel, "obj_neg_token_fd_open")) - goto err_out; - - err = 
dummy_st_ops_success__load(skel); - dummy_st_ops_success__destroy(skel); - if (!ASSERT_ERR(err, "obj_neg_token_fd_load")) - goto err_out; - - rmdir(TOKEN_BPFFS_CUSTOM); - unsetenv(TOKEN_ENVVAR); - return 0; -err_out: - rmdir(TOKEN_BPFFS_CUSTOM); - unsetenv(TOKEN_ENVVAR); - return -EINVAL; -} - -#define bit(n) (1ULL << (n)) - -void test_token(void) -{ - if (test__start_subtest("map_token")) { - struct bpffs_opts opts = { - .cmds_str = "map_create", - .maps_str = "stack", - }; - - subtest_userns(&opts, userns_map_create); - } - if (test__start_subtest("btf_token")) { - struct bpffs_opts opts = { - .cmds = 1ULL << BPF_BTF_LOAD, - }; - - subtest_userns(&opts, userns_btf_load); - } - if (test__start_subtest("prog_token")) { - struct bpffs_opts opts = { - .cmds_str = "PROG_LOAD", - .progs_str = "XDP", - .attachs_str = "xdp", - }; - - subtest_userns(&opts, userns_prog_load); - } - if (test__start_subtest("obj_priv_map")) { - struct bpffs_opts opts = { - .cmds = bit(BPF_MAP_CREATE), - .maps = bit(BPF_MAP_TYPE_QUEUE), - }; - - subtest_userns(&opts, userns_obj_priv_map); - } - if (test__start_subtest("obj_priv_prog")) { - struct bpffs_opts opts = { - .cmds = bit(BPF_PROG_LOAD), - .progs = bit(BPF_PROG_TYPE_KPROBE), - .attachs = ~0ULL, - }; - - subtest_userns(&opts, userns_obj_priv_prog); - } - if (test__start_subtest("obj_priv_btf_fail")) { - struct bpffs_opts opts = { - /* disallow BTF loading */ - .cmds = bit(BPF_MAP_CREATE) | bit(BPF_PROG_LOAD), - .maps = bit(BPF_MAP_TYPE_STRUCT_OPS), - .progs = bit(BPF_PROG_TYPE_STRUCT_OPS), - .attachs = ~0ULL, - }; - - subtest_userns(&opts, userns_obj_priv_btf_fail); - } - if (test__start_subtest("obj_priv_btf_success")) { - struct bpffs_opts opts = { - /* allow BTF loading */ - .cmds = bit(BPF_BTF_LOAD) | bit(BPF_MAP_CREATE) | bit(BPF_PROG_LOAD), - .maps = bit(BPF_MAP_TYPE_STRUCT_OPS), - .progs = bit(BPF_PROG_TYPE_STRUCT_OPS), - .attachs = ~0ULL, - }; - - subtest_userns(&opts, userns_obj_priv_btf_success); - } - if (test__start_subtest("obj_priv_implicit_token")) { - struct bpffs_opts opts = { - /* allow BTF loading */ - .cmds = bit(BPF_BTF_LOAD) | bit(BPF_MAP_CREATE) | bit(BPF_PROG_LOAD), - .maps = bit(BPF_MAP_TYPE_STRUCT_OPS), - .progs = bit(BPF_PROG_TYPE_STRUCT_OPS), - .attachs = ~0ULL, - }; - - subtest_userns(&opts, userns_obj_priv_implicit_token); - } - if (test__start_subtest("obj_priv_implicit_token_envvar")) { - struct bpffs_opts opts = { - /* allow BTF loading */ - .cmds = bit(BPF_BTF_LOAD) | bit(BPF_MAP_CREATE) | bit(BPF_PROG_LOAD), - .maps = bit(BPF_MAP_TYPE_STRUCT_OPS), - .progs = bit(BPF_PROG_TYPE_STRUCT_OPS), - .attachs = ~0ULL, - }; - - subtest_userns(&opts, userns_obj_priv_implicit_token_envvar); - } -} diff --git a/tools/testing/selftests/bpf/progs/priv_map.c b/tools/testing/selftests/bpf/progs/priv_map.c deleted file mode 100644 index 9085be50f03b..000000000000 --- a/tools/testing/selftests/bpf/progs/priv_map.c +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
*/
-
-#include "vmlinux.h"
-#include <bpf/bpf_helpers.h>
-
-char _license[] SEC("license") = "GPL";
-
-struct {
-	__uint(type, BPF_MAP_TYPE_QUEUE);
-	__uint(max_entries, 1);
-	__type(value, __u32);
-} priv_map SEC(".maps");
diff --git a/tools/testing/selftests/bpf/progs/priv_prog.c b/tools/testing/selftests/bpf/progs/priv_prog.c
deleted file mode 100644
index 3c7b2b618c8a..000000000000
--- a/tools/testing/selftests/bpf/progs/priv_prog.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
-
-#include "vmlinux.h"
-#include <bpf/bpf_helpers.h>
-
-char _license[] SEC("license") = "GPL";
-
-SEC("kprobe")
-int kprobe_prog(void *ctx)
-{
-	return 1;
-}
-- cgit

From 489c693bd04a2308865dc50f37bd0b5f6ad52deb Mon Sep 17 00:00:00 2001
From: Chen Haonan
Date: Tue, 19 Dec 2023 21:06:25 +0800
Subject: PM: hibernate: Use kmap_local_page() in copy_data_page()

kmap_atomic() has been deprecated in favor of kmap_local_page().
kmap_atomic() disables page faults and preemption (the latter only for
!PREEMPT_RT kernels). The code between the mapping and un-mapping in
this patch does not depend on the above-mentioned side effects. So
simply replace kmap_atomic() with kmap_local_page().

Signed-off-by: Chen Haonan
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/snapshot.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index e3e8f1c6e75f..5c96ff067c64 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1487,11 +1487,11 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
 	s_page = pfn_to_page(src_pfn);
 	d_page = pfn_to_page(dst_pfn);
 	if (PageHighMem(s_page)) {
-		src = kmap_atomic(s_page);
-		dst = kmap_atomic(d_page);
+		src = kmap_local_page(s_page);
+		dst = kmap_local_page(d_page);
 		zeros_only = do_copy_page(dst, src);
-		kunmap_atomic(dst);
-		kunmap_atomic(src);
+		kunmap_local(dst);
+		kunmap_local(src);
 	} else {
 		if (PageHighMem(d_page)) {
 			/*
@@ -1499,9 +1499,9 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
 			 * data modified by kmap_atomic()
 			 */
 			zeros_only = safe_copy_page(buffer, s_page);
-			dst = kmap_atomic(d_page);
+			dst = kmap_local_page(d_page);
 			copy_page(dst, buffer);
-			kunmap_atomic(dst);
+			kunmap_local(dst);
 		} else {
 			zeros_only = safe_copy_page(page_address(d_page), s_page);
 		}
-- cgit
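As a quick illustration of the conversion in the patch above: kmap_local_page() mappings are CPU-local and must be released in reverse (stack) order, but unlike kmap_atomic() they leave preemption and page faults enabled. The sketch below is editor-provided and hypothetical, not code from any patch in this series; copy_highmem_page_example() is a made-up name, and the snippet assumes a kernel context in which both pages may live in highmem.

/* Minimal sketch (assumption: kernel code, both pages possibly in highmem).
 * kmap_local_page() gives a temporary, CPU-local kernel mapping; the
 * matching kunmap_local() calls must be issued in reverse mapping order.
 */
static void copy_highmem_page_example(struct page *dst_page, struct page *src_page)
{
	void *src = kmap_local_page(src_page);
	void *dst = kmap_local_page(dst_page);

	copy_page(dst, src);	/* plain copy of PAGE_SIZE bytes */

	kunmap_local(dst);	/* unmap in reverse order of mapping */
	kunmap_local(src);
}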
From f17f2c13d613cbeef529b03ca17ae2581b2e6cb8 Mon Sep 17 00:00:00 2001
From: Kevin Hao
Date: Fri, 8 Dec 2023 16:29:34 +0800
Subject: module: Remove redundant TASK_UNINTERRUPTIBLE

TASK_KILLABLE already includes TASK_UNINTERRUPTIBLE, so there is no
need to add a separate TASK_UNINTERRUPTIBLE.

Signed-off-by: Kevin Hao
Signed-off-by: Luis Chamberlain
---
 kernel/module/dups.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module/dups.c b/kernel/module/dups.c
index f3d7ea1e96d8..9a92f2f8c9d3 100644
--- a/kernel/module/dups.c
+++ b/kernel/module/dups.c
@@ -207,7 +207,7 @@ bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
 	 * optimization enabled ...
 	 */
 	ret = wait_for_completion_state(&kmod_req->first_req_done,
-					TASK_UNINTERRUPTIBLE | TASK_KILLABLE);
+					TASK_KILLABLE);
 	if (ret) {
 		*dup_ret = ret;
 		return true;
 	}
-- cgit

From d028f87517d6775dccff4ddbca2740826f9e53f1 Mon Sep 17 00:00:00 2001
From: Menglong Dong
Date: Tue, 19 Dec 2023 21:47:57 +0800
Subject: bpf: make the verifier track the "not equal" for regs

We can derive some new information for BPF_JNE in regs_refine_cond_op().
Take the following code for example:

	/* The type of "a" is u32 */
	if (a > 0 && a < 100) {
		/* the range of the register for a is [0, 99], not [1, 99],
		 * and will cause the following error:
		 *
		 *   invalid zero-sized read
		 *
		 * as a can be 0.
		 */
		bpf_skb_store_bytes(skb, xx, xx, a, 0);
	}

In the code above, "a > 0" will be compiled to "jmp xxx if a == 0". In
the TRUE branch, the dst_reg will be marked as known to be 0. However,
in the fallthrough (FALSE) branch, the dst_reg will not be handled,
which leaves the [min, max] for a as [0, 99] instead of [1, 99].

For BPF_JNE, we can reduce the range of the dst reg if the src reg is a
const and is exactly the edge of the dst reg.

Signed-off-by: Menglong Dong
Acked-by: Andrii Nakryiko
Acked-by: Shung-Hsi Yu
Link: https://lore.kernel.org/r/20231219134800.1550388-2-menglong8.dong@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4ceec8c2a484..df1cae459c77 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14336,7 +14336,43 @@ again:
 		}
 		break;
 	case BPF_JNE:
-		/* we don't derive any new information for inequality yet */
+		if (!is_reg_const(reg2, is_jmp32))
+			swap(reg1, reg2);
+		if (!is_reg_const(reg2, is_jmp32))
+			break;
+
+		/* try to recompute the bound of reg1 if reg2 is a const and
+		 * is exactly the edge of reg1.
+		 */
+		val = reg_const_value(reg2, is_jmp32);
+		if (is_jmp32) {
+			/* u32_min_value is not equal to 0xffffffff at this point,
+			 * because otherwise u32_max_value is 0xffffffff as well,
+			 * in such a case both reg1 and reg2 would be constants,
+			 * jump would be predicted and reg_set_min_max() won't
+			 * be called.
+			 *
+			 * Same reasoning works for all {u,s}{min,max}{32,64} cases
+			 * below.
+			 */
+			if (reg1->u32_min_value == (u32)val)
+				reg1->u32_min_value++;
+			if (reg1->u32_max_value == (u32)val)
+				reg1->u32_max_value--;
+			if (reg1->s32_min_value == (s32)val)
+				reg1->s32_min_value++;
+			if (reg1->s32_max_value == (s32)val)
+				reg1->s32_max_value--;
+		} else {
+			if (reg1->umin_value == (u64)val)
+				reg1->umin_value++;
+			if (reg1->umax_value == (u64)val)
+				reg1->umax_value--;
+			if (reg1->smin_value == (s64)val)
+				reg1->smin_value++;
+			if (reg1->smax_value == (s64)val)
+				reg1->smax_value--;
+		}
 		break;
 	case BPF_JSET:
 		if (!is_reg_const(reg2, is_jmp32))
-- cgit
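The refinement above only fires when the constant sits exactly on an edge of the other register's range: "r != val" cannot punch a hole in the middle of an interval, but it can shave one value off a boundary. Below is a hedged, userspace-only sketch of that rule (editor-provided, with simplified types and made-up names; the real code adjusts all {u,s}{32,64} bound pairs of struct bpf_reg_state, as the hunk above shows):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for one pair of bpf_reg_state bounds. */
struct range { uint64_t min, max; };

/* "r != val" shrinks the range by one iff val lies on an edge.
 * (The verifier never reaches this with min == max == val: comparing
 * two constants makes the branch outcome predicted instead.)
 */
static void refine_jne(struct range *r, uint64_t val)
{
	if (r->min == val)
		r->min++;	/* r != val && r >= val  =>  r > val */
	if (r->max == val)
		r->max--;	/* r != val && r <= val  =>  r < val */
}

int main(void)
{
	struct range a = { 0, 99 };	/* "a < 100", lower bound still unknown */

	refine_jne(&a, 0);	/* fallthrough branch of "jmp xxx if a == 0" */
	printf("[%" PRIu64 ", %" PRIu64 "]\n", a.min, a.max);	/* prints [1, 99] */
	return 0;
}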
From 4ba1d0f23414135e4f426dae4cb5cdc2ce246f89 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 14 Dec 2023 17:13:25 -0800
Subject: bpf: abstract away global subprog arg preparation logic from reg state setup

btf_prepare_func_args() is used to understand expectations and
restrictions on global subprog arguments. But the current implementation
is hard to extend, as it intermixes BTF-based func prototype parsing and
interpretation logic with setting up register state at subprog entry.
Worse still, those registers are not completely set up inside
btf_prepare_func_args(), requiring some more logic later in
do_check_common(), like calling mark_reg_unknown() and similar
initialization operations.

This intermixing of BTF interpretation and register state setup is
problematic. First, it causes duplication of BTF parsing logic between
global subprog verification (to set up the initial state of a global
subprog) and global subprog call site analysis (when we need to check
that whatever is being passed into a global subprog matches
expectations), performed in btf_check_subprog_call(). Given we want to
extend global func arguments with tags later, this duplication is
problematic.

So refactor btf_prepare_func_args() to do only BTF-based func proto and
args parsing, returning high-level argument "expectations" only, with no
regard to specifics of register state. I.e., if it's a context argument,
instead of setting register state to PTR_TO_CTX, we return the
ARG_PTR_TO_CTX enum for that argument as "an argument specification" for
further processing inside do_check_common(). Similarly for SCALAR
arguments, PTR_TO_MEM, etc.

This allows reusing btf_prepare_func_args() in the following patches at
global subprog call site analysis time. It also keeps register setup
code consistently in one place, do_check_common().

Besides all this, we cache this argument spec information inside
env->subprog_info, eliminating the need to redo these potentially
expensive BTF traversals, especially if the BPF program's BTF is big
and/or there are lots of global subprog calls.

Acked-by: Eduard Zingerman
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20231215011334.2307144-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h          |  3 +--
 include/linux/bpf_verifier.h | 16 ++++++++++++++++
 kernel/bpf/btf.c             | 38 ++++++++++++++++++++------------------
 kernel/bpf/verifier.c        | 43 +++++++++++++++++++++++++++----------------
 4 files changed, 64 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7a8d4c81a39a..c050c82cc9a5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2470,8 +2470,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
				struct bpf_reg_state *regs);
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
			   struct bpf_reg_state *regs);
-int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *reg, u32 *nargs);
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
			 struct btf *btf, const struct btf_type *t);
 const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c2819a6579a5..5742e9c0a7b8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -606,6 +606,13 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)

 #define BPF_MAX_SUBPROGS 256

+struct bpf_subprog_arg_info {
+	enum bpf_arg_type arg_type;
+	union {
+		u32 mem_size;
+	};
+};
+
 struct bpf_subprog_info {
	/* 'start' has to be the first field otherwise find_subprog() won't work */
	u32 start; /* insn idx of function entry point */
@@ -617,6 +624,10 @@ struct bpf_subprog_info {
	bool is_cb: 1;
	bool is_async_cb: 1;
	bool is_exception_cb: 1;
+	bool args_cached: 1;
+
+	u8 arg_cnt;
+	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
 };

 struct bpf_verifier_env;
@@ -727,6 +738,11 @@ struct bpf_verifier_env {
	char tmp_str_buf[TMP_STR_BUF_LEN];
 };

+static inline struct
bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) +{ + return &env->subprog_info[subprog]; +} + __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d56433bf8aba..be2104e5f2f5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6948,16 +6948,17 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, return err; } -/* Convert BTF of a function into bpf_reg_state if possible +/* Process BTF of a function to produce high-level expectation of function + * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information + * is cached in subprog info for reuse. * Returns: * EFAULT - there is a verifier bug. Abort verification. * EINVAL - cannot convert BTF. - * 0 - Successfully converted BTF into bpf_reg_state - * (either PTR_TO_CTX or SCALAR_VALUE). + * 0 - Successfully processed BTF and constructed argument expectations. */ -int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs, u32 *arg_cnt) +int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) { + struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; enum bpf_prog_type prog_type = prog->type; @@ -6967,6 +6968,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, u32 i, nargs, btf_id; const char *tname; + if (sub->args_cached) + return 0; + if (!prog->aux->func_info || prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) { bpf_log(log, "Verifier bug\n"); @@ -6990,10 +6994,6 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, } tname = btf_name_by_offset(btf, t->name_off); - if (log->level & BPF_LOG_LEVEL) - bpf_log(log, "Validating %s() func#%d...\n", - tname, subprog); - if (prog->aux->func_info_aux[subprog].unreliable) { bpf_log(log, "Verifier bug in function %s()\n", tname); return -EFAULT; @@ -7013,7 +7013,6 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } - *arg_cnt = nargs; /* check that function returns int, exception cb also requires this */ t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) @@ -7028,24 +7027,24 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, * Only PTR_TO_CTX and SCALAR are supported atm. 
*/ for (i = 0; i < nargs; i++) { - struct bpf_reg_state *reg = ®s[i + 1]; - t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_is_any_enum(t)) { - reg->type = SCALAR_VALUE; + sub->args[i].arg_type = ARG_ANYTHING; continue; } if (btf_type_is_ptr(t)) { + u32 mem_size; + if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { - reg->type = PTR_TO_CTX; + sub->args[i].arg_type = ARG_PTR_TO_CTX; continue; } t = btf_type_skip_modifiers(btf, t->type, NULL); - ref_t = btf_resolve_size(btf, t, ®->mem_size); + ref_t = btf_resolve_size(btf, t, &mem_size); if (IS_ERR(ref_t)) { bpf_log(log, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", @@ -7054,15 +7053,18 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, return -EINVAL; } - reg->type = PTR_TO_MEM | PTR_MAYBE_NULL; - reg->id = ++env->id_gen; - + sub->args[i].arg_type = ARG_PTR_TO_MEM_OR_NULL; + sub->args[i].mem_size = mem_size; continue; } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", i, btf_type_str(t), tname); return -EINVAL; } + + sub->arg_cnt = nargs; + sub->args_cached = true; + return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df1cae459c77..6555785b9f63 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -442,11 +442,6 @@ static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env, return &env->prog->aux->func_info_aux[subprog]; } -static struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) -{ - return &env->subprog_info[subprog]; -} - static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog) { struct bpf_subprog_info *info = subprog_info(env, subprog); @@ -19937,34 +19932,50 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { - u32 nargs; + struct bpf_subprog_info *sub = subprog_info(env, subprog); + const char *sub_name = subprog_name(env, subprog); + struct bpf_subprog_arg_info *arg; + struct bpf_reg_state *reg; - ret = btf_prepare_func_args(env, subprog, regs, &nargs); + verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); + ret = btf_prepare_func_args(env, subprog); if (ret) goto out; + if (subprog_is_exc_cb(env, subprog)) { state->frame[0]->in_exception_callback_fn = true; /* We have already ensured that the callback returns an integer, just * like all global subprogs. We need to determine it only has a single * scalar argument. 
*/ - if (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE) { + if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) { verbose(env, "exception cb only supports single integer argument\n"); ret = -EINVAL; goto out; } } - for (i = BPF_REG_1; i <= BPF_REG_5; i++) { - if (regs[i].type == PTR_TO_CTX) + for (i = BPF_REG_1; i <= sub->arg_cnt; i++) { + arg = &sub->args[i - BPF_REG_1]; + reg = ®s[i]; + + if (arg->arg_type == ARG_PTR_TO_CTX) { + reg->type = PTR_TO_CTX; mark_reg_known_zero(env, regs, i); - else if (regs[i].type == SCALAR_VALUE) + } else if (arg->arg_type == ARG_ANYTHING) { + reg->type = SCALAR_VALUE; mark_reg_unknown(env, regs, i); - else if (base_type(regs[i].type) == PTR_TO_MEM) { - const u32 mem_size = regs[i].mem_size; - + } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { + reg->type = PTR_TO_MEM; + if (arg->arg_type & PTR_MAYBE_NULL) + reg->type |= PTR_MAYBE_NULL; mark_reg_known_zero(env, regs, i); - regs[i].mem_size = mem_size; - regs[i].id = ++env->id_gen; + reg->mem_size = arg->mem_size; + reg->id = ++env->id_gen; + } else { + WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n", + i - BPF_REG_1, arg->arg_type); + ret = -EFAULT; + goto out; } } } else { -- cgit From 5eccd2db42d77e3570619c32d39e39bf486607cf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:26 -0800 Subject: bpf: reuse btf_prepare_func_args() check for main program BTF validation Instead of btf_check_subprog_arg_match(), use btf_prepare_func_args() logic to validate "trustworthiness" of main BPF program's BTF information, if it is present. We ignored results of original BTF check anyway, often times producing confusing and ominously-sounding "reg type unsupported for arg#0 function" message, which has no apparent effect on program correctness and verification process. All the -EFAULT returning sanity checks are already performed in check_btf_info_early(), so there is zero reason to have this duplication of logic between btf_check_subprog_call() and btf_check_subprog_arg_match(). Dropping btf_check_subprog_arg_match() simplifies btf_check_func_arg_match() further removing `bool processing_call` flag. One subtle bit that was done by btf_check_subprog_arg_match() was potentially marking main program's BTF as unreliable. We do this explicitly now with a dedicated simple check, preserving the original behavior, but now based on well factored btf_prepare_func_args() logic. 
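The tolerated mismatch referred to above is the one spelled out in the
comment removed by the diff below: a main program may declare a context
type that doesn't match its program type, as long as the context is never
dereferenced. A sketch of such a program (illustrative BPF C, assuming the
usual libbpf SEC() convention):

/* A socket filter should take struct __sk_buff *, but declaring
 * struct pt_regs * still loads fine because ctx is never
 * dereferenced; its BTF is now simply marked unreliable.
 */
SEC("socket")
int bpf_prog(struct pt_regs *ctx)
{
	return 0;
}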
Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 - kernel/bpf/btf.c | 50 ++-------------------- kernel/bpf/verifier.c | 25 ++++++----- tools/testing/selftests/bpf/prog_tests/log_fixup.c | 4 +- .../selftests/bpf/progs/cgrp_kfunc_failure.c | 2 +- .../selftests/bpf/progs/task_kfunc_failure.c | 2 +- 6 files changed, 19 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c050c82cc9a5..d0d7eff22b8a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2466,8 +2466,6 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf_func_model *m); struct bpf_reg_state; -int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs); int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index be2104e5f2f5..33d9a1c73f6e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6768,8 +6768,7 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - bool ptr_to_mem_ok, - bool processing_call) + bool ptr_to_mem_ok) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); struct bpf_verifier_log *log = &env->log; @@ -6842,7 +6841,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, i, btf_type_str(t)); return -EINVAL; } - } else if (ptr_to_mem_ok && processing_call) { + } else if (ptr_to_mem_ok) { const struct btf_type *resolve_ret; u32 type_size; @@ -6867,55 +6866,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return 0; } -/* Compare BTF of a function declaration with given bpf_reg_state. - * Returns: - * EFAULT - there is a verifier bug. Abort verification. - * EINVAL - there is a type mismatch or BTF is not available. - * 0 - BTF matches with what bpf_reg_state expects. - * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. - */ -int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs) -{ - struct bpf_prog *prog = env->prog; - struct btf *btf = prog->aux->btf; - bool is_global; - u32 btf_id; - int err; - - if (!prog->aux->func_info) - return -EINVAL; - - btf_id = prog->aux->func_info[subprog].type_id; - if (!btf_id) - return -EFAULT; - - if (prog->aux->func_info_aux[subprog].unreliable) - return -EINVAL; - - is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, false); - - /* Compiler optimizations can remove arguments from static functions - * or mismatched type can be passed into a global function. - * In such cases mark the function as unreliable from BTF point of view. - */ - if (err) - prog->aux->func_info_aux[subprog].unreliable = true; - return err; -} - /* Compare BTF of a function call with given bpf_reg_state. * Returns: * EFAULT - there is a verifier bug. Abort verification. * EINVAL - there is a type mismatch or BTF is not available. * 0 - BTF matches with what bpf_reg_state expects. * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. 
- * - * NOTE: the code is duplicated from btf_check_subprog_arg_match() - * because btf_check_func_arg_match() is still doing both. Once that - * function is split in 2, we can call from here btf_check_subprog_arg_match() - * first, and then treat the calling part in a new code path. */ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs) @@ -6937,7 +6893,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, true); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6555785b9f63..c26e9ab5226c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19904,6 +19904,7 @@ static void free_states(struct bpf_verifier_env *env) static int do_check_common(struct bpf_verifier_env *env, int subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); + struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -19930,9 +19931,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) state->first_insn_idx = env->subprog_info[subprog].start; state->last_insn_idx = -1; + regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { - struct bpf_subprog_info *sub = subprog_info(env, subprog); const char *sub_name = subprog_name(env, subprog); struct bpf_subprog_arg_info *arg; struct bpf_reg_state *reg; @@ -19979,21 +19980,19 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) } } } else { + /* if main BPF program has associated BTF info, validate that + * it's matching expected signature, and otherwise mark BTF + * info for main program as unreliable + */ + if (env->prog->aux->func_info_aux) { + ret = btf_prepare_func_args(env, 0); + if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) + env->prog->aux->func_info_aux[0].unreliable = true; + } + /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; mark_reg_known_zero(env, regs, BPF_REG_1); - ret = btf_check_subprog_arg_match(env, subprog, regs); - if (ret == -EFAULT) - /* unlikely verifier bug. abort. - * ret == 0 and ret < 0 are sadly acceptable for - * main() function due to backward compatibility. - * Like socket filter program may be written as: - * int bpf_prog(struct pt_regs *ctx) - * and never dereference that ctx in the program. - * 'struct pt_regs' is a type mismatch for socket - * filter that should be using 'struct __sk_buff'. 
-	 */
-		goto out;
 	}
 
 	ret = do_check(env);
diff --git a/tools/testing/selftests/bpf/prog_tests/log_fixup.c b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
index effd78b2a657..7a3fa2ff567b 100644
--- a/tools/testing/selftests/bpf/prog_tests/log_fixup.c
+++ b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
@@ -169,9 +169,9 @@ void test_log_fixup(void)
 	if (test__start_subtest("bad_core_relo_trunc_none"))
 		bad_core_relo(0, TRUNC_NONE /* full buf */);
 	if (test__start_subtest("bad_core_relo_trunc_partial"))
-		bad_core_relo(300, TRUNC_PARTIAL /* truncate original log a bit */);
+		bad_core_relo(280, TRUNC_PARTIAL /* truncate original log a bit */);
 	if (test__start_subtest("bad_core_relo_trunc_full"))
-		bad_core_relo(210, TRUNC_FULL /* truncate also libbpf's message patch */);
+		bad_core_relo(220, TRUNC_FULL /* truncate also libbpf's message patch */);
 	if (test__start_subtest("bad_core_relo_subprog"))
 		bad_core_relo_subprog();
 	if (test__start_subtest("missing_map"))
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
index 0fa564a5cc5b..9fe9c4a4e8f6 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
@@ -78,7 +78,7 @@ int BPF_PROG(cgrp_kfunc_acquire_fp, struct cgroup *cgrp, const char *path)
 }
 
 SEC("kretprobe/cgroup_destroy_locked")
-__failure __msg("reg type unsupported for arg#0 function")
+__failure __msg("calling kernel function bpf_cgroup_acquire is not allowed")
 int BPF_PROG(cgrp_kfunc_acquire_unsafe_kretprobe, struct cgroup *cgrp)
 {
 	struct cgroup *acquired;
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
index dcdea3127086..ad88a3796ddf 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
@@ -248,7 +248,7 @@ int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 cl
 }
 
 SEC("lsm/task_free")
-__failure __msg("reg type unsupported for arg#0 function")
+__failure __msg("R1 must be a rcu pointer")
 int BPF_PROG(task_kfunc_from_lsm_task_free, struct task_struct *task)
 {
 	struct task_struct *acquired;
-- cgit 

From e26080d0da87f20222ca6712b65f95a856fadee0 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 14 Dec 2023 17:13:27 -0800
Subject: bpf: prepare btf_prepare_func_args() for handling static subprogs

Generalize btf_prepare_func_args() to support both global and static
subprogs. We are going to utilize this property in the next patch, reusing
btf_prepare_func_args() for subprog call logic instead of reparsing BTF
information in a completely separate implementation.

btf_prepare_func_args() now detects whether a subprog is global or static
and makes slight logic adjustments for static func cases, like not failing
fatally (-EFAULT) for conditions that are allowable for static subprogs.

A somewhat subtle (but major!) difference is the handling of pointer
arguments. Both global and static functions need to handle special context
arguments (which are pointers to predefined type names), but static
subprogs give up on any other pointers, falling back to marking the
subprog as "unreliable", disabling the use of BTF type information
altogether. For global functions, though, we are assuming that such
pointers to unrecognized types are just pointers to a fixed-sized memory
region (or error out if the size cannot be established, like for `void *`
pointers).
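As an illustration of that split (a sketch; struct blob is a made-up
type), the same unannotated pointer argument is treated differently purely
based on linkage:

struct blob { char bytes[64]; };

/* global subprog: the arg is assumed to point to fixed-size memory
 * of sizeof(struct blob), possibly NULL */
__noinline int g(struct blob *b)
{
	return b ? b->bytes[0] : 0;
}

/* static subprog: BTF for such an arg is given up on ("unreliable");
 * the verifier falls back to tracking the actual register state at
 * each call site */
static __noinline int s(struct blob *b)
{
	return b->bytes[0];
}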
This patch accommodates these small differences and sets up a stage for refactoring in the next patch, eliminating a separate BTF-based parsing logic in btf_check_func_arg_match(). Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ kernel/bpf/btf.c | 18 +++++++++--------- kernel/bpf/verifier.c | 5 ----- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5742e9c0a7b8..d3ea9ef04767 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -738,6 +738,11 @@ struct bpf_verifier_env { char tmp_str_buf[TMP_STR_BUF_LEN]; }; +static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) +{ + return &env->prog->aux->func_info_aux[subprog]; +} + static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) { return &env->subprog_info[subprog]; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 33d9a1c73f6e..d321340e16f1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6914,6 +6914,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, */ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) { + bool is_global = subprog_aux(env, subprog)->linkage == BTF_FUNC_GLOBAL; struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; @@ -6927,14 +6928,15 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) if (sub->args_cached) return 0; - if (!prog->aux->func_info || - prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) { + if (!prog->aux->func_info) { bpf_log(log, "Verifier bug\n"); return -EFAULT; } btf_id = prog->aux->func_info[subprog].type_id; if (!btf_id) { + if (!is_global) /* not fatal for static funcs */ + return -EINVAL; bpf_log(log, "Global functions need valid BTF\n"); return -EFAULT; } @@ -6990,16 +6992,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) sub->args[i].arg_type = ARG_ANYTHING; continue; } - if (btf_type_is_ptr(t)) { + if (btf_type_is_ptr(t) && btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + sub->args[i].arg_type = ARG_PTR_TO_CTX; + continue; + } + if (is_global && btf_type_is_ptr(t)) { u32 mem_size; - if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { - sub->args[i].arg_type = ARG_PTR_TO_CTX; - continue; - } - t = btf_type_skip_modifiers(btf, t->type, NULL); - ref_t = btf_resolve_size(btf, t, &mem_size); if (IS_ERR(ref_t)) { bpf_log(log, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c26e9ab5226c..6c9ecb86a8d9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -437,11 +437,6 @@ static const char *subprog_name(const struct bpf_verifier_env *env, int subprog) return btf_type_name(env->prog->aux->btf, info->type_id); } -static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env, int subprog) -{ - return &env->prog->aux->func_info_aux[subprog]; -} - static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog) { struct bpf_subprog_info *info = subprog_info(env, subprog); -- cgit From c5a7244759b1eeacc59d0426fb73859afa942d0d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:28 -0800 Subject: bpf: move subprog call logic back to verifier.c Subprog call logic in 
btf_check_subprog_call() currently has both a lot of BTF parsing logic (which is, presumably, what justified putting it into btf.c), but also a bunch of register state checks, some of each utilize deep verifier logic helpers, necessarily exported from verifier.c: check_ptr_off_reg(), check_func_arg_reg_off(), and check_mem_reg(). Going forward, btf_check_subprog_call() will have a minimum of BTF-related logic, but will get more internal verifier logic related to register state manipulation. So move it into verifier.c to minimize amount of verifier-specific logic exposed to btf.c. We do this move before refactoring btf_check_func_arg_match() to preserve as much history post-refactoring as possible. No functional changes. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 - include/linux/bpf_verifier.h | 8 --- kernel/bpf/btf.c | 139 --------------------------------------- kernel/bpf/verifier.c | 153 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 146 insertions(+), 156 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0d7eff22b8a..7671530d6e4e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2466,8 +2466,6 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf_func_model *m); struct bpf_reg_state; -int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d3ea9ef04767..d07d857ca67f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -785,14 +785,6 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno); -int check_func_arg_reg_off(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type); -int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno, u32 mem_size); - /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, struct btf *btf, u32 btf_id) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d321340e16f1..341811bcca53 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6765,145 +6765,6 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr return btf_check_func_type_match(log, btf1, t1, btf2, t2); } -static int btf_check_func_arg_match(struct bpf_verifier_env *env, - const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs, - bool ptr_to_mem_ok) -{ - enum bpf_prog_type prog_type = resolve_prog_type(env->prog); - struct bpf_verifier_log *log = &env->log; - const char *func_name, *ref_tname; - const struct btf_type *t, *ref_t; - const struct btf_param *args; - u32 i, nargs, ref_id; - int ret; - - t = btf_type_by_id(btf, func_id); - if (!t || !btf_type_is_func(t)) { - /* These checks were already done by the verifier while loading - * struct bpf_func_info or in add_kfunc_call(). 
- */ - bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n", - func_id); - return -EFAULT; - } - func_name = btf_name_by_offset(btf, t->name_off); - - t = btf_type_by_id(btf, t->type); - if (!t || !btf_type_is_func_proto(t)) { - bpf_log(log, "Invalid BTF of func %s\n", func_name); - return -EFAULT; - } - args = (const struct btf_param *)(t + 1); - nargs = btf_type_vlen(t); - if (nargs > MAX_BPF_FUNC_REG_ARGS) { - bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs, - MAX_BPF_FUNC_REG_ARGS); - return -EINVAL; - } - - /* check that BTF function arguments match actual types that the - * verifier sees. - */ - for (i = 0; i < nargs; i++) { - enum bpf_arg_type arg_type = ARG_DONTCARE; - u32 regno = i + 1; - struct bpf_reg_state *reg = ®s[regno]; - - t = btf_type_skip_modifiers(btf, args[i].type, NULL); - if (btf_type_is_scalar(t)) { - if (reg->type == SCALAR_VALUE) - continue; - bpf_log(log, "R%d is not a scalar\n", regno); - return -EINVAL; - } - - if (!btf_type_is_ptr(t)) { - bpf_log(log, "Unrecognized arg#%d type %s\n", - i, btf_type_str(t)); - return -EINVAL; - } - - ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); - ref_tname = btf_name_by_offset(btf, ref_t->name_off); - - ret = check_func_arg_reg_off(env, reg, regno, arg_type); - if (ret < 0) - return ret; - - if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { - /* If function expects ctx type in BTF check that caller - * is passing PTR_TO_CTX. - */ - if (reg->type != PTR_TO_CTX) { - bpf_log(log, - "arg#%d expected pointer to ctx, but got %s\n", - i, btf_type_str(t)); - return -EINVAL; - } - } else if (ptr_to_mem_ok) { - const struct btf_type *resolve_ret; - u32 type_size; - - resolve_ret = btf_resolve_size(btf, ref_t, &type_size); - if (IS_ERR(resolve_ret)) { - bpf_log(log, - "arg#%d reference type('%s %s') size cannot be determined: %ld\n", - i, btf_type_str(ref_t), ref_tname, - PTR_ERR(resolve_ret)); - return -EINVAL; - } - - if (check_mem_reg(env, reg, regno, type_size)) - return -EINVAL; - } else { - bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i, - func_name, func_id); - return -EINVAL; - } - } - - return 0; -} - -/* Compare BTF of a function call with given bpf_reg_state. - * Returns: - * EFAULT - there is a verifier bug. Abort verification. - * EINVAL - there is a type mismatch or BTF is not available. - * 0 - BTF matches with what bpf_reg_state expects. - * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. - */ -int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs) -{ - struct bpf_prog *prog = env->prog; - struct btf *btf = prog->aux->btf; - bool is_global; - u32 btf_id; - int err; - - if (!prog->aux->func_info) - return -EINVAL; - - btf_id = prog->aux->func_info[subprog].type_id; - if (!btf_id) - return -EFAULT; - - if (prog->aux->func_info_aux[subprog].unreliable) - return -EINVAL; - - is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global); - - /* Compiler optimizations can remove arguments from static functions - * or mismatched type can be passed into a global function. - * In such cases mark the function as unreliable from BTF point of view. - */ - if (err) - prog->aux->func_info_aux[subprog].unreliable = true; - return err; -} - /* Process BTF of a function to produce high-level expectation of function * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information * is cached in subprog info for reuse. 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6c9ecb86a8d9..f48e49f2d482 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5127,8 +5127,8 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, return 0; } -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +static int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) { return __check_ptr_off_reg(env, reg, regno, false); } @@ -7300,8 +7300,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, return err; } -int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno, u32 mem_size) +static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno, u32 mem_size) { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; @@ -8286,9 +8286,9 @@ reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields) return field; } -int check_func_arg_reg_off(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type) +static int check_func_arg_reg_off(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + enum bpf_arg_type arg_type) { u32 type = reg->type; @@ -9249,6 +9249,145 @@ err_out: return err; } +static int btf_check_func_arg_match(struct bpf_verifier_env *env, + const struct btf *btf, u32 func_id, + struct bpf_reg_state *regs, + bool ptr_to_mem_ok) +{ + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + struct bpf_verifier_log *log = &env->log; + const char *func_name, *ref_tname; + const struct btf_type *t, *ref_t; + const struct btf_param *args; + u32 i, nargs, ref_id; + int ret; + + t = btf_type_by_id(btf, func_id); + if (!t || !btf_type_is_func(t)) { + /* These checks were already done by the verifier while loading + * struct bpf_func_info or in add_kfunc_call(). + */ + bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n", + func_id); + return -EFAULT; + } + func_name = btf_name_by_offset(btf, t->name_off); + + t = btf_type_by_id(btf, t->type); + if (!t || !btf_type_is_func_proto(t)) { + bpf_log(log, "Invalid BTF of func %s\n", func_name); + return -EFAULT; + } + args = (const struct btf_param *)(t + 1); + nargs = btf_type_vlen(t); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs, + MAX_BPF_FUNC_REG_ARGS); + return -EINVAL; + } + + /* check that BTF function arguments match actual types that the + * verifier sees. + */ + for (i = 0; i < nargs; i++) { + enum bpf_arg_type arg_type = ARG_DONTCARE; + u32 regno = i + 1; + struct bpf_reg_state *reg = ®s[regno]; + + t = btf_type_skip_modifiers(btf, args[i].type, NULL); + if (btf_type_is_scalar(t)) { + if (reg->type == SCALAR_VALUE) + continue; + bpf_log(log, "R%d is not a scalar\n", regno); + return -EINVAL; + } + + if (!btf_type_is_ptr(t)) { + bpf_log(log, "Unrecognized arg#%d type %s\n", + i, btf_type_str(t)); + return -EINVAL; + } + + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); + ref_tname = btf_name_by_offset(btf, ref_t->name_off); + + ret = check_func_arg_reg_off(env, reg, regno, arg_type); + if (ret < 0) + return ret; + + if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + /* If function expects ctx type in BTF check that caller + * is passing PTR_TO_CTX. 
+ */ + if (reg->type != PTR_TO_CTX) { + bpf_log(log, + "arg#%d expected pointer to ctx, but got %s\n", + i, btf_type_str(t)); + return -EINVAL; + } + } else if (ptr_to_mem_ok) { + const struct btf_type *resolve_ret; + u32 type_size; + + resolve_ret = btf_resolve_size(btf, ref_t, &type_size); + if (IS_ERR(resolve_ret)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(ref_t), ref_tname, + PTR_ERR(resolve_ret)); + return -EINVAL; + } + + if (check_mem_reg(env, reg, regno, type_size)) + return -EINVAL; + } else { + bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i, + func_name, func_id); + return -EINVAL; + } + } + + return 0; +} + +/* Compare BTF of a function call with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. + */ +static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs) +{ + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + bool is_global; + u32 btf_id; + int err; + + if (!prog->aux->func_info) + return -EINVAL; + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) + return -EFAULT; + + if (prog->aux->func_info_aux[subprog].unreliable) + return -EINVAL; + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global); + + /* Compiler optimizations can remove arguments from static functions + * or mismatched type can be passed into a global function. + * In such cases mark the function as unreliable from BTF point of view. + */ + if (err) + prog->aux->func_info_aux[subprog].unreliable = true; + return err; +} + static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int insn_idx, int subprog, set_callee_state_fn set_callee_state_cb) -- cgit From f18c3d88deedf0defc3e4800341cc7bcaaabcdf9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:29 -0800 Subject: bpf: reuse subprog argument parsing logic for subprog call checks Remove duplicated BTF parsing logic when it comes to subprog call check. Instead, use (potentially cached) results of btf_prepare_func_args() to abstract away expectations of each subprog argument in generic terms (e.g., "this is pointer to context", or "this is a pointer to memory of size X"), and then use those simple high-level argument type expectations to validate actual register states to check if they match expectations. 
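In practice the call-site check now works purely off those cached argument
specs; for example, passing a pointer to a stack object into a global
subprog whose argument was classified as pointer-to-memory is validated
with check_mem_reg() against the cached mem_size. An illustrative BPF C
sketch (names made up):

struct data { int x, y; };

__noinline int process(struct data *d)	/* global subprog */
{
	return d ? d->x + d->y : 0;
}

SEC("tc")
int caller(struct __sk_buff *skb)
{
	struct data d = { 1, 2 };

	/* &d is checked against the cached ARG_PTR_TO_MEM_OR_NULL
	 * spec of sizeof(struct data) bytes */
	return process(&d);
}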
Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 110 ++++++--------------- .../selftests/bpf/progs/test_global_func5.c | 2 +- 2 files changed, 31 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f48e49f2d482..3c9e6d0d77d6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9249,101 +9249,54 @@ err_out: return err; } -static int btf_check_func_arg_match(struct bpf_verifier_env *env, - const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs, - bool ptr_to_mem_ok) +static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, + const struct btf *btf, + struct bpf_reg_state *regs) { - enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_verifier_log *log = &env->log; - const char *func_name, *ref_tname; - const struct btf_type *t, *ref_t; - const struct btf_param *args; - u32 i, nargs, ref_id; + u32 i; int ret; - t = btf_type_by_id(btf, func_id); - if (!t || !btf_type_is_func(t)) { - /* These checks were already done by the verifier while loading - * struct bpf_func_info or in add_kfunc_call(). - */ - bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n", - func_id); - return -EFAULT; - } - func_name = btf_name_by_offset(btf, t->name_off); - - t = btf_type_by_id(btf, t->type); - if (!t || !btf_type_is_func_proto(t)) { - bpf_log(log, "Invalid BTF of func %s\n", func_name); - return -EFAULT; - } - args = (const struct btf_param *)(t + 1); - nargs = btf_type_vlen(t); - if (nargs > MAX_BPF_FUNC_REG_ARGS) { - bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs, - MAX_BPF_FUNC_REG_ARGS); - return -EINVAL; - } + ret = btf_prepare_func_args(env, subprog); + if (ret) + return ret; /* check that BTF function arguments match actual types that the * verifier sees. */ - for (i = 0; i < nargs; i++) { - enum bpf_arg_type arg_type = ARG_DONTCARE; + for (i = 0; i < sub->arg_cnt; i++) { u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; + struct bpf_subprog_arg_info *arg = &sub->args[i]; - t = btf_type_skip_modifiers(btf, args[i].type, NULL); - if (btf_type_is_scalar(t)) { - if (reg->type == SCALAR_VALUE) - continue; - bpf_log(log, "R%d is not a scalar\n", regno); - return -EINVAL; - } - - if (!btf_type_is_ptr(t)) { - bpf_log(log, "Unrecognized arg#%d type %s\n", - i, btf_type_str(t)); - return -EINVAL; - } - - ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); - ref_tname = btf_name_by_offset(btf, ref_t->name_off); - - ret = check_func_arg_reg_off(env, reg, regno, arg_type); - if (ret < 0) - return ret; - - if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + if (arg->arg_type == ARG_ANYTHING) { + if (reg->type != SCALAR_VALUE) { + bpf_log(log, "R%d is not a scalar\n", regno); + return -EINVAL; + } + } else if (arg->arg_type == ARG_PTR_TO_CTX) { + ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); + if (ret < 0) + return ret; /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. 
*/ if (reg->type != PTR_TO_CTX) { - bpf_log(log, - "arg#%d expected pointer to ctx, but got %s\n", - i, btf_type_str(t)); - return -EINVAL; - } - } else if (ptr_to_mem_ok) { - const struct btf_type *resolve_ret; - u32 type_size; - - resolve_ret = btf_resolve_size(btf, ref_t, &type_size); - if (IS_ERR(resolve_ret)) { - bpf_log(log, - "arg#%d reference type('%s %s') size cannot be determined: %ld\n", - i, btf_type_str(ref_t), ref_tname, - PTR_ERR(resolve_ret)); + bpf_log(log, "arg#%d expects pointer to ctx\n", i); return -EINVAL; } + } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { + ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); + if (ret < 0) + return ret; - if (check_mem_reg(env, reg, regno, type_size)) + if (check_mem_reg(env, reg, regno, arg->mem_size)) return -EINVAL; } else { - bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i, - func_name, func_id); - return -EINVAL; + bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n", + i, arg->arg_type); + return -EFAULT; } } @@ -9358,11 +9311,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. */ static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs) + struct bpf_reg_state *regs) { struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; - bool is_global; u32 btf_id; int err; @@ -9376,9 +9328,7 @@ static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, if (prog->aux->func_info_aux[subprog].unreliable) return -EINVAL; - is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global); - + err = btf_check_func_arg_match(env, subprog, btf, regs); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. * In such cases mark the function as unreliable from BTF point of view. diff --git a/tools/testing/selftests/bpf/progs/test_global_func5.c b/tools/testing/selftests/bpf/progs/test_global_func5.c index cc55aedaf82d..257c0569ff98 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func5.c +++ b/tools/testing/selftests/bpf/progs/test_global_func5.c @@ -26,7 +26,7 @@ int f3(int val, struct __sk_buff *skb) } SEC("tc") -__failure __msg("expected pointer to ctx, but got PTR") +__failure __msg("expects pointer to ctx") int global_func5(struct __sk_buff *skb) { return f1(skb) + f2(2, skb) + f3(3, skb); -- cgit From 94e1c70a34523b5e1529e4ec508316acc6a26a2b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:30 -0800 Subject: bpf: support 'arg:xxx' btf_decl_tag-based hints for global subprog args Add support for annotating global BPF subprog arguments to provide more information about expected semantics of the argument. Currently, verifier relies purely on argument's BTF type information, and supports three general use cases: scalar, pointer-to-context, and pointer-to-fixed-size-memory. Scalar and pointer-to-fixed-mem work well in practice and are quite natural to use. But pointer-to-context is a bit problematic, as typical BPF users don't realize that they need to use a special type name to signal to verifier that argument is not just some pointer, but actually a PTR_TO_CTX. Further, even if users do know which type to use, it is limiting in situations where the same BPF program logic is used across few different program types. 
Common case is kprobes, tracepoints, and perf_event programs having a helper to send some data over BPF perf buffer. bpf_perf_event_output() requires `ctx` argument, and so it's quite cumbersome to share such global subprog across few BPF programs of different types, necessitating extra static subprog that is context type-agnostic. Long story short, there is a need to go beyond types and allow users to add hints to global subprog arguments to define expectations. This patch adds such support for two initial special tags: - pointer to context; - non-null qualifier for generic pointer arguments. All of the above came up in practice already and seem generally useful additions. Non-null qualifier is an often requested feature, which currently has to be worked around by having unnecessary NULL checks inside subprogs even if we know that arguments are never NULL. Pointer to context was discussed earlier. As for implementation, we utilize btf_decl_tag attribute and set up an "arg:xxx" convention to specify argument hint. As such: - btf_decl_tag("arg:ctx") is a PTR_TO_CTX hint; - btf_decl_tag("arg:nonnull") marks pointer argument as not allowed to be NULL, making NULL check inside global subprog unnecessary. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 44 ++++++++++++++++++++++++++++++++++++++------ kernel/bpf/verifier.c | 5 ++++- 2 files changed, 42 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 341811bcca53..c0fc8977be97 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6782,7 +6782,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t, *ref_t; + const struct btf_type *t, *ref_t, *fn_t; u32 i, nargs, btf_id; const char *tname; @@ -6802,8 +6802,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return -EFAULT; } - t = btf_type_by_id(btf, btf_id); - if (!t || !btf_type_is_func(t)) { + fn_t = btf_type_by_id(btf, btf_id); + if (!fn_t || !btf_type_is_func(fn_t)) { /* These checks were already done by the verifier while loading * struct bpf_func_info */ @@ -6811,7 +6811,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) subprog); return -EFAULT; } - tname = btf_name_by_offset(btf, t->name_off); + tname = btf_name_by_offset(btf, fn_t->name_off); if (prog->aux->func_info_aux[subprog].unreliable) { bpf_log(log, "Verifier bug in function %s()\n", tname); @@ -6820,7 +6820,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) if (prog_type == BPF_PROG_TYPE_EXT) prog_type = prog->aux->dst_prog->type; - t = btf_type_by_id(btf, t->type); + t = btf_type_by_id(btf, fn_t->type); if (!t || !btf_type_is_func_proto(t)) { bpf_log(log, "Invalid type of function %s()\n", tname); return -EFAULT; @@ -6846,7 +6846,35 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) * Only PTR_TO_CTX and SCALAR are supported atm. 
*/ for (i = 0; i < nargs; i++) { + bool is_nonnull = false; + const char *tag; + t = btf_type_by_id(btf, args[i].type); + + tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:"); + if (IS_ERR(tag) && PTR_ERR(tag) == -ENOENT) { + tag = NULL; + } else if (IS_ERR(tag)) { + bpf_log(log, "arg#%d type's tag fetching failure: %ld\n", i, PTR_ERR(tag)); + return PTR_ERR(tag); + } + /* 'arg:' decl_tag takes precedence over derivation of + * register type from BTF type itself + */ + if (tag) { + /* disallow arg tags in static subprogs */ + if (!is_global) { + bpf_log(log, "arg#%d type tag is not supported in static functions\n", i); + return -EOPNOTSUPP; + } + if (strcmp(tag, "ctx") == 0) { + sub->args[i].arg_type = ARG_PTR_TO_CTX; + continue; + } + if (strcmp(tag, "nonnull") == 0) + is_nonnull = true; + } + while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_is_any_enum(t)) { @@ -6870,10 +6898,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return -EINVAL; } - sub->args[i].arg_type = ARG_PTR_TO_MEM_OR_NULL; + sub->args[i].arg_type = is_nonnull ? ARG_PTR_TO_MEM : ARG_PTR_TO_MEM_OR_NULL; sub->args[i].mem_size = mem_size; continue; } + if (is_nonnull) { + bpf_log(log, "arg#%d marked as non-null, but is not a pointer type\n", i); + return -EINVAL; + } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", i, btf_type_str(t), tname); return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3c9e6d0d77d6..4caabfdc15aa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9290,9 +9290,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); if (ret < 0) return ret; - if (check_mem_reg(env, reg, regno, arg->mem_size)) return -EINVAL; + if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) { + bpf_log(log, "arg#%d is expected to be non-NULL\n", i); + return -EINVAL; + } } else { bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n", i, arg->arg_type); -- cgit From a64bfe618665ea9c722f922cba8c6e3234eac5ac Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:31 -0800 Subject: bpf: add support for passing dynptr pointer to global subprog Add ability to pass a pointer to dynptr into global functions. This allows to have global subprogs that accept and work with generic dynptrs that are created by caller. Dynptr argument is detected based on the name of a struct type, if it's "bpf_dynptr", it's assumed to be a proper dynptr pointer. Both actual struct and forward struct declaration types are supported. This is conceptually exactly the same semantics as bpf_user_ringbuf_drain()'s use of dynptr to pass a variable-sized pointer to ringbuf record. So we heavily rely on CONST_PTR_TO_DYNPTR bits of already existing logic in the verifier. During global subprog validation, we mark such CONST_PTR_TO_DYNPTR as having LOCAL type, as that's the most unassuming type of dynptr and it doesn't have any special helpers that can try to free or acquire extra references (unlike skb, xdp, or ringbuf dynptr). So that seems like a safe "choice" to make from correctness standpoint. It's still possible to pass any type of dynptr to such subprog, though, because generic dynptr helpers, like getting data/slice pointers, read/write memory copying routines, dynptr adjustment and getter routines all work correctly with any type of dynptr. 
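Combined with the decl-tag support added earlier in this series, a global
subprog can now use both features together (a sketch; the raw btf_decl_tag
spelling below is what the kernel parses, with no wrapper macros assumed):

/* The dynptr argument is recognized purely by the "bpf_dynptr"
 * struct type name and is treated inside the subprog as a read-only
 * CONST_PTR_TO_DYNPTR of the unassuming LOCAL type. */
__noinline int consume(struct bpf_dynptr *ptr,
		       long *val __attribute__((btf_decl_tag("arg:nonnull"))))
{
	/* no NULL check needed on val thanks to arg:nonnull */
	return *val;
}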
Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 23 +++++++++++++++++++++++ kernel/bpf/verifier.c | 7 +++++++ 2 files changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c0fc8977be97..51e8b4bee0c8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6765,6 +6765,25 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr return btf_check_func_type_match(log, btf1, t1, btf2, t2); } +static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t) +{ + const char *name; + + t = btf_type_by_id(btf, t->type); /* skip PTR */ + + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + + /* allow either struct or struct forward declaration */ + if (btf_type_is_struct(t) || + (btf_type_is_fwd(t) && btf_type_kflag(t) == 0)) { + name = btf_str_by_offset(btf, t->name_off); + return name && strcmp(name, "bpf_dynptr") == 0; + } + + return false; +} + /* Process BTF of a function to produce high-level expectation of function * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information * is cached in subprog info for reuse. @@ -6885,6 +6904,10 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) sub->args[i].arg_type = ARG_PTR_TO_CTX; continue; } + if (btf_type_is_ptr(t) && btf_is_dynptr_ptr(btf, t)) { + sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY; + continue; + } if (is_global && btf_type_is_ptr(t)) { u32 mem_size; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4caabfdc15aa..f13008d27f35 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9296,6 +9296,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "arg#%d is expected to be non-NULL\n", i); return -EINVAL; } + } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { + ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); + if (ret) + return ret; } else { bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n", i, arg->arg_type); @@ -20052,6 +20056,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) } else if (arg->arg_type == ARG_ANYTHING) { reg->type = SCALAR_VALUE; mark_reg_unknown(env, regs, i); + } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { + /* assume unspecial LOCAL dynptr type */ + __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { reg->type = PTR_TO_MEM; if (arg->arg_type & PTR_MAYBE_NULL) -- cgit From d5cfbdfc96aadc96a2bf6176269b994e2ce31717 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:15 -0500 Subject: ring-buffer: Have ring_buffer_print_page_header() be able to access ring_buffer_iter In order to introduce sub-buffer size per ring buffer, some internal refactoring is needed. As ring_buffer_print_page_header() will depend on the trace_buffer structure, it is moved after the structure definition. 
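For reference, the description emitted by the function is unchanged by the
move; on a typical 64-bit, 4K-page configuration (and assuming signed
char), it renders as follows, derived from the format strings in the diff
below:

	field: u64 timestamp;	offset:0;	size:8;	signed:0;
	field: local_t commit;	offset:8;	size:8;	signed:1;
	field: int overwrite;	offset:8;	size:1;	signed:1;
	field: char data;	offset:16;	size:4080;	signed:1;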
Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-2-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185627.723857541@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 60 +++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f7dc74e45ebf..2400c8e68fd3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -379,36 +379,6 @@ static inline bool test_time_stamp(u64 delta) /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) -int ring_buffer_print_page_header(struct trace_seq *s) -{ - struct buffer_data_page field; - - trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\tsigned:%u;\n", - (unsigned int)sizeof(field.time_stamp), - (unsigned int)is_signed_type(u64)); - - trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit), - (unsigned int)is_signed_type(long)); - - trace_seq_printf(s, "\tfield: int overwrite;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - 1, - (unsigned int)is_signed_type(long)); - - trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE, - (unsigned int)is_signed_type(char)); - - return !trace_seq_has_overflowed(s); -} - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; @@ -556,6 +526,36 @@ struct ring_buffer_iter { int missed_events; }; +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + + trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); + + trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); + + return !trace_seq_has_overflowed(s); +} + static inline void rb_time_read(rb_time_t *t, u64 *ret) { *ret = local64_read(&t->time); -- cgit From 139f84002145d8624f0195fb090b3a7670744a13 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:16 -0500 Subject: ring-buffer: Page size per ring buffer Currently the size of one sub buffer page is global for all buffers and it is hard coded to one system page. In order to introduce configurable ring buffer sub page size, the internal logic should be refactored to work with sub page size per ring buffer. 
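Concretely, the previously global constants become per-buffer fields that
are still initialized to the same defaults. On a 4K-page system (with the
16-byte buffer_data_page header implied by the offsets printed above) the
values work out as in this sketch, mirroring the hunk in
__ring_buffer_alloc() below:

	buffer->subbuf_size   = PAGE_SIZE - BUF_PAGE_HDR_SIZE;           /* 4096 - 16 = 4080 */
	buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2); /* 4080 - 8 = 4072 */
	nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);              /* bytes -> sub-buffers */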
Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-3-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.009147038@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 68 +++++++++++++++++++++++++-------------------- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 59 +++++++++++++++++++++++++++++---------- 5 files changed, 86 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b1b03b2c0f08..ce46218ce46d 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -200,7 +200,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page, struct trace_seq; int ring_buffer_print_entry_header(struct trace_seq *s); -int ring_buffer_print_page_header(struct trace_seq *s); +int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s); enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2400c8e68fd3..d9f656502400 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -374,11 +374,6 @@ static inline bool test_time_stamp(u64 delta) return !!(delta & TS_DELTA_TEST); } -#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) - -/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ -#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; @@ -510,6 +505,9 @@ struct trace_buffer { struct rb_irq_work irq_work; bool time_stamp_abs; + + unsigned int subbuf_size; + unsigned int max_data_size; }; struct ring_buffer_iter { @@ -523,10 +521,11 @@ struct ring_buffer_iter { u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; + size_t event_size; int missed_events; }; -int ring_buffer_print_page_header(struct trace_seq *s) +int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s) { struct buffer_data_page field; @@ -550,7 +549,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE, + (unsigned int)buffer->subbuf_size, (unsigned int)is_signed_type(char)); return !trace_seq_has_overflowed(s); @@ -1625,7 +1624,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + /* Default buffer page size - one system page */ + buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE; + + /* Max payload is buffer page size - header (8bytes) */ + buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2); + + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; @@ -1944,7 +1949,7 @@ static void update_pages_handler(struct work_struct *work) * @size: the new size. * @cpu_id: the cpu buffer to resize * - * Minimum size is 2 * BUF_PAGE_SIZE. + * Minimum size is 2 * buffer->subbuf_size. * * Returns 0 on success and < 0 on failure. 
*/ @@ -1966,7 +1971,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, !cpumask_test_cpu(cpu_id, buffer->cpumask)) return 0; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); /* we need a minimum of two pages */ if (nr_pages < 2) @@ -2213,7 +2218,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) */ barrier(); - if ((iter->head + length) > commit || length > BUF_PAGE_SIZE) + if ((iter->head + length) > commit || length > iter->event_size) /* Writer corrupted the read? */ goto reset; @@ -2446,6 +2451,7 @@ static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { + unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; unsigned long length = info->length; @@ -2454,13 +2460,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * Only the event that crossed the page boundary * must fill the old tail_page with padding. */ - if (tail >= BUF_PAGE_SIZE) { + if (tail >= bsize) { /* * If the page was filled, then we still need * to update the real_end. Reset it to zero * and the reader will ignore it. */ - if (tail == BUF_PAGE_SIZE) + if (tail == bsize) tail_page->real_end = 0; local_sub(length, &tail_page->write); @@ -2488,7 +2494,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * If we are less than the minimum size, we don't need to * worry about it. */ - if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { + if (tail > (bsize - RB_EVNT_MIN_SIZE)) { /* No room for any events */ /* Mark the rest of the page with padding */ @@ -2503,19 +2509,19 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, } /* Put in a discarded event */ - event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; + event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; /* account for padding bytes */ - local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + local_add(bsize - tail, &cpu_buffer->entries_bytes); /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); /* Set write to end of buffer */ - length = (tail + length) - BUF_PAGE_SIZE; + length = (tail + length) - bsize; local_sub(length, &tail_page->write); } @@ -3469,7 +3475,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - info->length; /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) { + if (unlikely(write > cpu_buffer->buffer->subbuf_size)) { check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } @@ -3600,7 +3606,7 @@ rb_reserve_next_event(struct trace_buffer *buffer, if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { add_ts_default = RB_ADD_STAMP_ABSOLUTE; info.length += RB_LEN_TIME_EXTEND; - if (info.length > BUF_MAX_DATA_SIZE) + if (info.length > cpu_buffer->buffer->max_data_size) goto out_fail; } else { add_ts_default = RB_ADD_STAMP_NONE; @@ -3675,7 +3681,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) if (unlikely(atomic_read(&cpu_buffer->record_disabled))) goto out; - if (unlikely(length > BUF_MAX_DATA_SIZE)) + if (unlikely(length > buffer->max_data_size)) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) @@ -3825,7 +3831,7 @@ int ring_buffer_write(struct trace_buffer *buffer, if (atomic_read(&cpu_buffer->record_disabled)) 
goto out; - if (length > BUF_MAX_DATA_SIZE) + if (length > buffer->max_data_size) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) @@ -4405,6 +4411,7 @@ static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; + unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); unsigned long overwrite; unsigned long flags; int nr_loops = 0; @@ -4540,7 +4547,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) #define USECS_WAIT 1000000 for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { /* If the write is past the end of page, a writer is still updating it */ - if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE)) + if (likely(!reader || rb_page_write(reader) <= bsize)) break; udelay(1); @@ -4984,7 +4991,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) return NULL; /* Holds the entire event: data and meta data */ - iter->event = kmalloc(BUF_PAGE_SIZE, flags); + iter->event_size = buffer->subbuf_size; + iter->event = kmalloc(iter->event_size, flags); if (!iter->event) { kfree(iter); return NULL; @@ -5102,14 +5110,14 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) { /* * Earlier, this method returned - * BUF_PAGE_SIZE * buffer->nr_pages + * buffer->subbuf_size * buffer->nr_pages * Since the nr_pages field is now removed, we have converted this to * return the per cpu buffer value. */ if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; - return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; + return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages; } EXPORT_SYMBOL_GPL(ring_buffer_size); @@ -5123,8 +5131,8 @@ unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer) { /* If abs timestamp is requested, events have a timestamp too */ if (ring_buffer_time_stamp_abs(buffer)) - return BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND; - return BUF_MAX_DATA_SIZE; + return buffer->max_data_size - RB_LEN_TIME_EXTEND; + return buffer->max_data_size; } EXPORT_SYMBOL_GPL(ring_buffer_max_event_size); @@ -5730,7 +5738,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* If there is room at the end of the page to save the * missed events, then record it there. */ - if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { + if (buffer->subbuf_size - commit >= sizeof(missed_events)) { memcpy(&bpage->data[commit], &missed_events, sizeof(missed_events)); local_add(RB_MISSED_STORED, &bpage->commit); @@ -5742,8 +5750,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* * This page may be off to user land. Zero it out here. 
*/ - if (commit < BUF_PAGE_SIZE) - memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); + if (commit < buffer->subbuf_size) + memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); out_unlock: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 55dabee4c78b..76dd0a4c8cb5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5018,7 +5018,7 @@ static int tracing_release(struct inode *inode, struct file *file) return 0; } -static int tracing_release_generic_tr(struct inode *inode, struct file *file) +int tracing_release_generic_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79180aed13ee..00f873910c5d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -616,6 +616,7 @@ void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); +int tracing_release_generic_tr(struct inode *inode, struct file *file); int tracing_open_file_tr(struct inode *inode, struct file *filp); int tracing_release_file_tr(struct inode *inode, struct file *filp); int tracing_single_release_file_tr(struct inode *inode, struct file *filp); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b70d03818038..7c364b87352e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1893,9 +1893,9 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } static ssize_t -show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_array *tr = filp->private_data; struct trace_seq *s; int r; @@ -1908,7 +1908,31 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) trace_seq_init(s); - func(s); + ring_buffer_print_page_header(tr->array_buffer.buffer, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); + + kfree(s); + + return r; +} + +static ssize_t +show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + ring_buffer_print_entry_header(s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); @@ -2165,10 +2189,18 @@ static const struct file_operations ftrace_tr_enable_fops = { .release = subsystem_release, }; -static const struct file_operations ftrace_show_header_fops = { - .open = tracing_open_generic, - .read = show_header, +static const struct file_operations ftrace_show_header_page_fops = { + .open = tracing_open_generic_tr, + .read = show_header_page_file, .llseek = default_llseek, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations ftrace_show_header_event_fops = { + .open = tracing_open_generic_tr, + .read = show_header_event_file, + .llseek = default_llseek, + .release = tracing_release_generic_tr, }; static int @@ -3794,17 +3826,16 @@ static int events_callback(const char *name, umode_t *mode, void **data, return 1; } - if (strcmp(name, "header_page") == 0) - *data = ring_buffer_print_page_header; - - else if 
(strcmp(name, "header_event") == 0) - *data = ring_buffer_print_entry_header; + if (strcmp(name, "header_page") == 0) { + *mode = TRACE_MODE_READ; + *fops = &ftrace_show_header_page_fops; - else + } else if (strcmp(name, "header_event") == 0) { + *mode = TRACE_MODE_READ; + *fops = &ftrace_show_header_event_fops; + } else return 0; - *mode = TRACE_MODE_READ; - *fops = &ftrace_show_header_fops; return 1; } -- cgit From 2808e31ec12e5fbe2ae25acc027fcdc67b1fb7f0 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:17 -0500 Subject: ring-buffer: Add interface for configuring trace sub buffer size The trace ring buffer sub page size can be configured, per trace instance. A new ftrace file "buffer_subbuf_order" is added to get and set the size of the ring buffer sub page for current trace instance. The size must be an order of system page size, that's why the new interface works with system page order, instead of absolute page size: 0 means the ring buffer sub page is equal to 1 system page and so forth: 0 - 1 system page 1 - 2 system pages 2 - 4 system pages ... The ring buffer sub page size is limited between 1 and 128 system pages. The default value is 1 system page. New ring buffer APIs are introduced: ring_buffer_subbuf_order_set() ring_buffer_subbuf_order_get() ring_buffer_subbuf_size_get() Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-4-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.298324722@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 4 +++ kernel/trace/ring_buffer.c | 73 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.c | 48 +++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index ce46218ce46d..12573306b889 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -202,6 +202,10 @@ struct trace_seq; int ring_buffer_print_entry_header(struct trace_seq *s); int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s); +int ring_buffer_subbuf_order_get(struct trace_buffer *buffer); +int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order); +int ring_buffer_subbuf_size_get(struct trace_buffer *buffer); + enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d9f656502400..20fc0121735d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -507,6 +507,7 @@ struct trace_buffer { bool time_stamp_abs; unsigned int subbuf_size; + unsigned int subbuf_order; unsigned int max_data_size; }; @@ -5761,6 +5762,78 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); +/** + * ring_buffer_subbuf_size_get - get size of the sub buffer. + * @buffer: the buffer to get the sub buffer size from + * + * Returns size of the sub buffer, in bytes. + */ +int ring_buffer_subbuf_size_get(struct trace_buffer *buffer) +{ + return buffer->subbuf_size + BUF_PAGE_HDR_SIZE; +} +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get); + +/** + * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page. 
+ * @buffer: The ring_buffer to get the system sub page order from + * + * By default, one ring buffer sub page equals one system page. This parameter + * is configurable, per ring buffer. The size of the ring buffer sub page can be + * extended, but must be an order of the system page size. + * + * Returns the order of buffer sub page size, in system pages: + * 0 means the sub buffer size is 1 system page and so forth. + * In case of an error < 0 is returned. + */ +int ring_buffer_subbuf_order_get(struct trace_buffer *buffer) +{ + if (!buffer) + return -EINVAL; + + return buffer->subbuf_order; +} +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get); + +/** + * ring_buffer_subbuf_order_set - set the size of ring buffer sub page. + * @buffer: The ring_buffer to set the new page size. + * @order: Order of the system pages in one sub buffer page + * + * By default, one ring buffer page equals one system page. This API can be + * used to set a new size of the ring buffer page. The size must be an order + * of the system page size, which is why the input parameter @order is the + * order of system pages that are allocated for one ring buffer page: + * 0 - 1 system page + * 1 - 2 system pages + * 2 - 4 system pages + * ... + * + * Returns 0 on success or < 0 in case of an error. + */ +int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) +{ + int psize; + + if (!buffer || order < 0) + return -EINVAL; + + if (buffer->subbuf_order == order) + return 0; + + psize = (1 << order) * PAGE_SIZE; + if (psize <= BUF_PAGE_HDR_SIZE) + return -EINVAL; + + buffer->subbuf_order = order; + buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE; + + /* Todo: reset the buffer with the new page size */ + + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); + /* * We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 76dd0a4c8cb5..a010aba4c4a4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9358,6 +9358,51 @@ static const struct file_operations buffer_percent_fops = { .llseek = default_llseek, }; +static ssize_t +buffer_order_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%d\n", ring_buffer_subbuf_order_get(tr->array_buffer.buffer)); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +buffer_order_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* limit between 1 and 128 system pages */ + if (val < 0 || val > 7) + return -EINVAL; + + ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val); + if (ret) + return ret; + + (*ppos)++; + + return cnt; +} + +static const struct file_operations buffer_order_fops = { + .open = tracing_open_generic_tr, + .read = buffer_order_read, + .write = buffer_order_write, + .release = tracing_release_generic_tr, + .llseek = default_llseek, +}; + static struct dentry *trace_instance_dir; static void @@ -9824,6 +9869,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, tr, &buffer_percent_fops); + trace_create_file("buffer_subbuf_order", TRACE_MODE_WRITE, d_tracer, + tr, &buffer_order_fops); + create_trace_options_dir(tr); #ifdef CONFIG_TRACER_MAX_TRACE -- cgit From f9b94daa542a8d2532f0930f01cd9aec2d19621b Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:18 -0500 Subject: ring-buffer: Set new size of the ring buffer sub page There are two approaches when changing the size of the ring buffer sub page: 1. Destroying all pages and allocating new pages with the new size. 2. Allocating new pages, copying the content of the old pages before destroying them. The first approach is easier, it is selected in the proposed implementation. Changing the ring buffer sub page size is supposed to not happen frequently. Usually, that size should be set only once, when the buffer is not in use yet and is supposed to be empty. 
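[ Editor's note: the order-to-size mapping used by buffer_subbuf_order and ring_buffer_subbuf_order_set() is a plain power-of-two multiplier on the system page size. A standalone sketch, assuming 4K pages; orders 0-7 correspond to the documented 1-128 page range. ]

#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;	/* assumed system page size */

	/* buffer_subbuf_order semantics: order N -> 2^N system pages */
	for (int order = 0; order <= 7; order++)
		printf("order %d -> %3lu system pages (%lu bytes)\n",
		       order, 1UL << order, (1UL << order) * page_size);
	return 0;
}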
Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-5-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.588995543@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 80 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 20fc0121735d..b928bd03dd0e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -332,6 +332,7 @@ struct buffer_page { unsigned read; /* index for next read */ local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ + unsigned order; /* order of the page */ struct buffer_data_page *page; /* Actual data page */ }; @@ -362,7 +363,7 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) static void free_buffer_page(struct buffer_page *bpage) { - free_page((unsigned long)bpage->page); + free_pages((unsigned long)bpage->page, bpage->order); kfree(bpage); } @@ -1460,10 +1461,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0); + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, + cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; bpage->page = page_address(page); + bpage->order = cpu_buffer->buffer->subbuf_order; rb_init_page(bpage->page); if (user_thread && fatal_signal_pending(current)) @@ -1542,7 +1545,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; bpage->page = page_address(page); @@ -1626,6 +1630,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, goto fail_free_buffer; /* Default buffer page size - one system page */ + buffer->subbuf_order = 0; buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE; /* Max payload is buffer page size - header (8bytes) */ @@ -5503,8 +5508,8 @@ void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) if (bpage) goto out; - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, + cpu_buffer->buffer->subbuf_order); if (!page) return ERR_PTR(-ENOMEM); @@ -5553,7 +5558,7 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data local_irq_restore(flags); out: - free_page((unsigned long)bpage); + free_pages((unsigned long)bpage, buffer->subbuf_order); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); @@ -5813,7 +5818,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get); */ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) { + struct ring_buffer_per_cpu **cpu_buffers; + int old_order, old_size; + int nr_pages; int psize; + int bsize; + int err; + int cpu; if (!buffer || order < 0) return -EINVAL; @@ -5825,12 +5836,67 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) if (psize <= BUF_PAGE_HDR_SIZE) return -EINVAL; + bsize = sizeof(void *) * 
buffer->cpus; + cpu_buffers = kzalloc(bsize, GFP_KERNEL); + if (!cpu_buffers) + return -ENOMEM; + + old_order = buffer->subbuf_order; + old_size = buffer->subbuf_size; + + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + atomic_inc(&buffer->record_disabled); + + /* Make sure all commits have finished */ + synchronize_rcu(); + buffer->subbuf_order = order; buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE; - /* Todo: reset the buffer with the new page size */ + /* Make sure all new buffers are allocated, before deleting the old ones */ + for_each_buffer_cpu(buffer, cpu) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + continue; + + nr_pages = buffer->buffers[cpu]->nr_pages; + cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!cpu_buffers[cpu]) { + err = -ENOMEM; + goto error; + } + } + + for_each_buffer_cpu(buffer, cpu) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + continue; + + rb_free_cpu_buffer(buffer->buffers[cpu]); + buffer->buffers[cpu] = cpu_buffers[cpu]; + } + + atomic_dec(&buffer->record_disabled); + mutex_unlock(&buffer->mutex); + + kfree(cpu_buffers); return 0; + +error: + buffer->subbuf_order = old_order; + buffer->subbuf_size = old_size; + + atomic_dec(&buffer->record_disabled); + mutex_unlock(&buffer->mutex); + + for_each_buffer_cpu(buffer, cpu) { + if (!cpu_buffers[cpu]) + continue; + kfree(cpu_buffers[cpu]); + } + kfree(cpu_buffers); + + return err; } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); -- cgit From bce761d757452ba5eb77e11fecc37a04b67494e7 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:19 -0500 Subject: ring-buffer: Read and write to ring buffers with custom sub buffer size As the size of the ring sub buffer page can be changed dynamically, the logic that reads and writes to the buffer should be fixed to take that into account. 
Some internal ring buffer APIs are changed: ring_buffer_alloc_read_page() ring_buffer_free_read_page() ring_buffer_read_page() A new API is introduced: ring_buffer_read_page_data() Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-6-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.875145995@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) [ Fixed kerneldoc on data_page parameter in ring_buffer_free_read_page() ] Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 11 ++++-- kernel/trace/ring_buffer.c | 75 ++++++++++++++++++++++++++---------- kernel/trace/ring_buffer_benchmark.c | 10 +++-- kernel/trace/trace.c | 34 +++++++++------- 4 files changed, 89 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 12573306b889..fa802db216f9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -192,10 +192,15 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer); size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu); size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu); -void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu); -void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data); -int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page, +struct buffer_data_read_page; +struct buffer_data_read_page * +ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu); +void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, + struct buffer_data_read_page *page); +int ring_buffer_read_page(struct trace_buffer *buffer, + struct buffer_data_read_page *data_page, size_t len, int cpu, int full); +void *ring_buffer_read_page_data(struct buffer_data_read_page *page); struct trace_seq; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b928bd03dd0e..c2afcf98ea91 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -318,6 +318,11 @@ struct buffer_data_page { unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ }; +struct buffer_data_read_page { + unsigned order; /* order of the page */ + struct buffer_data_page *data; /* actual data, stored in this page */ +}; + /* * Note, the buffer_page list must be first. 
The buffer pages * are allocated in cache lines, which means that each buffer @@ -5483,40 +5488,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * Returns: * The page allocated, or ERR_PTR */ -void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) +struct buffer_data_read_page * +ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_data_page *bpage = NULL; + struct buffer_data_read_page *bpage = NULL; unsigned long flags; struct page *page; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-ENODEV); + bpage = kzalloc(sizeof(*bpage), GFP_KERNEL); + if (!bpage) + return ERR_PTR(-ENOMEM); + + bpage->order = buffer->subbuf_order; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); if (cpu_buffer->free_page) { - bpage = cpu_buffer->free_page; + bpage->data = cpu_buffer->free_page; cpu_buffer->free_page = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); - if (bpage) + if (bpage->data) goto out; page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, cpu_buffer->buffer->subbuf_order); - if (!page) + if (!page) { + kfree(bpage); return ERR_PTR(-ENOMEM); + } - bpage = page_address(page); + bpage->data = page_address(page); out: - rb_init_page(bpage); + rb_init_page(bpage->data); return bpage; } @@ -5526,14 +5539,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for * @cpu: the cpu buffer the page came from - * @data: the page to free + * @data_page: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. */ -void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data) +void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, + struct buffer_data_read_page *data_page) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_data_page *bpage = data; + struct buffer_data_page *bpage = data_page->data; struct page *page = virt_to_page(bpage); unsigned long flags; @@ -5542,8 +5556,12 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data cpu_buffer = buffer->buffers[cpu]; - /* If the page is still in use someplace else, we can't reuse it */ - if (page_ref_count(page) > 1) + /* + * If the page is still in use someplace else, or order of the page + * is different from the subbuffer order of the buffer - + * we can't reuse it + */ + if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order) goto out; local_irq_save(flags); @@ -5558,7 +5576,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data local_irq_restore(flags); out: - free_pages((unsigned long)bpage, buffer->subbuf_order); + free_pages((unsigned long)bpage, data_page->order); + kfree(data_page); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); @@ -5579,9 +5598,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * rpage = ring_buffer_alloc_read_page(buffer, cpu); * if (IS_ERR(rpage)) * return PTR_ERR(rpage); - * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); + * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0); * if (ret >= 0) - * process_page(rpage, ret); + * process_page(ring_buffer_read_page_data(rpage), ret); + * ring_buffer_free_read_page(buffer, cpu, rpage); * * When @full is set, the function will not return true unless * the writer is off the reader page. 
@@ -5596,7 +5616,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * <0 if no data has been transferred. */ int ring_buffer_read_page(struct trace_buffer *buffer, - void **data_page, size_t len, int cpu, int full) + struct buffer_data_read_page *data_page, + size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; @@ -5621,10 +5642,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer, len -= BUF_PAGE_HDR_SIZE; - if (!data_page) + if (!data_page || !data_page->data) + goto out; + if (data_page->order != buffer->subbuf_order) goto out; - bpage = *data_page; + bpage = data_page->data; if (!bpage) goto out; @@ -5718,11 +5741,11 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* swap the pages */ rb_init_page(bpage); bpage = reader->page; - reader->page = *data_page; + reader->page = data_page->data; local_set(&reader->write, 0); local_set(&reader->entries, 0); reader->read = 0; - *data_page = bpage; + data_page->data = bpage; /* * Use the real_end for the data size, @@ -5767,6 +5790,18 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); +/** + * ring_buffer_read_page_data - get pointer to the data in the page. + * @page: the page to get the data from + * + * Returns pointer to the actual data in this page. + */ +void *ring_buffer_read_page_data(struct buffer_data_read_page *page) +{ + return page->data; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_page_data); + /** * ring_buffer_subbuf_size_get - get size of the sub buffer. * @buffer: the buffer to get the sub buffer size from diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index aef34673d79d..008187ebd7fe 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -104,10 +104,11 @@ static enum event_status read_event(int cpu) static enum event_status read_page(int cpu) { + struct buffer_data_read_page *bpage; struct ring_buffer_event *event; struct rb_page *rpage; unsigned long commit; - void *bpage; + int page_size; int *entry; int ret; int inc; @@ -117,14 +118,15 @@ static enum event_status read_page(int cpu) if (IS_ERR(bpage)) return EVENT_DROPPED; - ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + page_size = ring_buffer_subbuf_size_get(buffer); + ret = ring_buffer_read_page(buffer, bpage, page_size, cpu, 1); if (ret >= 0) { - rpage = bpage; + rpage = ring_buffer_read_page_data(bpage); /* The commit may have missed event flags set, clear them */ commit = local_read(&rpage->commit) & 0xfffff; for (i = 0; i < commit && !test_error ; i += inc) { - if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + if (i >= (page_size - offsetof(struct rb_page, data))) { TEST_ERROR(); break; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a010aba4c4a4..711095aa731d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8286,6 +8286,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; + void *trace_data; + int page_size; ssize_t ret = 0; ssize_t size; @@ -8297,6 +8299,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return -EBUSY; #endif + page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); + if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer, iter->cpu_file); @@ -8311,13 +8315,13 @@ tracing_buffers_read(struct file *filp, 
char __user *ubuf, return ret; /* Do we have previous read data to read? */ - if (info->read < PAGE_SIZE) + if (info->read < page_size) goto read; again: trace_access_lock(iter->cpu_file); ret = ring_buffer_read_page(iter->array_buffer->buffer, - &info->spare, + info->spare, count, iter->cpu_file, 0); trace_access_unlock(iter->cpu_file); @@ -8338,11 +8342,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->read = 0; read: - size = PAGE_SIZE - info->read; + size = page_size - info->read; if (size > count) size = count; - - ret = copy_to_user(ubuf, info->spare + info->read, size); + trace_data = ring_buffer_read_page_data(info->spare); + ret = copy_to_user(ubuf, trace_data + info->read, size); if (ret == size) return -EFAULT; @@ -8453,6 +8457,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; + int page_size; int entries, i; ssize_t ret = 0; @@ -8461,13 +8466,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -EBUSY; #endif - if (*ppos & (PAGE_SIZE - 1)) + page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); + if (*ppos & (page_size - 1)) return -EINVAL; - if (len & (PAGE_SIZE - 1)) { - if (len < PAGE_SIZE) + if (len & (page_size - 1)) { + if (len < page_size) return -EINVAL; - len &= PAGE_MASK; + len &= (~(page_size - 1)); } if (splice_grow_spd(pipe, &spd)) @@ -8477,7 +8483,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); - for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) { struct page *page; int r; @@ -8498,7 +8504,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } ref->cpu = iter->cpu_file; - r = ring_buffer_read_page(ref->buffer, &ref->page, + r = ring_buffer_read_page(ref->buffer, ref->page, len, iter->cpu_file, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->cpu, @@ -8507,14 +8513,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - page = virt_to_page(ref->page); + page = virt_to_page(ring_buffer_read_page_data(ref->page)); spd.pages[i] = page; - spd.partial[i].len = PAGE_SIZE; + spd.partial[i].len = page_size; spd.partial[i].offset = 0; spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; - *ppos += PAGE_SIZE; + *ppos += page_size; entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); } -- cgit From cb665db94fc61512c9c94ed1d42af67e7bf6ce01 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:23 +0100 Subject: tick-sched: Fix function names in comments When referencing functions in comments, it might be helpful to use full function names (including the prefix) to be able to find it when grepping. 
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-2-anna-maria@linutronix.de --- kernel/time/tick-sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index be77b021e5d6..ff25fdff6b7c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -920,11 +920,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) } /* - * nohz_stop_sched_tick() can be called several times before - * nohz_restart_sched_tick() is called. This happens when - * interrupts arrive which do not cause a reschedule. In the - * first call we save the current tick time, so we can restart - * the scheduler tick in nohz_restart_sched_tick(). + * tick_nohz_stop_tick() can be called several times before + * tick_nohz_restart_sched_tick() is called. This happens when + * interrupts arrive which do not cause a reschedule. In the first + * call we save the current tick time, so we can restart the + * scheduler tick in tick_nohz_restart_sched_tick(). */ if (!ts->tick_stopped) { calc_load_nohz_start(); -- cgit From 318050671affa92fd166d988d08d4041c7b113c4 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:24 +0100 Subject: tick/sched: Cleanup confusing variables tick_nohz_stop_tick() contains the variables expires (u64) and tick (ktime_t). At the beginning, the value of expires is written to tick; afterwards neither variable is changed, and both are only used for checks. Drop the unneeded variable tick and always use expires instead. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-3-anna-maria@linutronix.de --- kernel/time/tick-sched.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ff25fdff6b7c..fce3c6f0e4a6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -887,7 +887,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono = ts->timer_expires_base; u64 expires = ts->timer_expires; - ktime_t tick = expires; /* Make sure we won't be trying to stop it twice in a row.
*/ ts->timer_expires_base = 0; @@ -910,7 +909,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) /* Skip reprogram of event if it's not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ - if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) + if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ON_ONCE(1); @@ -935,7 +934,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) trace_tick_stop(1, TICK_DEP_MASK_NONE); } - ts->next_tick = tick; + ts->next_tick = expires; /* * If the expiration time == KTIME_MAX, then we simply stop @@ -950,11 +949,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, tick, + hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { - hrtimer_set_expires(&ts->sched_timer, tick); - tick_program_event(tick, 1); + hrtimer_set_expires(&ts->sched_timer, expires); + tick_program_event(expires, 1); } } -- cgit From cbf04a22026100dceeceec67fcbf1973383eb32f Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:25 +0100 Subject: tick-sched: Warn when next tick seems to be in the past When the next tick is in the past, the delta between basemono and the next tick becomes negative. But the next tick should never be in the past. A wrong next tick may cause the tick to be stopped and timers to expire late. To prevent expensive debugging when changing underlying code, add a WARN_ON_ONCE() to this code path. To prevent complete misbehaviour, also reset next_tick to basemono in this case. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-4-anna-maria@linutronix.de --- kernel/time/tick-sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fce3c6f0e4a6..a17d26002831 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -839,6 +839,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) ts->next_timer = next_tick; } + /* Make sure next_tick is never before basemono! */ + if (WARN_ON_ONCE(basemono > next_tick)) + next_tick = basemono; + /* * If the tick is due in the next period, keep it ticking or * force prod the timer. -- cgit From dbcdcb62b59db2cf6a24113873b90da15c6f0b19 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:26 +0100 Subject: tracing/timers: Enhance timer_start tracepoint When a timer is started, it is enqueued into a bucket of the timer wheel. The bucket expiry is the de facto expiry of the timer, but it is not equal to the timer expiry because of the increasing granularity of buckets in higher levels of the wheel. To be able to figure out in a trace whether a timer expired in time or not, the bucket expiry time is required as well. Add bucket expiry time to the timer_start tracepoint and thereby simplify the arguments.
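[ Editor's note: a back-of-the-envelope sketch of why the bucket expiry can trail the timer expiry. The constants follow the timer wheel's LVL_CLK_SHIFT = 3 scheme and the rounding loosely mirrors the kernel's calc_index(); both are assumptions for illustration, not a copy of the kernel code. ]

#include <stdio.h>

#define LVL_CLK_SHIFT	3
/* Each wheel level is 2^LVL_CLK_SHIFT times coarser than the previous one */
#define LVL_GRAN(lvl)	(1UL << (LVL_CLK_SHIFT * (lvl)))

int main(void)
{
	unsigned long expires = 1000;	/* timer expiry, in jiffies */

	for (int lvl = 0; lvl < 4; lvl++) {
		unsigned long gran = LVL_GRAN(lvl);
		/* Bucket expiry: the expiry rounded up to the next bucket */
		unsigned long bucket_expiry = (expires + gran) / gran * gran;

		printf("level %d: granularity %4lu jiffies -> bucket_expiry %lu\n",
		       lvl, gran, bucket_expiry);
	}
	return 0;
}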
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-5-anna-maria@linutronix.de --- include/trace/events/timer.h | 20 ++++++++++---------- kernel/time/timer.c | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index b4bc2828fa09..99ada928d445 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -46,22 +46,21 @@ DEFINE_EVENT(timer_class, timer_init, /** * timer_start - called when the timer is started - * @timer: pointer to struct timer_list - * @expires: the timers expiry time - * @flags: the timers flags + * @timer: pointer to struct timer_list + * @bucket_expiry: the bucket expiry time */ TRACE_EVENT(timer_start, TP_PROTO(struct timer_list *timer, - unsigned long expires, - unsigned int flags), + unsigned long bucket_expiry), - TP_ARGS(timer, expires, flags), + TP_ARGS(timer, bucket_expiry), TP_STRUCT__entry( __field( void *, timer ) __field( void *, function ) __field( unsigned long, expires ) + __field( unsigned long, bucket_expiry ) __field( unsigned long, now ) __field( unsigned int, flags ) ), @@ -69,15 +68,16 @@ TRACE_EVENT(timer_start, TP_fast_assign( __entry->timer = timer; __entry->function = timer->function; - __entry->expires = expires; + __entry->expires = timer->expires; + __entry->bucket_expiry = bucket_expiry; __entry->now = jiffies; - __entry->flags = flags; + __entry->flags = timer->flags; ), - TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s", + TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s", __entry->timer, __entry->function, __entry->expires, (long)__entry->expires - __entry->now, - __entry->flags & TIMER_CPUMASK, + __entry->bucket_expiry, __entry->flags & TIMER_CPUMASK, __entry->flags >> TIMER_ARRAYSHIFT, decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK)) ); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 63a8ce7177dd..a81d793a43d0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -606,7 +606,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); - trace_timer_start(timer, timer->expires, timer->flags); + trace_timer_start(timer, bucket_expiry); /* * Check whether this is the new first expiring timer. The -- cgit From b573c73101d8786446535b2ab28cbc8907bda9a9 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:27 +0100 Subject: tracing/timers: Add tracepoint for tracking timer base is_idle flag When debugging timer code the timer tracepoints are very important. There is no tracepoint when the is_idle flag of the timer base changes. Instead of always adding manually trace_printk(), add tracepoints which can be easily enabled whenever required. 
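[ Editor's note: the guard pattern the patch applies, emitting the tracepoint only when the is_idle flag actually flips, sketched with toy userspace stand-ins for the kernel API. ]

#include <stdbool.h>
#include <stdio.h>

static bool base_is_idle;

/* Stand-in for the new timer_base_idle tracepoint */
static void trace_timer_base_idle(bool is_idle, unsigned int cpu)
{
	printf("timer_base_idle: is_idle=%d cpu=%u\n", is_idle, cpu);
}

static void set_base_idle(bool idle, unsigned int cpu)
{
	/* Trace only on a state transition, as the patch does */
	if (base_is_idle != idle) {
		base_is_idle = idle;
		trace_timer_base_idle(idle, cpu);
	}
}

int main(void)
{
	set_base_idle(true, 0);		/* traced: false -> true */
	set_base_idle(true, 0);		/* silent: no transition */
	set_base_idle(false, 0);	/* traced: true -> false */
	return 0;
}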
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-6-anna-maria@linutronix.de --- include/trace/events/timer.h | 20 ++++++++++++++++++++ kernel/time/timer.c | 14 +++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 99ada928d445..1ef58a04fc57 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -142,6 +142,26 @@ DEFINE_EVENT(timer_class, timer_cancel, TP_ARGS(timer) ); +TRACE_EVENT(timer_base_idle, + + TP_PROTO(bool is_idle, unsigned int cpu), + + TP_ARGS(is_idle, cpu), + + TP_STRUCT__entry( + __field( bool, is_idle ) + __field( unsigned int, cpu ) + ), + + TP_fast_assign( + __entry->is_idle = is_idle; + __entry->cpu = cpu; + ), + + TP_printk("is_idle=%d cpu=%d", + __entry->is_idle, __entry->cpu) +); + #define decode_clockid(type) \ __print_symbolic(type, \ { CLOCK_REALTIME, "CLOCK_REALTIME" }, \ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a81d793a43d0..ed8d6063d9ef 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1950,7 +1950,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (time_before_eq(nextevt, basej)) { expires = basem; - base->is_idle = false; + if (base->is_idle) { + base->is_idle = false; + trace_timer_base_idle(false, base->cpu); + } } else { if (base->timers_pending) expires = basem + (u64)(nextevt - basej) * TICK_NSEC; @@ -1961,8 +1964,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) * logic is only maintained for the BASE_STD base, deferrable * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) + if ((expires - basem) > TICK_NSEC && !base->is_idle) { base->is_idle = true; + trace_timer_base_idle(true, base->cpu); + } } raw_spin_unlock(&base->lock); @@ -1984,7 +1989,10 @@ void timer_clear_idle(void) * sending the IPI a few instructions smaller for the cost of taking * the lock in the exit from idle path. */ - base->is_idle = false; + if (base->is_idle) { + base->is_idle = false; + trace_timer_base_idle(false, smp_processor_id()); + } } #endif -- cgit From d124c3393e798b1fb142ee728d5c8976d11e722d Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:28 +0100 Subject: timers: Do not IPI for deferrable timers Deferrable timers do not prevent CPU from going idle and are not taken into account on idle path. Sending an IPI to a remote CPU when a new first deferrable timer was enqueued will wake up the remote CPU but nothing will be done regarding the deferrable timers. Drop IPI completely when a new first deferrable timer was enqueued. 
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-7-anna-maria@linutronix.de --- kernel/time/timer.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ed8d6063d9ef..91882059bf3d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -571,18 +571,15 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk, static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { - if (!is_timers_nohz_active()) - return; - /* - * TODO: This wants some optimizing similar to the code below, but we - * will do that when we switch from push to pull for deferrable timers. + * Deferrable timers do not prevent the CPU from entering dynticks and + * are not taken into account on the idle/nohz_full path. An IPI when a + * new deferrable timer is enqueued will wake up the remote CPU but + * nothing will be done with the deferrable timer base. Therefore skip + * the remote IPI for deferrable timers completely. */ - if (timer->flags & TIMER_DEFERRABLE) { - if (tick_nohz_full_cpu(base->cpu)) - wake_up_nohz_cpu(base->cpu); + if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE) return; - } /* * We might have to IPI the remote CPU if the base is idle and the -- cgit From b5e6f59888c7bde3c05f61b3ce06b78a86713fc0 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:29 +0100 Subject: timers: Move store of next event into __next_timer_interrupt() Both call sites of __next_timer_interrupt() store the return value directly in base->next_expiry. Move the store into __next_timer_interrupt() and to make its purpose more clear, rename the function to next_expiry_recalc(). No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-8-anna-maria@linutronix.de --- kernel/time/timer.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 91882059bf3d..490ff8e66fc2 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1800,8 +1800,10 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset, /* * Search the first expiring timer in the various clock levels. Caller must * hold base->lock. + * + * Store next expiry time in base->next_expiry. 
*/ -static unsigned long __next_timer_interrupt(struct timer_base *base) +static void next_expiry_recalc(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; @@ -1867,10 +1869,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk += adj; } + base->next_expiry = next; base->next_expiry_recalc = false; base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); - - return next; } #ifdef CONFIG_NO_HZ_COMMON @@ -1930,7 +1931,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) raw_spin_lock(&base->lock); if (base->next_expiry_recalc) - base->next_expiry = __next_timer_interrupt(base); + next_expiry_recalc(base); nextevt = base->next_expiry; /* @@ -2021,7 +2022,7 @@ static inline void __run_timers(struct timer_base *base) WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); base->clk++; - base->next_expiry = __next_timer_interrupt(base); + next_expiry_recalc(base); while (levels--) expire_timers(base, heads + levels); -- cgit From 8a2c9c7e7848d7f63d38b698209148b5bb4ba7f3 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:30 +0100 Subject: timers: Clarify check in forward_timer_base() The current check whether a forward of the timer base is required can be simplified by using an already existing comparison function which is easier to read. The related comment is outdated and was not updated when the check changed in commit 36cd28a4cdd0 ("timers: Lower base clock forwarding threshold"). Use time_before_eq() for the check and replace the comment by copying the comment from the same check inside get_next_timer_interrupt(). Move the precious information of the outdated comment to the proper place in __run_timers(). No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-9-anna-maria@linutronix.de --- kernel/time/timer.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 490ff8e66fc2..f75f932b128e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -944,11 +944,10 @@ static inline void forward_timer_base(struct timer_base *base) unsigned long jnow = READ_ONCE(jiffies); /* - * No need to forward if we are close enough below jiffies. - * Also while executing timers, base->clk is 1 offset ahead - * of jiffies to avoid endless requeuing to current jiffies. + * Check whether we can forward the base. We can only do that when + * @basej is past base->clk otherwise we might rewind base->clk. */ - if ((long)(jnow - base->clk) < 1) + if (time_before_eq(jnow, base->clk)) return; /* @@ -2021,6 +2020,10 @@ static inline void __run_timers(struct timer_base *base) */ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); + /* + * While executing timers, base->clk is set 1 offset ahead of + * jiffies to avoid endless requeuing to current jiffies. + */ base->clk++; next_expiry_recalc(base); -- cgit From 1e490484aa3af42d4eeffabf96d6a02be69d586b Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:31 +0100 Subject: timers: Split out forward timer base functionality Forwarding timer base is done when the next expiry value is calculated and when a new timer is enqueued. When the next expiry value is calculated the jiffies value is already available and does not need to be reread a second time. 
Split out the forward timer base functionality so it can be used from both contexts: those where jiffies is already known and those where jiffies needs to be read. No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-10-anna-maria@linutronix.de --- kernel/time/timer.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f75f932b128e..5b02e169ab23 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -939,30 +939,34 @@ get_target_base(struct timer_base *base, unsigned tflags) return get_timer_this_cpu_base(tflags); } -static inline void forward_timer_base(struct timer_base *base) +static inline void __forward_timer_base(struct timer_base *base, + unsigned long basej) { - unsigned long jnow = READ_ONCE(jiffies); - /* * Check whether we can forward the base. We can only do that when * @basej is past base->clk otherwise we might rewind base->clk. */ - if (time_before_eq(jnow, base->clk)) + if (time_before_eq(basej, base->clk)) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jnow)) { - base->clk = jnow; + if (time_after(base->next_expiry, basej)) { + base->clk = basej; } else { if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) return; base->clk = base->next_expiry; } + } +static inline void forward_timer_base(struct timer_base *base) +{ + __forward_timer_base(base, READ_ONCE(jiffies)); +} /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means -- cgit From 7a39a5080ef0e3cf233d92165f6a778f08a08244 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:32 +0100 Subject: timers: Use already existing function for forwarding timer base A function for forwarding the timer base already exists, yet the forwarding is also implemented directly in get_next_timer_interrupt(). Remove the code duplication and invoke __forward_timer_base() instead. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-11-anna-maria@linutronix.de --- kernel/time/timer.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 5b02e169ab23..1a73d396101b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1939,15 +1939,9 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) /* * We have a fresh next event. Check whether we can forward the - * base. We can only do that when @basej is past base->clk - * otherwise we might rewind base->clk. + * base. */ - if (time_after(basej, base->clk)) { - if (time_after(nextevt, basej)) - base->clk = basej; - else if (time_after(nextevt, base->clk)) - base->clk = nextevt; - } + __forward_timer_base(base, basej); if (time_before_eq(nextevt, basej)) { expires = basem; -- cgit From bb8caad5083f8fbba70faf41f1d3bab7cf09da6d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Dec 2023 10:26:33 +0100 Subject: timers: Rework idle logic To improve readability of the code, split the base->is_idle calculation and the expires calculation into separate parts. While at it, update the comment about timer base idle marking.
This causes a subtle change in behavior if the next event is just one jiffy ahead and the tick was already stopped: originally, base->is_idle remained true in this situation; now it turns false. This may spare an IPI if a timer is enqueued remotely to an idle CPU that is going to tick on the next jiffy. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-12-anna-maria@linutronix.de --- kernel/time/timer.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1a73d396101b..cf51655add64 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1924,6 +1924,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); u64 expires = KTIME_MAX; unsigned long nextevt; + bool was_idle; /* * Pretend that there is no timer pending if the cpu is offline. @@ -1943,27 +1944,26 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) */ __forward_timer_base(base, basej); - if (time_before_eq(nextevt, basej)) { - expires = basem; - if (base->is_idle) { - base->is_idle = false; - trace_timer_base_idle(false, base->cpu); - } - } else { - if (base->timers_pending) - expires = basem + (u64)(nextevt - basej) * TICK_NSEC; - /* - * If we expect to sleep more than a tick, mark the base idle. - * Also the tick is stopped so any added timer must forward - * the base clk itself to keep granularity small. This idle - * logic is only maintained for the BASE_STD base, deferrable - * timers may still see large granularity skew (by design). - */ - if ((expires - basem) > TICK_NSEC && !base->is_idle) { - base->is_idle = true; - trace_timer_base_idle(true, base->cpu); - } + if (base->timers_pending) { + /* If we missed a tick already, force 0 delta */ + if (time_before(nextevt, basej)) + nextevt = basej; + expires = basem + (u64)(nextevt - basej) * TICK_NSEC; } + + /* + * Base is idle if the next event is more than a tick away. + * + * If the base is marked idle then any timer add operation must forward + * the base clk itself to keep granularity small. This idle logic is + * only maintained for the BASE_STD base, deferrable timers may still + * see large granularity skew (by design). + */ + was_idle = base->is_idle; + base->is_idle = time_after(nextevt, basej + 1); + if (was_idle != base->is_idle) + trace_timer_base_idle(base->is_idle, base->cpu); + raw_spin_unlock(&base->lock); return cmp_next_hrtimer_event(basem, expires); -- cgit From da65f29dada7f7cbbf0d6375b88a0316f5f7d6f5 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 1 Dec 2023 10:26:34 +0100 Subject: timers: Fix nextevt calculation when no timers are pending When no timer is queued into an empty timer base, the next_expiry will not be updated. It was originally calculated as base->clk + NEXT_TIMER_MAX_DELTA. When the timer base stays empty long enough (> NEXT_TIMER_MAX_DELTA), the next_expiry value of the empty base suggests that there is a timer pending soon. This is more of a theoretical problem, but the fix doesn't hurt. Use the base->next_expiry value as nextevt only when timers are pending. Otherwise nextevt will be jiffies + NEXT_TIMER_MAX_DELTA. As all the information is in place, update the next_expiry value of the empty timer base as well.
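[ Editor's note: a toy demonstration of the stale next_expiry problem described above. time_before() and NEXT_TIMER_MAX_DELTA are re-defined locally for a standalone build and are assumed stand-ins, not the kernel headers. ]

#include <stdio.h>

#define time_before(a, b)	((long)((a) - (b)) < 0)
#define NEXT_TIMER_MAX_DELTA	((1UL << 30) - 1)

int main(void)
{
	/* next_expiry computed when the base went empty, long ago */
	unsigned long clk_when_emptied = 1000;
	unsigned long stale_next_expiry = clk_when_emptied + NEXT_TIMER_MAX_DELTA;

	/* jiffies has since advanced past the stale value... */
	unsigned long jiffies_now = stale_next_expiry + 5;

	/* ...so the empty base looks like it has a timer due already */
	printf("stale next_expiry looks %s\n",
	       time_before(stale_next_expiry, jiffies_now) ?
	       "due (bogus pending timer)" : "far in the future");
	return 0;
}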
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20231201092654.34614-13-anna-maria@linutronix.de --- kernel/time/timer.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index cf51655add64..352b161113cd 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1922,8 +1922,8 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA; u64 expires = KTIME_MAX; - unsigned long nextevt; bool was_idle; /* @@ -1936,7 +1936,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) raw_spin_lock(&base->lock); if (base->next_expiry_recalc) next_expiry_recalc(base); - nextevt = base->next_expiry; /* * We have a fresh next event. Check whether we can forward the @@ -1945,10 +1944,20 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) __forward_timer_base(base, basej); if (base->timers_pending) { + nextevt = base->next_expiry; + /* If we missed a tick already, force 0 delta */ if (time_before(nextevt, basej)) nextevt = basej; expires = basem + (u64)(nextevt - basej) * TICK_NSEC; + } else { + /* + * Move next_expiry for the empty base into the future to + * prevent a unnecessary raise of the timer softirq when the + * next_expiry value will be reached even if there is no timer + * pending. + */ + base->next_expiry = nextevt; } /* -- cgit From 7beb82b7d590e51f698b1cf590b0e2785db1c498 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Dec 2023 22:12:26 -0800 Subject: tracing/synthetic: fix kernel-doc warnings scripts/kernel-doc warns about using @args: for variadic arguments to functions. Documentation/doc-guide/kernel-doc.rst says that this should be written as @...: instead, so update the source code to match that, preventing the warnings. 
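For reference, this is the kernel-doc shape the warnings below are about; the function here is hypothetical, only the @...: notation matters:

/**
 * example_event_trace - trace an event with a variable argument list
 * @file: the trace_event_file representing the event
 * @...: variable number of values for the event fields
 *
 * Documenting the variadic arguments as "@args:" makes kernel-doc
 * emit an "Excess function parameter 'args'" warning, because no
 * parameter of that name exists in the prototype; "@...:" is the
 * documented spelling.
 */
int example_event_trace(struct trace_event_file *file, ...);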
trace_events_synth.c:1165: warning: Excess function parameter 'args' description in '__synth_event_gen_cmd_start' trace_events_synth.c:1714: warning: Excess function parameter 'args' description in 'synth_event_trace' Link: https://lore.kernel.org/linux-trace-kernel/20231220061226.30962-1-rdunlap@infradead.org Cc: Mathieu Desnoyers Fixes: 35ca5207c2d11 ("tracing: Add synthetic event command generation functions") Fixes: 8dcc53ad956d2 ("tracing: Add synth_event_trace() and related functions") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Randy Dunlap Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 846e02c0fb59..e7af286af4f1 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1137,7 +1137,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_fields); * @cmd: A pointer to the dynevent_cmd struct representing the new event * @name: The name of the synthetic event * @mod: The module creating the event, NULL if not created from a module - * @args: Variable number of arg (pairs), one pair for each field + * @...: Variable number of arg (pairs), one pair for each field * * NOTE: Users normally won't want to call this function directly, but * rather use the synth_event_gen_cmd_start() wrapper, which @@ -1695,7 +1695,7 @@ __synth_event_trace_end(struct synth_event_trace_state *trace_state) * synth_event_trace - Trace a synthetic event * @file: The trace_event_file representing the synthetic event * @n_vals: The number of values in vals - * @args: Variable number of args containing the event values + * @...: Variable number of args containing the event values * * Trace a synthetic event using the values passed in the variable * argument list. -- cgit From e0f4bd26e29bf6162cdc9dc6fb7522bde7b74d07 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Wed, 20 Dec 2023 08:35:35 +0800 Subject: PM: sleep: Remove obsolete comment from unlock_system_sleep() With the freezer changes introduced by commit f5d39b020809 ("freezer,sched: Rewrite core freezer logic"), the comment in unlock_system_sleep() has become obsolete, there is no need to retain it. Signed-off-by: Kevin Hao Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index f6425ae3e8b0..b1ae9b677d03 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -60,22 +60,6 @@ EXPORT_SYMBOL_GPL(lock_system_sleep); void unlock_system_sleep(unsigned int flags) { - /* - * Don't use freezer_count() because we don't want the call to - * try_to_freeze() here. - * - * Reason: - * Fundamentally, we just don't need it, because freezing condition - * doesn't come into effect until we release the - * system_transition_mutex lock, since the freezer always works with - * system_transition_mutex held. - * - * More importantly, in the case of hibernation, - * unlock_system_sleep() gets called in snapshot_read() and - * snapshot_write() when the freezing condition is still in effect. - * Which means, if we use try_to_freeze() here, it would make them - * enter the refrigerator, thus causing hibernation to lockup. 
- */ if (!(flags & PF_NOFREEZE)) current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); -- cgit From dadce3fbaf10250b35d540caff475ff93b259de0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Dec 2023 22:02:46 -0800 Subject: PM: hibernate: Repair excess function parameter description warning Function swsusp_close() does not have any parameters, so remove the description of parameter @exclusive to prevent this warning. swap.c:1573: warning: Excess function parameter 'exclusive' description in 'swsusp_close' Signed-off-by: Randy Dunlap [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 975e7195573b..6053ddddaf65 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1566,7 +1566,6 @@ put: /** * swsusp_close - close resume device. - * @exclusive: Close the resume device which is exclusively opened. */ void swsusp_close(void) -- cgit From 7ac5c53e00735d183a0f5e2cfce5eeb6c16319f2 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Sat, 16 Dec 2023 21:10:51 +0800 Subject: bpf: Use c->unit_size to select target cache during free At present, the bpf memory allocator uses check_obj_size() to ensure that the ksize() of an allocated pointer is equal to the unit_size of the bpf_mem_cache that was used. Its purpose is to prevent bpf_mem_free() from selecting a bpf_mem_cache whose unit_size differs from that of the bpf_mem_cache used for allocation. But as reported by lkp, the return value of ksize() or kmalloc_size_roundup() may change due to slab merging, leading to the warning in check_obj_size(). The reported warning happened as follows: (1) in bpf_mem_cache_adjust_size(), kmalloc_size_roundup(96) returns the object_size of kmalloc-96 instead of kmalloc-cg-96. The object_size of kmalloc-96 is 96, so size_index for 96 is not adjusted accordingly. (2) the object_size of kmalloc-cg-96 is adjusted from 96 to 128 due to slab merging in __kmem_cache_alias(). For SLAB, SLAB_HWCACHE_ALIGN is enabled by default for kmalloc slabs, so align is 64 and size is 128 for kmalloc-cg-96. SLUB has a similar merge logic, but its object_size will not be changed, because its align is 8 under x86-64. (3) when unit_alloc() does kmalloc_node(96, __GFP_ACCOUNT, node), ksize() returns 128 instead of 96 for the returned pointer. (4) the warning in check_obj_size() is triggered. Considering that slab merging can happen at any time (e.g. when a slab is created by a new module), the following case is also possible: during the initialization of bpf_global_ma, there is no slab merge and ksize() for a 96-byte object returns 96. But after that, a new slab created by a kernel module is merged into kmalloc-cg-96 and the object_size of kmalloc-cg-96 is adjusted from 96 to 128 (which is possible for x86-64 + CONFIG_SLAB, because its alignment requirement is 64 for a 96-byte slab). So sooner or later, when bpf_global_ma frees a 96-byte object that was allocated from a bpf_mem_cache with unit_size=96, bpf_mem_free() will free the pointer through a bpf_mem_cache whose unit_size is 128, because the return value of ksize() has changed. The warning for the mismatch will be triggered again.
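The fix described next relies on the allocator's per-object header: the owning cache is recorded immediately in front of the pointer that is handed out, so freeing never needs to infer a size via ksize(). A user-space analogue of that scheme (illustrative only; HDR_SZ plays the role of the kernel's LLIST_NODE_SZ):

#include <stdio.h>
#include <stdlib.h>

#define HDR_SZ sizeof(void *)

struct cache { size_t unit_size; };

static void *cache_alloc(struct cache *c)
{
	void **p = malloc(HDR_SZ + c->unit_size);

	if (!p)
		return NULL;
	p[0] = c;			/* stash the owner in the header */
	return (char *)p + HDR_SZ;
}

static void cache_free(void *ptr)
{
	void **hdr = (void **)((char *)ptr - HDR_SZ);
	struct cache *c = hdr[0];	/* exact owner, no size inference */

	printf("freeing a %zu-byte object\n", c->unit_size);
	free(hdr);
}

int main(void)
{
	struct cache c96 = { .unit_size = 96 };
	void *obj = cache_alloc(&c96);

	if (obj)
		cache_free(obj);
	return 0;
}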
A feasible fix would be to introduce counterparts of ksize() and kmalloc_size_roundup() that return the actually allocated size instead of a size which may change due to slab merging, but that would introduce an unnecessary dependency on implementation details of the mm subsystem. Since the pointer to the bpf_mem_cache is already saved in the 8-byte area (4-byte on 32-bit hosts) in front of the returned pointer, use the unit_size of the saved bpf_mem_cache to select the target cache instead of inferring the size from the pointer itself. Besides removing the extra dependency on the mm subsystem, this also improves the performance of bpf_mem_free_rcu(), as shown below. Before applying the patch, the performance of bpf_mem_alloc() and bpf_mem_free_rcu() on an 8-CPU VM with one producer is as follows: kmalloc : alloc 11.69 ± 0.28M/s free 29.58 ± 0.93M/s percpu : alloc 14.11 ± 0.52M/s free 14.29 ± 0.99M/s After applying the patch, the performance of bpf_mem_free_rcu() increases by 9% and 146% for kmalloc memory and per-cpu memory respectively: kmalloc: alloc 11.01 ± 0.03M/s free 32.42 ± 0.48M/s percpu: alloc 12.84 ± 0.12M/s free 35.24 ± 0.23M/s After the fix, there is no need to adjust size_index to fix the mismatch between allocation and free, so remove it as well. Also return NULL instead of ZERO_SIZE_PTR for zero-sized allocations in bpf_mem_alloc(), because there is no bpf_mem_cache pointer saved above ZERO_SIZE_PTR. Fixes: 9077fc228f09 ("bpf: Use kmalloc_size_roundup() to adjust size_index") Reported-by: kernel test robot Closes: https://lore.kernel.org/bpf/202310302113.9f8fe705-oliver.sang@intel.com Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231216131052.27621-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 105 ++++++-------------------------------------------- 1 file changed, 11 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 6a51cfe4c2d6..aa0fbf000a12 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -490,27 +490,6 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false); } -static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx) -{ - struct llist_node *first; - unsigned int obj_size; - - first = c->free_llist.first; - if (!first) - return 0; - - if (c->percpu_size) - obj_size = pcpu_alloc_size(((void **)first)[1]); - else - obj_size = ksize(first); - if (obj_size != c->unit_size) { - WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n", - idx, c->percpu_size, obj_size, c->unit_size); - return -EINVAL; - } - return 0; -} - /* When size != 0 bpf_mem_cache for each cpu. * This is typical bpf hash map use case when all elements have equal size.
* @@ -521,10 +500,10 @@ static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx) int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; - int cpu, i, err, unit_size, percpu_size = 0; struct bpf_mem_caches *cc, __percpu *pcc; struct bpf_mem_cache *c, __percpu *pc; struct obj_cgroup *objcg = NULL; + int cpu, i, unit_size, percpu_size = 0; /* room for llist_node and per-cpu pointer */ if (percpu) @@ -560,7 +539,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; - err = 0; #ifdef CONFIG_MEMCG_KMEM objcg = get_obj_cgroup_from_current(); #endif @@ -574,28 +552,12 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c->tgt = c; init_refill_work(c); - /* Another bpf_mem_cache will be used when allocating - * c->unit_size in bpf_mem_alloc(), so doesn't prefill - * for the bpf_mem_cache because these free objects will - * never be used. - */ - if (i != bpf_mem_cache_idx(c->unit_size)) - continue; prefill_mem_cache(c, cpu); - err = check_obj_size(c, i); - if (err) - goto out; } } -out: ma->caches = pcc; - /* refill_work is either zeroed or initialized, so it is safe to - * call irq_work_sync(). - */ - if (err) - bpf_mem_alloc_destroy(ma); - return err; + return 0; } static void drain_mem_cache(struct bpf_mem_cache *c) @@ -869,7 +831,7 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) void *ret; if (!size) - return ZERO_SIZE_PTR; + return NULL; idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ); if (idx < 0) @@ -879,26 +841,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) return !ret ? NULL : ret + LLIST_NODE_SZ; } -static notrace int bpf_mem_free_idx(void *ptr, bool percpu) -{ - size_t size; - - if (percpu) - size = pcpu_alloc_size(*((void **)ptr)); - else - size = ksize(ptr - LLIST_NODE_SZ); - return bpf_mem_cache_idx(size); -} - void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) { + struct bpf_mem_cache *c; int idx; if (!ptr) return; - idx = bpf_mem_free_idx(ptr, ma->percpu); - if (idx < 0) + c = *(void **)(ptr - LLIST_NODE_SZ); + idx = bpf_mem_cache_idx(c->unit_size); + if (WARN_ON_ONCE(idx < 0)) return; unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); @@ -906,13 +859,15 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) { + struct bpf_mem_cache *c; int idx; if (!ptr) return; - idx = bpf_mem_free_idx(ptr, ma->percpu); - if (idx < 0) + c = *(void **)(ptr - LLIST_NODE_SZ); + idx = bpf_mem_cache_idx(c->unit_size); + if (WARN_ON_ONCE(idx < 0)) return; unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr); @@ -986,41 +941,3 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? NULL : ret + LLIST_NODE_SZ; } - -/* The alignment of dynamic per-cpu area is 8, so c->unit_size and the - * actual size of dynamic per-cpu area will always be matched and there is - * no need to adjust size_index for per-cpu allocation. However for the - * simplicity of the implementation, use an unified size_index for both - * kmalloc and per-cpu allocation. 
- */ -static __init int bpf_mem_cache_adjust_size(void) -{ - unsigned int size; - - /* Adjusting the indexes in size_index() according to the object_size - * of underlying slab cache, so bpf_mem_alloc() will select a - * bpf_mem_cache with unit_size equal to the object_size of - * the underlying slab cache. - * - * The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is - * 256-bytes, so only do adjustment for [8-bytes, 192-bytes]. - */ - for (size = 192; size >= 8; size -= 8) { - unsigned int kmalloc_size, index; - - kmalloc_size = kmalloc_size_roundup(size); - if (kmalloc_size == size) - continue; - - if (kmalloc_size <= 192) - index = size_index[(kmalloc_size - 1) / 8]; - else - index = fls(kmalloc_size - 1) - 1; - /* Only overwrite if necessary */ - if (size_index[(size - 1) / 8] != index) - size_index[(size - 1) / 8] = index; - } - - return 0; -} -subsys_initcall(bpf_mem_cache_adjust_size); -- cgit From c1ad12ee0efc07244be37f69311e6f7c4ac98e62 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 23 Oct 2023 13:01:54 +0200 Subject: kexec: fix KEXEC_FILE dependencies The cleanup for the CONFIG_KEXEC Kconfig logic accidentally changed the 'depends on CRYPTO=y' dependency to a plain 'depends on CRYPTO', which causes a link failure when all the crypto support is in a loadable module and kexec_file support is built-in: x86_64-linux-ld: vmlinux.o: in function `__x64_sys_kexec_file_load': (.text+0x32e30a): undefined reference to `crypto_alloc_shash' x86_64-linux-ld: (.text+0x32e58e): undefined reference to `crypto_shash_update' x86_64-linux-ld: (.text+0x32e6ee): undefined reference to `crypto_shash_final' Both s390 and x86 have this problem, while ppc64 and riscv have the correct dependency already. On riscv, the dependency is only used for the purgatory, not for the kexec_file code itself, which may be a bit surprising as it means that with CONFIG_CRYPTO=m, it is possible to enable KEXEC_FILE but then the purgatory code is silently left out. Move this into the common Kconfig.kexec file in a way that is correct everywhere, using the dependency on CRYPTO_SHA256=y only when the purgatory code is available. This requires reversing the dependency between ARCH_SUPPORTS_KEXEC_PURGATORY and KEXEC_FILE, but the effect remains the same, other than making riscv behave like the other ones. On s390, there is an additional dependency on CRYPTO_SHA256_S390, which should technically not be required but gives better performance. Remove this dependency here, noting that it was not present in the initial Kconfig code but was brought in without an explanation in commit 71406883fd357 ("s390/kexec_file: Add kexec_file_load system call"). [arnd@arndb.de: fix riscv build] Link: https://lkml.kernel.org/r/67ddd260-d424-4229-a815-e3fcfb864a77@app.fastmail.com Link: https://lkml.kernel.org/r/20231023110308.1202042-1-arnd@kernel.org Fixes: 6af5138083005 ("x86/kexec: refactor for kernel/Kconfig.kexec") Signed-off-by: Arnd Bergmann Reviewed-by: Eric DeVolder Tested-by: Eric DeVolder Cc: Albert Ou Cc: Alexander Gordeev Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Conor Dooley Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: Herbert Xu Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 4 ++-- arch/riscv/Kconfig | 4 +--- arch/s390/Kconfig | 4 ++-- arch/x86/Kconfig | 4 ++-- kernel/Kconfig.kexec | 1 + 5 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6f105ee4f3cf..1f11a62809f2 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -608,10 +608,10 @@ config ARCH_SUPPORTS_KEXEC def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP) config ARCH_SUPPORTS_KEXEC_FILE - def_bool PPC64 && CRYPTO=y && CRYPTO_SHA256=y + def_bool PPC64 config ARCH_SUPPORTS_KEXEC_PURGATORY - def_bool KEXEC_FILE + def_bool y config ARCH_SELECTS_KEXEC_FILE def_bool y diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 24c1799e2ec4..cd4c9a204d08 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -702,9 +702,7 @@ config ARCH_SELECTS_KEXEC_FILE select KEXEC_ELF config ARCH_SUPPORTS_KEXEC_PURGATORY - def_bool KEXEC_FILE - depends on CRYPTO=y - depends on CRYPTO_SHA256=y + def_bool ARCH_SUPPORTS_KEXEC_FILE config ARCH_SUPPORTS_CRASH_DUMP def_bool y diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 3bec98d20283..d5d8f99d1f25 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -254,13 +254,13 @@ config ARCH_SUPPORTS_KEXEC def_bool y config ARCH_SUPPORTS_KEXEC_FILE - def_bool CRYPTO && CRYPTO_SHA256 && CRYPTO_SHA256_S390 + def_bool y config ARCH_SUPPORTS_KEXEC_SIG def_bool MODULE_SIG_FORMAT config ARCH_SUPPORTS_KEXEC_PURGATORY - def_bool KEXEC_FILE + def_bool y config ARCH_SUPPORTS_CRASH_DUMP def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3762f41bb092..1566748f16c4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2072,7 +2072,7 @@ config ARCH_SUPPORTS_KEXEC def_bool y config ARCH_SUPPORTS_KEXEC_FILE - def_bool X86_64 && CRYPTO && CRYPTO_SHA256 + def_bool X86_64 config ARCH_SELECTS_KEXEC_FILE def_bool y @@ -2080,7 +2080,7 @@ config ARCH_SELECTS_KEXEC_FILE select HAVE_IMA_KEXEC if IMA config ARCH_SUPPORTS_KEXEC_PURGATORY - def_bool KEXEC_FILE + def_bool y config ARCH_SUPPORTS_KEXEC_SIG def_bool y diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 2fd510256604..92120e396008 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -36,6 +36,7 @@ config KEXEC config KEXEC_FILE bool "Enable kexec file based system call" depends on ARCH_SUPPORTS_KEXEC_FILE + depends on CRYPTO_SHA256=y || !ARCH_SUPPORTS_KEXEC_PURGATORY select KEXEC_CORE help This is new version of kexec system call. This system call is -- cgit From e63bde3d9417f8318d6dd0d0fafa35ebf307aabd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 23 Oct 2023 13:01:55 +0200 Subject: kexec: select CRYPTO from KEXEC_FILE instead of depending on it All other users of crypto code use 'select' instead of 'depends on', so do the same thing with KEXEC_FILE for consistency. In practice this makes very little difference as kernels with kexec support are very likely to also include some other feature that already selects both crypto and crypto_sha256, but being consistent here helps for usability as well as to avoid potential circular dependencies. 
This reverts the dependency back to what it was originally before commit 74ca317c26a3f ("kexec: create a new config option CONFIG_KEXEC_FILE for new syscall"), which changed it with the comment "This should be safer as "select" is not recursive", but that appears to have been done in error, as "select" is indeed recursive, and there are no other dependencies that prevent CRYPTO_SHA256 from being selected here. Link: https://lkml.kernel.org/r/20231023110308.1202042-2-arnd@kernel.org Fixes: 74ca317c26a3f ("kexec: create a new config option CONFIG_KEXEC_FILE for new syscall") Signed-off-by: Arnd Bergmann Reviewed-by: Eric DeVolder Tested-by: Eric DeVolder Acked-by: Baoquan He Cc: Herbert Xu Cc: "David S. Miller" Cc: Albert Ou Cc: Alexander Gordeev Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Conor Dooley Cc: Dave Hansen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 92120e396008..946dffa048b7 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -36,7 +36,8 @@ config KEXEC config KEXEC_FILE bool "Enable kexec file based system call" depends on ARCH_SUPPORTS_KEXEC_FILE - depends on CRYPTO_SHA256=y || !ARCH_SUPPORTS_KEXEC_PURGATORY + select CRYPTO + select CRYPTO_SHA256 select KEXEC_CORE help This is new version of kexec system call. This system call is -- cgit From cbc2fe9d9cb226347365753f50d81bc48cc3c52e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 13 Dec 2023 13:57:41 +0800 Subject: kexec_file: add kexec_file flag to control debug printing Patch series "kexec_file: print out debugging message if required", v4. Currently, specifying '-d' on the kexec command will print a lot of debugging information about kexec/kdump loading with the kexec_load interface. However, kexec_file_load prints nothing even though '-d' is specified. It's very inconvenient to debug or analyze kexec/kdump loading when something goes wrong with kexec/kdump itself or a developer wants to check the kexec/kdump loading. In this patchset, a kexec_file flag, KEXEC_FILE_DEBUG, is added and checked in the code. If it's passed in, debugging messages of the kexec_file code will be printed out and can be seen on the console and in dmesg. Otherwise, the debugging messages are printed via pr_debug() as before. Note: **** ===== 1) The code in the kexec-tools utility also needs to be changed to support passing KEXEC_FILE_DEBUG to the kernel when 'kexec -s -d' is specified. The patch link is here: ========= [PATCH] kexec_file: add kexec_file flag to support debug printing http://lists.infradead.org/pipermail/kexec/2023-November/028505.html 2) s390 also has kexec_file code, while I am not sure what debugging information is necessary. So leave it to the s390 developers. Test: **** ==== Testing was done in v1 on x86_64 and arm64. For v4, tested on x86_64 again. And on x86_64, the printed messages look like below: -------------------------------------------------------------- kexec measurement buffer for the loaded kernel at 0x207fffe000.
Loaded purgatory at 0x207fff9000 Loaded boot_param, command line and misc at 0x207fff3000 bufsz=0x1180 memsz=0x1180 Loaded 64bit kernel at 0x207c000000 bufsz=0xc88200 memsz=0x3c4a000 Loaded initrd at 0x2079e79000 bufsz=0x2186280 memsz=0x2186280 Final command line is: root=/dev/mapper/fedora_intel--knightslanding--lb--02-root ro rd.lvm.lv=fedora_intel-knightslanding-lb-02/root console=ttyS0,115200N81 crashkernel=256M E820 memmap: 0000000000000000-000000000009a3ff (1) 000000000009a400-000000000009ffff (2) 00000000000e0000-00000000000fffff (2) 0000000000100000-000000006ff83fff (1) 000000006ff84000-000000007ac50fff (2) ...... 000000207fff6150-000000207fff615f (128) 000000207fff6160-000000207fff714f (1) 000000207fff7150-000000207fff715f (128) 000000207fff7160-000000207fff814f (1) 000000207fff8150-000000207fff815f (128) 000000207fff8160-000000207fffffff (1) nr_segments = 5 segment[0]: buf=0x000000004e5ece74 bufsz=0x211 mem=0x207fffe000 memsz=0x1000 segment[1]: buf=0x000000009e871498 bufsz=0x4000 mem=0x207fff9000 memsz=0x5000 segment[2]: buf=0x00000000d879f1fe bufsz=0x1180 mem=0x207fff3000 memsz=0x2000 segment[3]: buf=0x000000001101cd86 bufsz=0xc88200 mem=0x207c000000 memsz=0x3c4a000 segment[4]: buf=0x00000000c6e38ac7 bufsz=0x2186280 mem=0x2079e79000 memsz=0x2187000 kexec_file_load: type:0, start:0x207fff91a0 head:0x109e004002 flags:0x8 --------------------------------------------------------------------------- This patch (of 7): When specifying 'kexec -c -d', the kexec_load interface prints loading information, e.g. the regions where kernel/initrd/purgatory/cmdline are put, the memmap passed to the second kernel as system RAM ranges, and the full contents of struct kexec_segment, etc. These are very helpful for analyzing or pinpointing what's happening when kexec/kdump itself fails. The debug printing for the kexec_load interface is done in the user space utility kexec-tools. With the kexec_file_load interface, however, 'kexec -s -d' prints nothing, because the kexec_file code is mostly implemented in kernel space and the debug printing functionality is missing. It's not convenient when debugging kexec/kdump loading and jumping with the kexec_file_load interface. Now add KEXEC_FILE_DEBUG to the kexec_file flags to control the debug message printing, and add the global variable kexec_file_dbg_print and the macro kexec_dprintk() to facilitate the printing. This is a preparation; later kexec_dprintk() will be used to replace the existing pr_debug() calls. Once 'kexec -s -d' is specified, kexec/kdump loading information will be printed out. If '-d' is not specified, it falls back to pr_debug().
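The mechanism added here is essentially a printk level chosen at run time. A user-space analogue of the kexec_dprintk() idea shown in the diff below (hedged sketch; uses the GNU ##__VA_ARGS__ extension, as the kernel does):

#include <stdio.h>
#include <stdbool.h>

static bool file_dbg_print;	/* models kexec_file_dbg_print */

/* One call site; the flag decides whether the message is emitted
 * at a visible level or kept debug-only. */
#define model_dprintk(fmt, ...) \
	fprintf(stderr, "%s" fmt, \
		file_dbg_print ? "<info> " : "<debug> ", ##__VA_ARGS__)

int main(void)
{
	file_dbg_print = true;	/* would come from KEXEC_FILE_DEBUG */
	model_dprintk("nr_segments = %d\n", 5);
	return 0;
}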
Link: https://lkml.kernel.org/r/20231213055747.61826-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20231213055747.61826-2-bhe@redhat.com Signed-off-by: Baoquan He Cc: Conor Dooley Cc: Joe Perches Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/kexec.h | 9 ++++++++- include/uapi/linux/kexec.h | 1 + kernel/kexec_core.c | 2 ++ kernel/kexec_file.c | 3 +++ 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 8227455192b7..400cb6c02176 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -403,7 +403,7 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ - KEXEC_FILE_NO_INITRAMFS) + KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; @@ -500,6 +500,13 @@ static inline int crash_hotplug_memory_support(void) { return 0; } static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } #endif +extern bool kexec_file_dbg_print; + +#define kexec_dprintk(fmt, ...) \ + printk("%s" fmt, \ + kexec_file_dbg_print ? KERN_INFO : KERN_DEBUG, \ + ##__VA_ARGS__) + #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 01766dd839b0..c17bb096ea68 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -25,6 +25,7 @@ #define KEXEC_FILE_UNLOAD 0x00000001 #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 +#define KEXEC_FILE_DEBUG 0x00000008 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bc4c096ab1f3..64072acef2b6 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -52,6 +52,8 @@ atomic_t __kexec_lock = ATOMIC_INIT(0); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +bool kexec_file_dbg_print; + int kexec_should_crash(struct task_struct *p) { /* diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ba3ef30921b8..3ee204474de6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -123,6 +123,8 @@ void kimage_file_post_load_cleanup(struct kimage *image) */ kfree(image->image_loader_data); image->image_loader_data = NULL; + + kexec_file_dbg_print = false; } #ifdef CONFIG_KEXEC_SIG @@ -278,6 +280,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, if (!image) return -ENOMEM; + kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG); image->file_mode = 1; if (kexec_on_panic) { -- cgit From a85ee18c7900f001f42082d2fabce4eaf57e655f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 13 Dec 2023 13:57:42 +0800 Subject: kexec_file: print out debugging message if required Then when specifying '-d' for kexec_file_load interface, loaded locations of kernel/initrd/cmdline etc can be printed out to help debug. Here replace pr_debug() with the newly added kexec_dprintk() in kexec_file loading related codes. And also print out type/start/head of kimage and flags to help debug. 
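From user space, the new flag is just another bit in the kexec_file_load() flags word. A hedged sketch of the call shape only, not a working loader: real file descriptors, a valid command line, and CAP_SYS_BOOT are required, and SYS_kexec_file_load must be provided by the libc headers.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef KEXEC_FILE_DEBUG
#define KEXEC_FILE_DEBUG 0x00000008	/* value added by this series */
#endif

int main(void)
{
	const char *cmdline = "console=ttyS0";
	/* -1 fds just to show the shape; a real loader passes open()
	 * fds for the kernel image and the initrd. The length must
	 * include the terminating NUL. */
	long ret = syscall(SYS_kexec_file_load, -1, -1,
			   strlen(cmdline) + 1, cmdline,
			   (unsigned long)KEXEC_FILE_DEBUG);

	if (ret)
		perror("kexec_file_load");
	return 0;
}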
Link: https://lkml.kernel.org/r/20231213055747.61826-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: Conor Dooley Cc: Joe Perches Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- kernel/crash_core.c | 8 +++++--- kernel/kexec_file.c | 11 ++++++++--- security/integrity/ima/ima_kexec.c | 4 ++-- 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index d4313b53837e..c97e825a0fd9 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -551,9 +551,11 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; phdr->p_align = 0; ehdr->e_phnum++; - pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", - phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, - ehdr->e_phnum, phdr->p_offset); +#ifdef CONFIG_KEXEC_FILE + kexec_dprintk("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); +#endif phdr++; } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 3ee204474de6..aca5f3668f4c 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -204,6 +204,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, if (ret < 0) return ret; image->kernel_buf_len = ret; + kexec_dprintk("kernel: %p kernel_size: %#lx\n", + image->kernel_buf, image->kernel_buf_len); /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, @@ -387,13 +389,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + kexec_dprintk("nr_segments = %lu\n", image->nr_segments); for (i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; ksegment = &image->segment[i]; - pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", - i, ksegment->buf, ksegment->bufsz, ksegment->mem, - ksegment->memsz); + kexec_dprintk("segment[%d]: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); ret = kimage_load_segment(image, &image->segment[i]); if (ret) @@ -406,6 +409,8 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + kexec_dprintk("kexec_file_load: type:%u, start:0x%lx head:0x%lx flags:0x%lx\n", + image->type, image->start, image->head, flags); /* * Free up any temporary buffers allocated which are not needed * after image has been loaded diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c index ad133fe120db..dadc1d138118 100644 --- a/security/integrity/ima/ima_kexec.c +++ b/security/integrity/ima/ima_kexec.c @@ -129,8 +129,8 @@ void ima_add_kexec_buffer(struct kimage *image) image->ima_buffer_size = kexec_segment_size; image->ima_buffer = kexec_buffer; - pr_debug("kexec measurement buffer for the loaded kernel at 0x%lx.\n", - kbuf.mem); + kexec_dprintk("kexec measurement buffer for the loaded kernel at 0x%lx.\n", + kbuf.mem); } #endif /* IMA_KEXEC */ -- cgit From a903904c5fa06e8d8472509741f79e8eb25ff864 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Fri, 8 Dec 2023 16:41:15 +0800 Subject: fork: remove redundant TASK_UNINTERRUPTIBLE TASK_KILLABLE already includes TASK_UNINTERRUPTIBLE, so there is no need to add a separate TASK_UNINTERRUPTIBLE. 
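For context, the relevant definitions in include/linux/sched.h (quoted from mainline of this era, lightly abridged) make the redundancy obvious: TASK_KILLABLE already has the TASK_UNINTERRUPTIBLE bit set, so ORing it in again is a no-op.

#define TASK_UNINTERRUPTIBLE	0x00000002
#define TASK_WAKEKILL		0x00000100
#define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)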
Link: https://lkml.kernel.org/r/20231208084115.1973285-1-haokexin@gmail.com Signed-off-by: Kevin Hao Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ce8a4b8c04e2..d71c8ade8f9c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1562,7 +1562,7 @@ static void complete_vfork_done(struct task_struct *tsk) static int wait_for_vfork_done(struct task_struct *child, struct completion *vfork) { - unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE; + unsigned int state = TASK_KILLABLE|TASK_FREEZABLE; int killed; cgroup_enter_frozen(); -- cgit From db6b6fb70193f0defe4d5785e940156c06e9abbe Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Tue, 12 Dec 2023 22:27:06 +0800 Subject: kexec: use ALIGN macro instead of open-coding it Use ALIGN macro instead of open-coding it to improve code readability. Link: https://lkml.kernel.org/r/20231212142706.25149-1-ytcoode@gmail.com Signed-off-by: Yuntao Wang Acked-by: Baoquan He Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 64072acef2b6..6e0f022987ff 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -432,7 +432,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, pages = NULL; size = (1 << order) << PAGE_SHIFT; - hole_start = (image->control_page + (size - 1)) & ~(size - 1); + hole_start = ALIGN(image->control_page, size); hole_end = hole_start + size - 1; while (hole_end <= crashk_res.end) { unsigned long i; @@ -449,7 +449,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, mend = mstart + image->segment[i].memsz - 1; if ((hole_end >= mstart) && (hole_start <= mend)) { /* Advance the hole to the end of the segment */ - hole_start = (mend + (size - 1)) & ~(size - 1); + hole_start = ALIGN(mend, size); hole_end = hole_start + size - 1; break; } -- cgit From 4459cd2e167e7208e57d517d16282408d9035dad Mon Sep 17 00:00:00 2001 From: Wang Jinchao Date: Fri, 15 Dec 2023 16:54:51 +0800 Subject: crash_core: remove duplicated including of kexec.h Remove second include of linux/kexec.h Link: https://lkml.kernel.org/r/202312151654+0800-wangjinchao@xfusion.com Signed-off-by: Wang Jinchao Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index c97e825a0fd9..6f074e112c1e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include -- cgit From a2bef835d39c5c9d611e9420db4221b0eeec9944 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Dec 2023 20:49:12 -0500 Subject: kernel/fork.c: add missing include Signed-off-by: Kent Overstreet --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 10917c3e1f03..319e61297bfb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include -- cgit From d7a73e3f089204aee3393687e23fd45a22657b08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:27:00 -0500 Subject: kernel/numa.c: Move logging out of numa.h Moving these stub functions to a .c file means we can 
kill a sched.h dependency on printk.h. Signed-off-by: Kent Overstreet --- include/linux/numa.h | 19 ++++++------------- kernel/Makefile | 1 + kernel/numa.c | 26 ++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 13 deletions(-) create mode 100644 kernel/numa.c (limited to 'kernel') diff --git a/include/linux/numa.h b/include/linux/numa.h index a904861de800..915033a75731 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NUMA_H #define _LINUX_NUMA_H +#include #include #ifdef CONFIG_NODES_SHIFT @@ -22,34 +23,26 @@ #endif #ifdef CONFIG_NUMA -#include #include /* Generic implementation available */ int numa_nearest_node(int node, unsigned int state); #ifndef memory_add_physaddr_to_nid -static inline int memory_add_physaddr_to_nid(u64 start) -{ - pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} +int memory_add_physaddr_to_nid(u64 start); #endif + #ifndef phys_to_target_node -static inline int phys_to_target_node(u64 start) -{ - pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} +int phys_to_target_node(u64 start); #endif + #ifndef numa_fill_memblks static inline int __init numa_fill_memblks(u64 start, u64 end) { return NUMA_NO_MEMBLK; } #endif + #else /* !CONFIG_NUMA */ static inline int numa_nearest_node(int node, unsigned int state) { diff --git a/kernel/Makefile b/kernel/Makefile index 3947122d618b..ce105a5558fc 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -114,6 +114,7 @@ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/numa.c b/kernel/numa.c new file mode 100644 index 000000000000..67ca6b8585c0 --- /dev/null +++ b/kernel/numa.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include + +/* Stub functions: */ + +#ifndef memory_add_physaddr_to_nid +int memory_add_physaddr_to_nid(u64 start) +{ + pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif + +#ifndef phys_to_target_node +int phys_to_target_node(u64 start) +{ + pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(phys_to_target_node); +#endif -- cgit From f551103cb964e9e6f5c03b3b8723424723731e76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Dec 2023 17:49:24 -0500 Subject: sched.h: move pid helpers to pid.h This is needed for killing the sched.h dependency on rcupdate.h, and pid.h is a better place for this code anyways. 
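Both this move and the numa.c one above follow the same header-trimming pattern, shown here in miniature with invented names: the header keeps a bare declaration, and only the one .c file that defines the stub needs the logging machinery.

/* foo.h -- declaration only; includers no longer pull in printk.h */
int foo_fallback(unsigned long long start);

/* foo.c -- the single translation unit that needs the log helpers */
#include <stdio.h>

int foo_fallback(unsigned long long start)
{
	static int warned;

	/* models pr_info_once(): warn on the first call only */
	if (!warned++)
		fprintf(stderr, "Unknown node for memory at 0x%llx, assuming node 0\n",
			start);
	return 0;
}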
Signed-off-by: Kent Overstreet --- arch/x86/um/sysrq_64.c | 1 + drivers/gpu/drm/lima/lima_ctx.c | 1 + drivers/irqchip/irq-gic-v4.c | 1 + include/linux/pid.h | 125 ++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 122 --------------------------------------- include/linux/sched/signal.h | 1 + ipc/util.h | 1 + kernel/async.c | 5 +- kernel/locking/spinlock_debug.c | 1 + 9 files changed, 134 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c index ef1eb4f4f612..0bf6de40abff 100644 --- a/arch/x86/um/sysrq_64.c +++ b/arch/x86/um/sysrq_64.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include diff --git a/drivers/gpu/drm/lima/lima_ctx.c b/drivers/gpu/drm/lima/lima_ctx.c index 891d5cd5019a..8389f2d7d021 100644 --- a/drivers/gpu/drm/lima/lima_ctx.c +++ b/drivers/gpu/drm/lima/lima_ctx.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR MIT /* Copyright 2018-2019 Qiang Yu */ +#include #include #include "lima_device.h" diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 94d56a03b175..ca32ac19d284 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/pid.h b/include/linux/pid.h index f254c3a45b9b..395cacce1179 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -4,7 +4,9 @@ #include #include +#include #include +#include #include /* @@ -204,4 +206,127 @@ pid_t pid_vnr(struct pid *pid); } \ task = tg___; \ } while_each_pid_task(pid, type, task) + +static inline struct pid *task_pid(struct task_struct *task) +{ + return task->thread_pid; +} + +/* + * the helpers to get the task's different pids as they are seen + * from various namespaces + * + * task_xid_nr() : global id, i.e. the id seen from the init namespace; + * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of + * current. + * task_xid_nr_ns() : id seen from the ns specified; + * + * see also pid_nr() etc in include/linux/pid.h + */ +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); + +static inline pid_t task_pid_nr(struct task_struct *tsk) +{ + return tsk->pid; +} + +static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); +} + +static inline pid_t task_pid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); +} + + +static inline pid_t task_tgid_nr(struct task_struct *tsk) +{ + return tsk->tgid; +} + +/** + * pid_alive - check that a task structure is not stale + * @p: Task structure to be checked. + * + * Test if a process is not yet dead (at most zombie state) + * If pid_alive fails, then pointers within the task structure + * can be stale and must not be dereferenced. + * + * Return: 1 if the process is alive. 0 otherwise. 
+ */ +static inline int pid_alive(const struct task_struct *p) +{ + return p->thread_pid != NULL; +} + +static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); +} + +static inline pid_t task_pgrp_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); +} + + +static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); +} + +static inline pid_t task_session_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); +} + +static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); +} + +static inline pid_t task_tgid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); +} + +static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) +{ + pid_t pid = 0; + + rcu_read_lock(); + if (pid_alive(tsk)) + pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); + rcu_read_unlock(); + + return pid; +} + +static inline pid_t task_ppid_nr(const struct task_struct *tsk) +{ + return task_ppid_nr_ns(tsk, &init_pid_ns); +} + +/* Obsolete, do not use: */ +static inline pid_t task_pgrp_nr(struct task_struct *tsk) +{ + return task_pgrp_nr_ns(tsk, &init_pid_ns); +} + +/** + * is_global_init - check if a task structure is init. Since init + * is free to have sub-threads we need to check tgid. + * @tsk: Task structure to be checked. + * + * Check if a task structure is the first user space task the kernel created. + * + * Return: 1 if the task structure is init. 0 otherwise. + */ +static inline int is_global_init(struct task_struct *tsk) +{ + return task_tgid_nr(tsk) == 1; +} + #endif /* _LINUX_PID_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 37cc9d257073..9e2708c2cfa6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1561,114 +1561,6 @@ struct task_struct { */ }; -static inline struct pid *task_pid(struct task_struct *task) -{ - return task->thread_pid; -} - -/* - * the helpers to get the task's different pids as they are seen - * from various namespaces - * - * task_xid_nr() : global id, i.e. the id seen from the init namespace; - * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of - * current. - * task_xid_nr_ns() : id seen from the ns specified; - * - * see also pid_nr() etc in include/linux/pid.h - */ -pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); - -static inline pid_t task_pid_nr(struct task_struct *tsk) -{ - return tsk->pid; -} - -static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); -} - -static inline pid_t task_pid_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); -} - - -static inline pid_t task_tgid_nr(struct task_struct *tsk) -{ - return tsk->tgid; -} - -/** - * pid_alive - check that a task structure is not stale - * @p: Task structure to be checked. - * - * Test if a process is not yet dead (at most zombie state) - * If pid_alive fails, then pointers within the task structure - * can be stale and must not be dereferenced. - * - * Return: 1 if the process is alive. 0 otherwise. 
- */ -static inline int pid_alive(const struct task_struct *p) -{ - return p->thread_pid != NULL; -} - -static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); -} - -static inline pid_t task_pgrp_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); -} - - -static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); -} - -static inline pid_t task_session_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); -} - -static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); -} - -static inline pid_t task_tgid_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); -} - -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) -{ - pid_t pid = 0; - - rcu_read_lock(); - if (pid_alive(tsk)) - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); - rcu_read_unlock(); - - return pid; -} - -static inline pid_t task_ppid_nr(const struct task_struct *tsk) -{ - return task_ppid_nr_ns(tsk, &init_pid_ns); -} - -/* Obsolete, do not use: */ -static inline pid_t task_pgrp_nr(struct task_struct *tsk) -{ - return task_pgrp_nr_ns(tsk, &init_pid_ns); -} - #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) @@ -1712,20 +1604,6 @@ static inline char task_state_to_char(struct task_struct *tsk) return task_index_to_char(task_state_index(tsk)); } -/** - * is_global_init - check if a task structure is init. Since init - * is free to have sub-threads we need to check tgid. - * @tsk: Task structure to be checked. - * - * Check if a task structure is the first user space task the kernel created. - * - * Return: 1 if the task structure is init. 0 otherwise. - */ -static inline int is_global_init(struct task_struct *tsk) -{ - return task_tgid_nr(tsk) == 1; -} - extern struct pid *cad_pid; /* diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3499c1a8b929..b847d8fa75a9 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/ipc/util.h b/ipc/util.h index 67bdd2aa2c28..a55d6cebe6d3 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -14,6 +14,7 @@ #include #include #include +#include /* * The IPC ID contains 2 separate numbers - index and sequence number. diff --git a/kernel/async.c b/kernel/async.c index b2c4ba5686ee..79f6a3034b1f 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -46,11 +46,12 @@ asynchronous and synchronous parts of the kernel. 
#include #include -#include #include -#include +#include +#include #include #include +#include #include #include "workqueue_internal.h" diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 14235671a1a7..87b03d2e41db 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -12,6 +12,7 @@ #include #include #include +#include void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key, short inner) -- cgit From 8b7787a543cde905e53eaf29172c9472fe8a6a75 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:12:49 -0500 Subject: plist: Split out plist_types.h Trimming down sched.h dependencies: we don't want to include more than the base types. Signed-off-by: Kent Overstreet --- include/linux/plist.h | 12 +----------- include/linux/plist_types.h | 17 +++++++++++++++++ include/linux/sched.h | 2 +- init/init_task.c | 1 + kernel/futex/core.c | 1 + kernel/futex/requeue.c | 1 + kernel/futex/waitwake.c | 1 + mm/swapfile.c | 1 + 8 files changed, 24 insertions(+), 12 deletions(-) create mode 100644 include/linux/plist_types.h (limited to 'kernel') diff --git a/include/linux/plist.h b/include/linux/plist.h index 0f352c1d3c80..8c1c8adf7fe9 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -75,20 +75,10 @@ #include #include -#include +#include #include -struct plist_head { - struct list_head node_list; -}; - -struct plist_node { - int prio; - struct list_head prio_list; - struct list_head node_list; -}; - /** * PLIST_HEAD_INIT - static struct plist_head initializer * @head: struct plist_head variable name diff --git a/include/linux/plist_types.h b/include/linux/plist_types.h new file mode 100644 index 000000000000..c37e784330af --- /dev/null +++ b/include/linux/plist_types.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_PLIST_TYPES_H +#define _LINUX_PLIST_TYPES_H + +#include + +struct plist_head { + struct list_head node_list; +}; + +struct plist_node { + int prio; + struct list_head prio_list; + struct list_head node_list; +}; + +#endif /* _LINUX_PLIST_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9e2708c2cfa6..8c230f24688b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/init/init_task.c b/init/init_task.c index 5727d42149c3..56220898a256 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -12,6 +12,7 @@ #include #include #include +#include #include diff --git a/kernel/futex/core.c b/kernel/futex/core.c index dad981a865b8..e0e853412c15 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index eb21f065816b..b47bb764b352 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include #include #include "futex.h" diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 61b112897a84..3a10375d9521 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include #include #include #include diff --git a/mm/swapfile.c b/mm/swapfile.c index 4bc70f459164..25019af07181 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include -- cgit From 
6dfeff09d5ad331905c7066207053d286d58ac83 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 11 Dec 2023 18:14:41 +0000 Subject: wait: Remove uapi header file from main header file There's really no overlap between uapi/linux/wait.h and linux/wait.h. There are two files which rely on the uapi file being implicitly included, so explicitly include it there and remove it from the main header file. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Kent Overstreet Reviewed-by: Christian Brauner --- include/linux/wait.h | 1 - kernel/exit.c | 4 +++- kernel/pid_namespace.c | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 3473b663176f..8aa3372f21a0 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -9,7 +9,6 @@ #include #include -#include typedef struct wait_queue_entry wait_queue_entry_t; diff --git a/kernel/exit.c b/kernel/exit.c index ee9f43bed49a..2ef33047371b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -69,8 +69,10 @@ #include #include #include - #include + +#include + #include #include diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 3028b2218aa4..7ade20e95232 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "pid_sysctl.h" static DEFINE_MUTEX(pid_caches_mutex); -- cgit From a4aebe936554dac6a91e5d091179c934f8325708 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 19 Dec 2023 15:26:59 -0800 Subject: posix-timers: Get rid of [COMPAT_]SYS_NI() uses Only the posix timer system calls use this (when the posix timer support is disabled, which does not actually happen in any normal case), because they had debug code to print out a warning about missing system calls. Get rid of that special case, and just use the standard COND_SYSCALL interface that creates weak system call stubs that return -ENOSYS when the system call does not exist. This fixes a kCFI issue with the SYS_NI() hackery: CFI failure at int80_emulation+0x67/0xb0 (target: sys_ni_posix_timers+0x0/0x70; expected type: 0xb02b34d9) WARNING: CPU: 0 PID: 48 at int80_emulation+0x67/0xb0 Reported-by: kernel test robot Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Cc: Thomas Gleixner Cc: Dave Hansen Cc: Borislav Petkov Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/syscall_wrapper.h | 4 --- arch/riscv/include/asm/syscall_wrapper.h | 5 ---- arch/s390/include/asm/syscall_wrapper.h | 13 +-------- arch/x86/include/asm/syscall_wrapper.h | 34 +++--------------------- kernel/sys_ni.c | 14 ++++++++++ kernel/time/posix-stubs.c | 45 -------------------------------- 6 files changed, 19 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/include/asm/syscall_wrapper.h b/arch/arm64/include/asm/syscall_wrapper.h index d977713ec0ba..abb57bc54305 100644 --- a/arch/arm64/include/asm/syscall_wrapper.h +++ b/arch/arm64/include/asm/syscall_wrapper.h @@ -44,9 +44,6 @@ return sys_ni_syscall(); \ } -#define COMPAT_SYS_NI(name) \ - SYSCALL_ALIAS(__arm64_compat_sys_##name, sys_ni_posix_timers); - #endif /* CONFIG_COMPAT */ #define __SYSCALL_DEFINEx(x, name, ...)
\ @@ -81,6 +78,5 @@ } asmlinkage long __arm64_sys_ni_syscall(const struct pt_regs *__unused); -#define SYS_NI(name) SYSCALL_ALIAS(__arm64_sys_##name, sys_ni_posix_timers); #endif /* __ASM_SYSCALL_WRAPPER_H */ diff --git a/arch/riscv/include/asm/syscall_wrapper.h b/arch/riscv/include/asm/syscall_wrapper.h index 1d7942c8a6cb..eeec04b7dae6 100644 --- a/arch/riscv/include/asm/syscall_wrapper.h +++ b/arch/riscv/include/asm/syscall_wrapper.h @@ -46,9 +46,6 @@ asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *); return sys_ni_syscall(); \ } -#define COMPAT_SYS_NI(name) \ - SYSCALL_ALIAS(__riscv_compat_sys_##name, sys_ni_posix_timers); - #endif /* CONFIG_COMPAT */ #define __SYSCALL_DEFINEx(x, name, ...) \ @@ -82,6 +79,4 @@ asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *); return sys_ni_syscall(); \ } -#define SYS_NI(name) SYSCALL_ALIAS(__riscv_sys_##name, sys_ni_posix_timers); - #endif /* __ASM_SYSCALL_WRAPPER_H */ diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index 9286430fe729..35c1d1b860d8 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -63,10 +63,6 @@ cond_syscall(__s390x_sys_##name); \ cond_syscall(__s390_sys_##name) -#define SYS_NI(name) \ - SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers); \ - SYSCALL_ALIAS(__s390_sys_##name, sys_ni_posix_timers) - #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ long __s390_compat_sys##name(struct pt_regs *regs); \ ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \ @@ -85,15 +81,11 @@ /* * As some compat syscalls may not be implemented, we need to expand - * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in - * kernel/time/posix-stubs.c to cover this case as well. + * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. */ #define COND_SYSCALL_COMPAT(name) \ cond_syscall(__s390_compat_sys_##name) -#define COMPAT_SYS_NI(name) \ - SYSCALL_ALIAS(__s390_compat_sys_##name, sys_ni_posix_timers) - #define __S390_SYS_STUBx(x, name, ...) \ long __s390_sys##name(struct pt_regs *regs); \ ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ @@ -124,9 +116,6 @@ #define COND_SYSCALL(name) \ cond_syscall(__s390x_sys_##name) -#define SYS_NI(name) \ - SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers) - #define __S390_SYS_STUBx(x, fullname, name, ...) #endif /* CONFIG_COMPAT */ diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index fd2669b1cb2d..21f9407be5d3 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -86,9 +86,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); return sys_ni_syscall(); \ } -#define __SYS_NI(abi, name) \ - SYSCALL_ALIAS(__##abi##_##name, sys_ni_posix_timers); - #ifdef CONFIG_X86_64 #define __X64_SYS_STUB0(name) \ __SYS_STUB0(x64, sys_##name) @@ -100,13 +97,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __X64_COND_SYSCALL(name) \ __COND_SYSCALL(x64, sys_##name) -#define __X64_SYS_NI(name) \ - __SYS_NI(x64, sys_##name) #else /* CONFIG_X86_64 */ #define __X64_SYS_STUB0(name) #define __X64_SYS_STUBx(x, name, ...) 
#define __X64_COND_SYSCALL(name) -#define __X64_SYS_NI(name) #endif /* CONFIG_X86_64 */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) @@ -120,13 +114,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __IA32_COND_SYSCALL(name) \ __COND_SYSCALL(ia32, sys_##name) -#define __IA32_SYS_NI(name) \ - __SYS_NI(ia32, sys_##name) #else /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ #define __IA32_SYS_STUB0(name) #define __IA32_SYS_STUBx(x, name, ...) #define __IA32_COND_SYSCALL(name) -#define __IA32_SYS_NI(name) #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ #ifdef CONFIG_IA32_EMULATION @@ -135,8 +126,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the * ia32 regs in the proper order for shared or "common" syscalls. As some * syscalls may not be implemented, we need to expand COND_SYSCALL in - * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this - * case as well. + * kernel/sys_ni.c to cover this case as well. */ #define __IA32_COMPAT_SYS_STUB0(name) \ __SYS_STUB0(ia32, compat_sys_##name) @@ -148,14 +138,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __IA32_COMPAT_COND_SYSCALL(name) \ __COND_SYSCALL(ia32, compat_sys_##name) -#define __IA32_COMPAT_SYS_NI(name) \ - __SYS_NI(ia32, compat_sys_##name) - #else /* CONFIG_IA32_EMULATION */ #define __IA32_COMPAT_SYS_STUB0(name) #define __IA32_COMPAT_SYS_STUBx(x, name, ...) #define __IA32_COMPAT_COND_SYSCALL(name) -#define __IA32_COMPAT_SYS_NI(name) #endif /* CONFIG_IA32_EMULATION */ @@ -175,13 +161,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __X32_COMPAT_COND_SYSCALL(name) \ __COND_SYSCALL(x64, compat_sys_##name) -#define __X32_COMPAT_SYS_NI(name) \ - __SYS_NI(x64, compat_sys_##name) #else /* CONFIG_X86_X32_ABI */ #define __X32_COMPAT_SYS_STUB0(name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) #define __X32_COMPAT_COND_SYSCALL(name) -#define __X32_COMPAT_SYS_NI(name) #endif /* CONFIG_X86_X32_ABI */ @@ -212,17 +195,12 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); /* * As some compat syscalls may not be implemented, we need to expand - * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in - * kernel/time/posix-stubs.c to cover this case as well. + * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. */ #define COND_SYSCALL_COMPAT(name) \ __IA32_COMPAT_COND_SYSCALL(name) \ __X32_COMPAT_COND_SYSCALL(name) -#define COMPAT_SYS_NI(name) \ - __IA32_COMPAT_SYS_NI(name) \ - __X32_COMPAT_SYS_NI(name) - #endif /* CONFIG_COMPAT */ #define __SYSCALL_DEFINEx(x, name, ...) \ @@ -243,8 +221,8 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * As the generic SYSCALL_DEFINE0() macro does not decode any parameters for * obvious reasons, and passing struct pt_regs *regs to it in %rdi does not * hurt, we only need to re-define it here to keep the naming congruent to - * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI() - * macros to work correctly. + * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() macro + * to work correctly. 
*/ #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ @@ -257,10 +235,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); __X64_COND_SYSCALL(name) \ __IA32_COND_SYSCALL(name) -#define SYS_NI(name) \ - __X64_SYS_NI(name) \ - __IA32_SYS_NI(name) - /* * For VSYSCALLS, we need to declare these three syscalls with the new diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e1a6e3c675c0..9a846439b36a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -201,6 +201,20 @@ COND_SYSCALL(recvmmsg_time32); COND_SYSCALL_COMPAT(recvmmsg_time32); COND_SYSCALL_COMPAT(recvmmsg_time64); +/* Posix timer syscalls may be configured out */ +COND_SYSCALL(timer_create); +COND_SYSCALL(timer_gettime); +COND_SYSCALL(timer_getoverrun); +COND_SYSCALL(timer_settime); +COND_SYSCALL(timer_delete); +COND_SYSCALL(clock_adjtime); +COND_SYSCALL(getitimer); +COND_SYSCALL(setitimer); +COND_SYSCALL(alarm); +COND_SYSCALL_COMPAT(timer_create); +COND_SYSCALL_COMPAT(getitimer); +COND_SYSCALL_COMPAT(setitimer); + /* * Architecture specific syscalls: see further below */ diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 828aeecbd1e8..9b6fcb8d85e7 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -17,40 +17,6 @@ #include #include -#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER -/* Architectures may override SYS_NI and COMPAT_SYS_NI */ -#include -#endif - -asmlinkage long sys_ni_posix_timers(void) -{ - pr_err_once("process %d (%s) attempted a POSIX timer syscall " - "while CONFIG_POSIX_TIMERS is not set\n", - current->pid, current->comm); - return -ENOSYS; -} - -#ifndef SYS_NI -#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) -#endif - -#ifndef COMPAT_SYS_NI -#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) -#endif - -SYS_NI(timer_create); -SYS_NI(timer_gettime); -SYS_NI(timer_getoverrun); -SYS_NI(timer_settime); -SYS_NI(timer_delete); -SYS_NI(clock_adjtime); -SYS_NI(getitimer); -SYS_NI(setitimer); -SYS_NI(clock_adjtime32); -#ifdef __ARCH_WANT_SYS_ALARM -SYS_NI(alarm); -#endif - /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC * as it is easy to remain compatible with little code. CLOCK_BOOTTIME @@ -158,18 +124,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, which_clock); } -#ifdef CONFIG_COMPAT -COMPAT_SYS_NI(timer_create); -#endif - -#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) -COMPAT_SYS_NI(getitimer); -COMPAT_SYS_NI(setitimer); -#endif - #ifdef CONFIG_COMPAT_32BIT_TIME -SYS_NI(timer_settime32); -SYS_NI(timer_gettime32); SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock, struct old_timespec32 __user *, tp) -- cgit From 1bfc466b13cf6652ba227c282c27a30ffede69a5 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 21 Dec 2023 12:01:21 +0300 Subject: watch_queue: fix kcalloc() arguments order When compiling with gcc version 14.0.0 20231220 (experimental) and W=1, I've noticed the following warning: kernel/watch_queue.c: In function 'watch_queue_set_size': kernel/watch_queue.c:273:32: warning: 'kcalloc' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Wcalloc-transposed-args] 273 | pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); | ^~~~~~ Since 'n' and 'size' arguments of 'kcalloc()' are multiplied to calculate the final size, their actual order doesn't affect the result and so this is not a bug. But it's still worth fixing.
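For anyone unfamiliar with the convention the warning enforces: the calloc() family takes the element count first and the element size second, and gcc 14 flags a sizeof() in the count slot as a likely transposition. A minimal userspace analogue (illustrative only, not part of the patch):

  #include <stdlib.h>

  struct page;

  int main(void)
  {
          int nr_pages = 16;

          /* Transposed: same product, but the sizeof sits in the count
           * slot, which is what -Wcalloc-transposed-args keys off. */
          struct page **bad = calloc(sizeof(struct page *), nr_pages);

          /* Conventional order: count first, element size second. */
          struct page **good = calloc(nr_pages, sizeof(struct page *));

          free(bad);
          free(good);
          return 0;
  }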
Signed-off-by: Dmitry Antipov Link: https://lore.kernel.org/r/20231221090139.12579-1-dmantipov@yandex.ru Signed-off-by: Christian Brauner --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 778b4056700f..03b90d7d2175 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -270,7 +270,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) goto error; ret = -ENOMEM; - pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto error; -- cgit From b08c8fc0411dce0fc44b78ce4d67f1b67c35c196 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Dec 2023 14:38:05 +0100 Subject: bpf: Re-support uid and gid when mounting bpffs For a clean, conflict-free revert of the token-related patches in commit d17aff807f84 ("Revert BPF token-related functionality"), the bpf fs commit 750e785796bb ("bpf: Support uid and gid when mounting bpffs") was undone temporarily as well. This patch manually re-adds the functionality from the original one back in 750e785796bb, no other functional changes intended. Testing: # mount -t bpf -o uid=65534,gid=65534 bpffs ./foo # ls -la . | grep foo drwxrwxrwt 2 nobody nogroup 0 Dec 20 13:16 foo # mount -t bpf bpffs on /root/foo type bpf (rw,relatime,uid=65534,gid=65534) Also, passing invalid arguments for uid/gid are properly rejected as expected. Fixes: d17aff807f84 ("Revert BPF token-related functionality") Signed-off-by: Daniel Borkmann Reviewed-by: Christian Brauner Cc: Jie Jiang Cc: Andrii Nakryiko Cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/bpf/20231220133805.20953-1-daniel@iogearbox.net --- kernel/bpf/inode.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 1aafb2ff2e95..41e0a55c35f5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -599,8 +599,15 @@ EXPORT_SYMBOL(bpf_prog_get_type_path); */ static int bpf_show_options(struct seq_file *m, struct dentry *root) { - umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; - + struct inode *inode = d_inode(root); + umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX; + + if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, inode->i_uid)); + if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, inode->i_gid)); if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); return 0; @@ -625,15 +632,21 @@ static const struct super_operations bpf_super_ops = { }; enum { + OPT_UID, + OPT_GID, OPT_MODE, }; static const struct fs_parameter_spec bpf_fs_parameters[] = { + fsparam_u32 ("uid", OPT_UID), + fsparam_u32 ("gid", OPT_GID), fsparam_u32oct ("mode", OPT_MODE), {} }; struct bpf_mount_opts { + kuid_t uid; + kgid_t gid; umode_t mode; }; @@ -641,6 +654,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct bpf_mount_opts *opts = fc->fs_private; struct fs_parse_result result; + kuid_t uid; + kgid_t gid; int opt; opt = fs_parse(fc, bpf_fs_parameters, param, &result); @@ -662,12 +677,42 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) } switch (opt) { + case OPT_UID: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + goto bad_value; + + /* + 
* The requested uid must be representable in the + * filesystem's idmapping. + */ + if (!kuid_has_mapping(fc->user_ns, uid)) + goto bad_value; + + opts->uid = uid; + break; + case OPT_GID: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + goto bad_value; + + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fc->user_ns, gid)) + goto bad_value; + + opts->gid = gid; + break; case OPT_MODE: opts->mode = result.uint_32 & S_IALLUGO; break; } return 0; +bad_value: + return invalfc(fc, "Bad value for '%s'", param->key); } struct bpf_preload_ops *bpf_preload_ops; @@ -750,6 +795,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &bpf_super_ops; inode = sb->s_root->d_inode; + inode->i_uid = opts->uid; + inode->i_gid = opts->gid; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; populate_bpffs(sb->s_root); @@ -785,6 +832,8 @@ static int bpf_init_fs_context(struct fs_context *fc) return -ENOMEM; opts->mode = S_IRWXUGO; + opts->uid = current_fsuid(); + opts->gid = current_fsgid(); fc->fs_private = opts; fc->ops = &bpf_context_ops; -- cgit From 88b30c7f5d27e1594d70dc2bd7199b18f2b57fa9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 20 Dec 2023 11:15:25 -0500 Subject: tracing / synthetic: Disable events after testing in synth_event_gen_test_init() The synth_event_gen_test module can be built in, if someone wants to run the tests at boot up and not have to load them. The synth_event_gen_test_init() function creates and enables the synthetic events and runs its tests. The synth_event_gen_test_exit() disables the events it created and destroys the events. If the module is builtin, the events are never disabled. The issue is, the events should be disable after the tests are run. This could be an issue if the rest of the boot up tests are enabled, as they expect the events to be in a known state before testing. That known state happens to be disabled. When CONFIG_SYNTH_EVENT_GEN_TEST=y and CONFIG_EVENT_TRACE_STARTUP_TEST=y a warning will trigger: Running tests on trace events: Testing event create_synth_test: Enabled event during self test! ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1 at kernel/trace/trace_events.c:4150 event_trace_self_tests+0x1c2/0x480 Modules linked in: CPU: 2 PID: 1 Comm: swapper/0 Not tainted 6.7.0-rc2-test-00031-gb803d7c664d5-dirty #276 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 RIP: 0010:event_trace_self_tests+0x1c2/0x480 Code: bb e8 a2 ab 5d fc 48 8d 7b 48 e8 f9 3d 99 fc 48 8b 73 48 40 f6 c6 01 0f 84 d6 fe ff ff 48 c7 c7 20 b6 ad bb e8 7f ab 5d fc 90 <0f> 0b 90 48 89 df e8 d3 3d 99 fc 48 8b 1b 4c 39 f3 0f 85 2c ff ff RSP: 0000:ffffc9000001fdc0 EFLAGS: 00010246 RAX: 0000000000000029 RBX: ffff88810399ca80 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffffb9f19478 RDI: ffff88823c734e64 RBP: ffff88810399f300 R08: 0000000000000000 R09: fffffbfff79eb32a R10: ffffffffbcf59957 R11: 0000000000000001 R12: ffff888104068090 R13: ffffffffbc89f0a0 R14: ffffffffbc8a0f08 R15: 0000000000000078 FS: 0000000000000000(0000) GS:ffff88823c700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000001f6282001 CR4: 0000000000170ef0 Call Trace: ? __warn+0xa5/0x200 ? event_trace_self_tests+0x1c2/0x480 ? report_bug+0x1f6/0x220 ? handle_bug+0x6f/0x90 ? exc_invalid_op+0x17/0x50 ? asm_exc_invalid_op+0x1a/0x20 ? 
tracer_preempt_on+0x78/0x1c0 ? event_trace_self_tests+0x1c2/0x480 ? __pfx_event_trace_self_tests_init+0x10/0x10 event_trace_self_tests_init+0x27/0xe0 do_one_initcall+0xd6/0x3c0 ? __pfx_do_one_initcall+0x10/0x10 ? kasan_set_track+0x25/0x30 ? rcu_is_watching+0x38/0x60 kernel_init_freeable+0x324/0x450 ? __pfx_kernel_init+0x10/0x10 kernel_init+0x1f/0x1e0 ? _raw_spin_unlock_irq+0x33/0x50 ret_from_fork+0x34/0x60 ? __pfx_kernel_init+0x10/0x10 ret_from_fork_asm+0x1b/0x30 This is because the synth_event_gen_test_init() left the synthetic events that it created enabled. By having it disable them after testing, the other selftests will run fine. Link: https://lore.kernel.org/linux-trace-kernel/20231220111525.2f0f49b0@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Cc: Tom Zanussi Fixes: 9fe41efaca084 ("tracing: Add synth event generation test module") Acked-by: Masami Hiramatsu (Google) Reported-by: Alexander Graf Tested-by: Alexander Graf Signed-off-by: Steven Rostedt (Google) --- kernel/trace/synth_event_gen_test.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 8dfe85499d4a..354c2117be43 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -477,6 +477,17 @@ static int __init synth_event_gen_test_init(void) ret = test_trace_synth_event(); WARN_ON(ret); + + /* Disable when done */ + trace_array_set_clr_event(gen_synth_test->tr, + "synthetic", + "gen_synth_test", false); + trace_array_set_clr_event(empty_synth_test->tr, + "synthetic", + "empty_synth_test", false); + trace_array_set_clr_event(create_synth_test->tr, + "synthetic", + "create_synth_test", false); out: return ret; } -- cgit From 22887dfba0633c09444562722f0cf68f61ec9a50 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:20 -0500 Subject: ring-buffer: Clear pages on error in ring_buffer_subbuf_order_set() failure On failure to allocate ring buffer pages, the pointer to the CPU buffer pages is freed, but the pages that were allocated previously were not. Make sure they are freed too. Link: https://lore.kernel.org/linux-trace-kernel/20231219185629.179352802@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Fixes: f9b94daa542a ("tracing: Set new size of the ring buffer sub page") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c2afcf98ea91..3c11e8e811ed 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5927,6 +5927,7 @@ error: for_each_buffer_cpu(buffer, cpu) { if (!cpu_buffers[cpu]) continue; + rb_free_cpu_buffer(cpu_buffers[cpu]); kfree(cpu_buffers[cpu]); } kfree(cpu_buffers); -- cgit From b81e03a24966dca0b119eff0549a4e44befff419 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:21 -0500 Subject: ring-buffer: Do not swap cpu buffers if order is different As all the subbuffer orders (subbuffer sizes) must be the same throughout the ring buffer, check the order of the buffers that are doing a CPU buffer swap in ring_buffer_swap_cpu() to make sure they are the same. If they are not the same, then fail to do the swap, otherwise the ring buffer will think the CPU buffer has a specific subbuffer size when it does not.
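To see why mismatched orders would corrupt accounting: per-event offsets are effectively taken modulo the sub-buffer size, so an offset computed under one order is meaningless under another. A standalone sketch of the arithmetic (illustrative values, not kernel code):

  #include <stdio.h>

  int main(void)
  {
          unsigned long page_size = 4096;
          unsigned long addr = 0x100000UL + 5000;  /* some event address */

          /* order 0: 4KB sub-buffers; order 3: 32KB sub-buffers */
          unsigned long small = page_size << 0;
          unsigned long big = page_size << 3;

          /* The same address yields different in-buffer offsets depending
           * on which sub-buffer size the mask assumes. */
          printf("offset @4KB:  %lu\n", addr & (small - 1));  /* 904 */
          printf("offset @32KB: %lu\n", addr & (big - 1));    /* 5000 */
          return 0;
  }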
Link: https://lore.kernel.org/linux-trace-kernel/20231219185629.467894710@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3c11e8e811ed..fdcd171b09b5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5417,6 +5417,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; + if (buffer_a->subbuf_order != buffer_b->subbuf_order) + goto out; + ret = -EAGAIN; if (atomic_read(&buffer_a->record_disabled)) -- cgit From 4e958db34fd5324421ef9562c31c15e2d152dc63 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:22 -0500 Subject: ring-buffer: Make sure the spare sub buffer used for reads has same size Now that the ring buffer specifies the size of its sub buffers, they all need to be the same size. When doing a read, a swap is done with a spare page. Make sure they are the same size before doing the swap, otherwise the read will fail. Link: https://lore.kernel.org/linux-trace-kernel/20231219185629.763664788@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 711095aa731d..4dcdc30aa110 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7582,6 +7582,7 @@ struct ftrace_buffer_info { struct trace_iterator iter; void *spare; unsigned int spare_cpu; + unsigned int spare_size; unsigned int read; }; @@ -8301,6 +8302,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); + /* Make sure the spare matches the current sub buffer size */ + if (info->spare) { + if (page_size != info->spare_size) { + ring_buffer_free_read_page(iter->array_buffer->buffer, + info->spare_cpu, info->spare); + info->spare = NULL; + } + } + if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer, iter->cpu_file); @@ -8309,6 +8319,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->spare = NULL; } else { info->spare_cpu = iter->cpu_file; + info->spare_size = page_size; } } if (!info->spare) -- cgit From aa067682adf19ca78f81cd57539282dbfd7cce21 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:23 -0500 Subject: tracing: Update snapshot order along with main buffer order When updating the order of the sub buffers for the main buffer, make sure that if the snapshot buffer exists, that it gets its order updated as well. 
Link: https://lore.kernel.org/linux-trace-kernel/20231219185630.054668186@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4dcdc30aa110..2439e00aa4ce 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1263,10 +1263,17 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val); int tracing_alloc_snapshot_instance(struct trace_array *tr) { + int order; int ret; if (!tr->allocated_snapshot) { + /* Make the snapshot buffer have the same order as main buffer */ + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); + if (ret < 0) + return ret; + /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, RING_BUFFER_ALL_CPUS); @@ -1286,6 +1293,7 @@ static void free_snapshot(struct trace_array *tr) * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. */ + ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0); ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&tr->max_buffer, 1); tracing_reset_online_cpus(&tr->max_buffer); @@ -9393,6 +9401,7 @@ buffer_order_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; unsigned long val; + int old_order; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); @@ -9403,12 +9412,44 @@ buffer_order_write(struct file *filp, const char __user *ubuf, if (val < 0 || val > 7) return -EINVAL; + old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + if (old_order == val) + return 0; + ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val); if (ret) - return ret; + return 0; - (*ppos)++; +#ifdef CONFIG_TRACER_MAX_TRACE + + if (!tr->allocated_snapshot) + goto out_max; + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, val); + if (ret) { + /* Put back the old order */ + cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order); + if (WARN_ON_ONCE(cnt)) { + /* + * AARGH! We are left with different orders! + * The max buffer is our "snapshot" buffer. + * When a tracer needs a snapshot (one of the + * latency tracers), it swaps the max buffer + * with the saved snap shot. We succeeded to + * update the order of the main buffer, but failed to + * update the order of the max buffer. But when we tried + * to reset the main buffer to the original size, we + * failed there too. This is very unlikely to + * happen, but if it does, warn and kill all + * tracing. + */ + tracing_disabled = 1; + } + return ret; + } + out_max: +#endif + (*ppos)++; return cnt; } -- cgit From fa4b54af5ba17a59ce082e9d710eb638bdce7637 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:24 -0500 Subject: tracing: Stop the tracing while changing the ring buffer subbuf size Because the main buffer and the snapshot buffer need to be the same for some tracers, otherwise it will fail and disable all tracing, the tracers need to be stopped while updating the sub buffer sizes so that the tracers see the main and snapshot buffers with the same sub buffer size. 
Link: https://lore.kernel.org/linux-trace-kernel/20231219185630.353222794@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Fixes: 2808e31ec12e ("ring-buffer: Add interface for configuring trace sub buffer size") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2439e00aa4ce..82303bd2bba1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9412,13 +9412,16 @@ buffer_order_write(struct file *filp, const char __user *ubuf, if (val < 0 || val > 7) return -EINVAL; + /* Do not allow tracing while changing the order of the ring buffer */ + tracing_stop_tr(tr); + old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); if (old_order == val) - return 0; + goto out; ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val); if (ret) - return 0; + goto out; #ifdef CONFIG_TRACER_MAX_TRACE @@ -9445,11 +9448,15 @@ buffer_order_write(struct file *filp, const char __user *ubuf, */ tracing_disabled = 1; } - return ret; + goto out; } out_max: #endif (*ppos)++; + out: + if (ret) + cnt = ret; + tracing_start_tr(tr); return cnt; } -- cgit From 353cc219372935805218e05a7754a059ab104641 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:25 -0500 Subject: ring-buffer: Keep the same size when updating the order The function ring_buffer_subbuf_order_set() just updated the sub-buffers to the new size, but this also changes the size of the buffer in doing so. As the size is determined by nr_pages * subbuf_size, increasing the subbuf_size without decreasing nr_pages causes the total size of the buffer to increase. This broke the latency tracers as the snapshot needs to be the same size as the main buffer. The size of the snapshot buffer is only expanded when needed, and because the order is still the same, the size becomes out of sync with the main buffer, as the main buffer increased in size without the tracing system knowing. Calculate the nr_pages to allocate with the new subbuf_size to be buffer_size / new_subbuf_size.
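Worked through with concrete numbers, the recalculation keeps the total byte size constant (a standalone sketch with illustrative values, not kernel code):

  #include <stdio.h>

  #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

  int main(void)
  {
          unsigned long old_subbuf = 4096;       /* order 0 */
          unsigned long new_subbuf = 4096 << 3;  /* order 3 */
          unsigned long nr_pages = 128;          /* current count */

          unsigned long total = old_subbuf * nr_pages;  /* 524288 bytes */
          unsigned long new_nr = DIV_ROUND_UP(total, new_subbuf);

          /* 128 4KB sub-buffers become 16 32KB ones: same 512KB total */
          printf("total=%lu new_nr_pages=%lu\n", total, new_nr);
          return 0;
  }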
Link: https://lore.kernel.org/linux-trace-kernel/20231219185630.649397785@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Fixes: f9b94daa542a ("ring-buffer: Set new size of the ring buffer sub page") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fdcd171b09b5..23ead7602da0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5897,7 +5897,10 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; - nr_pages = buffer->buffers[cpu]->nr_pages; + /* Update the number of pages to match the new size */ + nr_pages = old_size * buffer->buffers[cpu]->nr_pages; + nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); + cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!cpu_buffers[cpu]) { err = -ENOMEM; goto error; } -- cgit From 8e7b58c27b3c567316a51079b375b846f9223bba Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:26 -0500 Subject: ring-buffer: Just update the subbuffers when changing their allocation order The ring_buffer_subbuf_order_set() was creating ring_buffer_per_cpu cpu_buffers with the new subbuffers with the updated order, and if they all successfully were created, then the ring_buffer's per_cpu buffers would be freed and replaced by them. The problem is that the freed per_cpu buffers contain state that would be lost. Running the following commands: 1. # echo 3 > /sys/kernel/tracing/buffer_subbuf_order 2. # echo 0 > /sys/kernel/tracing/tracing_cpumask 3. # echo 1 > /sys/kernel/tracing/snapshot 4. # echo ff > /sys/kernel/tracing/tracing_cpumask 5. # echo test > /sys/kernel/tracing/trace_marker Would result in: -bash: echo: write error: Bad file descriptor That's because the state of the per_cpu buffers of the snapshot buffer is lost when the order is changed (the order of a freed snapshot buffer goes to 0 to save memory, and when the snapshot buffer is allocated again, it goes back to what the main buffer is). In operation 2, the snapshot buffers were set to "disable" (as all the ring buffers CPUs were disabled). In operation 3, the snapshot is allocated and a call to ring_buffer_subbuf_order_set() replaced the per_cpu buffers losing the "record_disable" count. When it was enabled again, the atomic_dec(&cpu_buffer->record_disable) was decrementing a zero, setting it to -1. Writing 1 into the snapshot would swap the snapshot buffer with the main buffer, so now the main buffer is "disabled", and nothing can write to the ring buffer anymore. Instead of creating new per_cpu buffers and losing the state of the old buffers, basically do what the resize does and just allocate new subbuf pages into the new_pages link list of the per_cpu buffer and if they all succeed, then replace the old sub buffers with the new ones. This keeps the per_cpu buffer descriptor intact and by doing so, keeps its state.
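The underlying pattern, reduced to a userspace sketch (illustrative only, not the kernel code): state such as the record_disable count lives in the per-CPU descriptor, while the sub-buffer pages hang off it, so swapping the storage underneath the descriptor preserves the state that replacing the whole descriptor would lose:

  #include <stdio.h>
  #include <stdlib.h>

  struct cpu_buf {
          int record_disabled;   /* state to preserve */
          void *pages;           /* storage that may be reallocated */
  };

  int main(void)
  {
          struct cpu_buf buf = { .record_disabled = 1, .pages = malloc(4096) };

          /* Swap in differently sized storage; the descriptor, and with
           * it the disable count, stays intact. */
          free(buf.pages);
          buf.pages = malloc(32768);

          printf("record_disabled=%d\n", buf.record_disabled);  /* still 1 */
          free(buf.pages);
          return 0;
  }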
Link: https://lore.kernel.org/linux-trace-kernel/20231219185630.944104939@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Fixes: f9b94daa542a ("ring-buffer: Set new size of the ring buffer sub page") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 88 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 23ead7602da0..7ee6779bf292 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5856,11 +5856,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get); */ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) { - struct ring_buffer_per_cpu **cpu_buffers; + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage, *tmp; int old_order, old_size; int nr_pages; int psize; - int bsize; int err; int cpu; @@ -5874,11 +5874,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) if (psize <= BUF_PAGE_HDR_SIZE) return -EINVAL; - bsize = sizeof(void *) * buffer->cpus; - cpu_buffers = kzalloc(bsize, GFP_KERNEL); - if (!cpu_buffers) - return -ENOMEM; - old_order = buffer->subbuf_order; old_size = buffer->subbuf_size; @@ -5894,33 +5889,88 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) /* Make sure all new buffers are allocated, before deleting the old ones */ for_each_buffer_cpu(buffer, cpu) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; + cpu_buffer = buffer->buffers[cpu]; + /* Update the number of pages to match the new size */ nr_pages = old_size * buffer->buffers[cpu]->nr_pages; nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); - cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); - if (!cpu_buffers[cpu]) { + /* we need a minimum of two pages */ + if (nr_pages < 2) + nr_pages = 2; + + cpu_buffer->nr_pages_to_update = nr_pages; + + /* Include the reader page */ + nr_pages++; + + /* Allocate the new size buffer */ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (__rb_allocate_pages(cpu_buffer, nr_pages, + &cpu_buffer->new_pages)) { + /* not enough memory for new pages */ err = -ENOMEM; goto error; } } for_each_buffer_cpu(buffer, cpu) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; - rb_free_cpu_buffer(buffer->buffers[cpu]); - buffer->buffers[cpu] = cpu_buffers[cpu]; + cpu_buffer = buffer->buffers[cpu]; + + /* Clear the head bit to make the link list normal to read */ + rb_head_page_deactivate(cpu_buffer); + + /* Now walk the list and free all the old sub buffers */ + list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + /* The above loop stopped an the last page needing to be freed */ + bpage = list_entry(cpu_buffer->pages, struct buffer_page, list); + free_buffer_page(bpage); + + /* Free the current reader page */ + free_buffer_page(cpu_buffer->reader_page); + + /* One page was allocated for the reader page */ + cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, + struct buffer_page, list); + list_del_init(&cpu_buffer->reader_page->list); + + /* The cpu_buffer pages are a link list with no head */ + cpu_buffer->pages = cpu_buffer->new_pages.next; + cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev; + cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next; + + /* Clear the new_pages list 
*/ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + + cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; + cpu_buffer->nr_pages_to_update = 0; + + free_pages((unsigned long)cpu_buffer->free_page, old_order); + cpu_buffer->free_page = NULL; + + rb_head_page_activate(cpu_buffer); + + rb_check_pages(cpu_buffer); } atomic_dec(&buffer->record_disabled); mutex_unlock(&buffer->mutex); - kfree(cpu_buffers); - return 0; error: @@ -5931,12 +5981,16 @@ error: mutex_unlock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { - if (!cpu_buffers[cpu]) + cpu_buffer = buffer->buffers[cpu]; + + if (!cpu_buffer->nr_pages_to_update) continue; - rb_free_cpu_buffer(cpu_buffers[cpu]); - kfree(cpu_buffers[cpu]); + + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } } - kfree(cpu_buffers); return err; } -- cgit From 2f84b39f48476615186af5b3220ad5a2c756679b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:29 -0500 Subject: tracing: Update subbuffer with kilobytes not page order Using page order for deciding what the size of the ring buffer sub buffers are is exposing a bit too much of the implementation. Although the sub buffers are only allocated in orders of pages, allow the user to specify the minimum size of each sub-buffer via kilobytes like they can with the buffer size itself. If the user specifies 3 via: echo 3 > buffer_subbuf_size_kb Then the sub-buffer size will round up to 4kb (on a 4kb page size system). If they specify: echo 6 > buffer_subbuf_size_kb The sub-buffer size will become 8kb. and so on. Link: https://lore.kernel.org/linux-trace-kernel/20231219185631.809766769@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/ftrace.rst | 46 +++++------ kernel/trace/trace.c | 38 ++++++--- .../ftrace/test.d/00basic/ringbuffer_order.tc | 95 ---------------------- .../test.d/00basic/ringbuffer_subbuf_size.tc | 95 ++++++++++++++++++++++ 4 files changed, 140 insertions(+), 134 deletions(-) delete mode 100644 tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc create mode 100644 tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc (limited to 'kernel') diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 231d26ceedb0..933e7efb9f1b 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -203,32 +203,26 @@ of ftrace. Here is a list of some of the key files: This displays the total combined size of all the trace buffers. - buffer_subbuf_order: - - This sets or displays the sub buffer page size order. The ring buffer - is broken up into several same size "sub buffers". An event can not be - bigger than the size of the sub buffer. Normally, the sub buffer is - the size of the architecture's page (4K on x86). The sub buffer also - contains meta data at the start which also limits the size of an event. - That means when the sub buffer is a page size, no event can be larger - than the page size minus the sub buffer meta data. - - The buffer_subbuf_order allows the user to change the size of the sub - buffer. 
As the sub buffer is a set of pages by the power of 2, thus - the sub buffer total size is defined by the order: - - order size - ---- ---- - 0 PAGE_SIZE - 1 PAGE_SIZE * 2 - 2 PAGE_SIZE * 4 - 3 PAGE_SIZE * 8 - - Changing the order will change the sub buffer size allowing for events - to be larger than the page size. - - Note: When changing the order, tracing is stopped and any data in the - ring buffer and the snapshot buffer will be discarded. + buffer_subbuf_size_kb: + + This sets or displays the sub buffer size. The ring buffer is broken up + into several same size "sub buffers". An event can not be bigger than + the size of the sub buffer. Normally, the sub buffer is the size of the + architecture's page (4K on x86). The sub buffer also contains meta data + at the start which also limits the size of an event. That means when + the sub buffer is a page size, no event can be larger than the page + size minus the sub buffer meta data. + + Note, the buffer_subbuf_size_kb is a way for the user to specify the + minimum size of the subbuffer. The kernel may make it bigger due to the + implementation details, or simply fail the operation if the kernel can + not handle the request. + + Changing the sub buffer size allows for events to be larger than the + page size. + + Note: When changing the sub-buffer size, tracing is stopped and any + data in the ring buffer and the snapshot buffer will be discarded. free_buffer: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82303bd2bba1..46dbe22121e9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9384,42 +9384,54 @@ static const struct file_operations buffer_percent_fops = { }; static ssize_t -buffer_order_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; + size_t size; char buf[64]; + int order; int r; - r = sprintf(buf, "%d\n", ring_buffer_subbuf_order_get(tr->array_buffer.buffer)); + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + size = (PAGE_SIZE << order) / 1024; + + r = sprintf(buf, "%zd\n", size); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t -buffer_order_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +buffer_subbuf_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; unsigned long val; int old_order; + int order; + int pages; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; + val *= 1024; /* value passed in is in KB */ + + pages = DIV_ROUND_UP(val, PAGE_SIZE); + order = fls(pages - 1); + /* limit between 1 and 128 system pages */ - if (val < 0 || val > 7) + if (order < 0 || order > 7) return -EINVAL; /* Do not allow tracing while changing the order of the ring buffer */ tracing_stop_tr(tr); old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); - if (old_order == val) + if (old_order == order) goto out; - ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val); + ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order); if (ret) goto out; @@ -9428,7 +9440,7 @@ buffer_order_write(struct file *filp, const char __user *ubuf, if (!tr->allocated_snapshot) goto out_max; - ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, val); + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); if (ret) { /* Put back the old order */ cnt = 
ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order); @@ -9460,10 +9472,10 @@ buffer_order_write(struct file *filp, const char __user *ubuf, return cnt; } -static const struct file_operations buffer_order_fops = { +static const struct file_operations buffer_subbuf_size_fops = { .open = tracing_open_generic_tr, - .read = buffer_order_read, - .write = buffer_order_write, + .read = buffer_subbuf_size_read, + .write = buffer_subbuf_size_write, .release = tracing_release_generic_tr, .llseek = default_llseek, }; @@ -9934,8 +9946,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, tr, &buffer_percent_fops); - trace_create_file("buffer_subbuf_order", TRACE_MODE_WRITE, d_tracer, - tr, &buffer_order_fops); + trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, + tr, &buffer_subbuf_size_fops); create_trace_options_dir(tr); diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc deleted file mode 100644 index ecbcc810e6c1..000000000000 --- a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# description: Change the ringbuffer sub-buffer order -# requires: buffer_subbuf_order -# flags: instance - -get_buffer_data_size() { - sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page -} - -get_buffer_data_offset() { - sed -ne 's/^.*data.*offset:\([0-9][0-9]*\).*/\1/p' events/header_page -} - -get_event_header_size() { - type_len=`sed -ne 's/^.*type_len.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event` - time_len=`sed -ne 's/^.*time_delta.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event` - array_len=`sed -ne 's/^.*array.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event` - total_bits=$((type_len+time_len+array_len)) - total_bits=$((total_bits+7)) - echo $((total_bits/8)) -} - -get_print_event_buf_offset() { - sed -ne 's/^.*buf.*offset:\([0-9][0-9]*\).*/\1/p' events/ftrace/print/format -} - -event_header_size=`get_event_header_size` -print_header_size=`get_print_event_buf_offset` - -data_offset=`get_buffer_data_offset` - -marker_meta=$((event_header_size+print_header_size)) - -make_str() { - cnt=$1 - printf -- 'X%.0s' $(seq $cnt) -} - -write_buffer() { - size=$1 - - str=`make_str $size` - - # clear the buffer - echo > trace - - # write the string into the marker - echo $str > trace_marker - - echo $str -} - -test_buffer() { - orde=$1 - page_size=$((4096< buffer_subbuf_order - test_buffer $a -done - -echo $ORIG > buffer_subbuf_order - diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc new file mode 100644 index 000000000000..d44d09a33a74 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc @@ -0,0 +1,95 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Change the ringbuffer sub-buffer size +# requires: buffer_subbuf_size_kb +# flags: instance + +get_buffer_data_size() { + sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page +} + +get_buffer_data_offset() { + sed -ne 's/^.*data.*offset:\([0-9][0-9]*\).*/\1/p' events/header_page +} + +get_event_header_size() { + type_len=`sed -ne 's/^.*type_len.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event` + time_len=`sed -ne 's/^.*time_delta.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' 
events/header_event` + array_len=`sed -ne 's/^.*array.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event` + total_bits=$((type_len+time_len+array_len)) + total_bits=$((total_bits+7)) + echo $((total_bits/8)) +} + +get_print_event_buf_offset() { + sed -ne 's/^.*buf.*offset:\([0-9][0-9]*\).*/\1/p' events/ftrace/print/format +} + +event_header_size=`get_event_header_size` +print_header_size=`get_print_event_buf_offset` + +data_offset=`get_buffer_data_offset` + +marker_meta=$((event_header_size+print_header_size)) + +make_str() { + cnt=$1 + printf -- 'X%.0s' $(seq $cnt) +} + +write_buffer() { + size=$1 + + str=`make_str $size` + + # clear the buffer + echo > trace + + # write the string into the marker + echo $str > trace_marker + + echo $str +} + +test_buffer() { + size_kb=$1 + page_size=$((size_kb*1024)) + + size=`get_buffer_data_size` + + # the size must be greater than or equal to page_size - data_offset + page_size=$((page_size-data_offset)) + if [ $size -lt $page_size ]; then + exit fail + fi + + # Now add a little more the meta data overhead will overflow + + str=`write_buffer $size` + + # Make sure the line was broken + new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; exit}' trace` + + if [ "$new_str" = "$str" ]; then + exit fail; + fi + + # Make sure the entire line can be found + new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; }' trace` + + if [ "$new_str" != "$str" ]; then + exit fail; + fi +} + +ORIG=`cat buffer_subbuf_size_kb` + +# Could test bigger sizes than 32K, but then creating the string +# to write into the ring buffer takes too long +for a in 4 8 16 32 ; do + echo $a > buffer_subbuf_size_kb + test_buffer $a +done + +echo $ORIG > buffer_subbuf_size_kb + -- cgit From 3cb3091138ca0921c4569bcf7ffa062519639b6a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 17:38:00 -0500 Subject: ring-buffer: Use subbuf_order for buffer page masking The comparisons to PAGE_SIZE were all converted to use the buffer->subbuf_order, but the use of PAGE_MASK was missed. 
Convert all the PAGE_MASK usages over to: (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1 Link: https://lore.kernel.org/linux-trace-kernel/20231219173800.66eefb7a@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Fixes: 139f84002145 ("ring-buffer: Page size per ring buffer") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7ee6779bf292..173d2595ce2d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2269,11 +2269,13 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) } static __always_inline unsigned -rb_event_index(struct ring_buffer_event *event) +rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; - return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; + addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1; + + return addr - BUF_PAGE_HDR_SIZE; } static void rb_inc_iter(struct ring_buffer_iter *iter) @@ -2646,7 +2648,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Slow path */ static struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) +rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, u64 delta, bool abs) { if (abs) event->type_len = RINGBUF_TYPE_TIME_STAMP; @@ -2654,7 +2657,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) event->type_len = RINGBUF_TYPE_TIME_EXTEND; /* Not the first event on the page, or not delta? */ - if (abs || rb_event_index(event)) { + if (abs || rb_event_index(cpu_buffer, event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { @@ -2728,7 +2731,7 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, if (!abs) info->delta = 0; } - *event = rb_add_time_stamp(*event, info->delta, abs); + *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs); *length -= RB_LEN_TIME_EXTEND; *delta = 0; } @@ -2812,10 +2815,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage; unsigned long addr; - new_index = rb_event_index(event); + new_index = rb_event_index(cpu_buffer, event); old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; - addr &= PAGE_MASK; + addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); bpage = READ_ONCE(cpu_buffer->tail_page); @@ -3726,7 +3729,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage = cpu_buffer->commit_page; struct buffer_page *start; - addr &= PAGE_MASK; + addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); /* Do the likely case first */ if (likely(bpage->page == (void *)addr)) { -- cgit From 5abde62465222edd3080b70099bd809f166d5d7d Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 21 Dec 2023 18:03:52 +0100 Subject: bpf: Avoid unnecessary use of comma operator in verifier Although it does not seem to have any untoward side-effects, the use of ';' to separate the two assignments seems more appropriate than ','. Flagged by clang-17 -Wcomma No functional change intended. Compile tested only.
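For reference, the two forms really are equivalent here: the comma operator evaluates both assignments and yields the right-hand value, which is why the build was unaffected. A minimal userspace illustration (not from the patch):

  #include <stdio.h>

  int main(void)
  {
          int a = 0, b = 0;

          /* Comma operator: both assignments run; the expression's value
           * is the right-hand one. Easy to type by accident for ';'. */
          a = 1, b = 2;

          /* What was intended: two separate statements. */
          a = 1; b = 2;

          printf("a=%d b=%d\n", a, b);
          return 0;
  }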
Signed-off-by: Simon Horman Signed-off-by: Daniel Borkmann Reviewed-by: Dave Marchevsky Link: https://lore.kernel.org/bpf/20231221-bpf-verifier-comma-v1-1-cde2530912e9@kernel.org --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f13008d27f35..a376eb609c41 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9616,7 +9616,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID; __mark_reg_known_zero(&callee->regs[BPF_REG_2]); callee->regs[BPF_REG_2].btf = btf_vmlinux; - callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA], + callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA]; /* pointer to stack or null */ callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4]; -- cgit From d68019471995ba47e56a9da355df13a1cdb5bf7e Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 18 Dec 2023 08:45:18 +0100 Subject: entry: Move exit to usermode functions to header file To allow inlining, move exit_to_user_mode() to entry-common.h. Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231218074520.1998026-2-svens@linux.ibm.com --- include/linux/entry-common.h | 53 +++++++++++++++++++++++++++++++++++++++++++- kernel/entry/common.c | 52 ++++++++----------------------------------- 2 files changed, 61 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index d95ab85f96ba..6a6e98f3805f 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -7,6 +7,10 @@ #include #include #include +#include +#include +#include +#include #include @@ -258,6 +262,43 @@ static __always_inline void arch_exit_to_user_mode(void) { } */ void arch_do_signal_or_restart(struct pt_regs *regs); +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + */ +unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work); + +/** + * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * 1) check that interrupts are disabled + * 2) call tick_nohz_user_enter_prepare() + * 3) call exit_to_user_mode_loop() if any flags from + * EXIT_TO_USER_MODE_WORK are set + * 4) check that interrupts are still disabled + */ +static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) +{ + unsigned long ti_work; + + lockdep_assert_irqs_disabled(); + + /* Flush pending rcuog wakeup before the last need_resched() check */ + tick_nohz_user_enter_prepare(); + + ti_work = read_thread_flags(); + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) + ti_work = exit_to_user_mode_loop(regs, ti_work); + + arch_exit_to_user_mode_prepare(regs, ti_work); + + /* Ensure that kernel state is sane for a return to userspace */ + kmap_assert_nomap(); + lockdep_assert_irqs_disabled(); + lockdep_sys_exit(); +} + /** * exit_to_user_mode - Fixup state when exiting to user mode * @@ -276,7 +317,17 @@ void arch_do_signal_or_restart(struct pt_regs *regs); * non-instrumentable. * The caller has to invoke syscall_exit_to_user_mode_work() before this. 
*/ -void exit_to_user_mode(void); +static __always_inline void exit_to_user_mode(void) +{ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + instrumentation_end(); + + user_enter_irqoff(); + arch_exit_to_user_mode(); + lockdep_hardirqs_on(CALLER_ADDR0); +} /** * syscall_exit_to_user_mode_work - Handle work before returning to user mode diff --git a/kernel/entry/common.c b/kernel/entry/common.c index d7ee4bc3f2ba..7f8f8c16140a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -123,29 +123,16 @@ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) instrumentation_end(); } -/* See comment for exit_to_user_mode() in entry-common.h */ -static __always_inline void __exit_to_user_mode(void) -{ - instrumentation_begin(); - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - instrumentation_end(); - - user_enter_irqoff(); - arch_exit_to_user_mode(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -void noinstr exit_to_user_mode(void) -{ - __exit_to_user_mode(); -} - /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } -static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work) +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + * @regs: Pointer to pt_regs on entry stack + * @ti_work: TIF work flags as read by the caller + */ +__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { /* * Before returning to user space ensure that all pending work @@ -190,27 +177,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } -static void exit_to_user_mode_prepare(struct pt_regs *regs) -{ - unsigned long ti_work; - - lockdep_assert_irqs_disabled(); - - /* Flush pending rcuog wakeup before the last need_resched() check */ - tick_nohz_user_enter_prepare(); - - ti_work = read_thread_flags(); - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) - ti_work = exit_to_user_mode_loop(regs, ti_work); - - arch_exit_to_user_mode_prepare(regs, ti_work); - - /* Ensure that kernel state is sane for a return to userspace */ - kmap_assert_nomap(); - lockdep_assert_irqs_disabled(); - lockdep_sys_exit(); -} - /* * If SYSCALL_EMU is set, then the only reason to report is when * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall @@ -295,7 +261,7 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) instrumentation_begin(); __syscall_exit_to_user_mode_work(regs); instrumentation_end(); - __exit_to_user_mode(); + exit_to_user_mode(); } noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) @@ -308,7 +274,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) instrumentation_begin(); exit_to_user_mode_prepare(regs); instrumentation_end(); - __exit_to_user_mode(); + exit_to_user_mode(); } noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) -- cgit From caf4062e35b21cd7d3d35ac2f58f9765d02d32a0 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 18 Dec 2023 08:45:19 +0100 Subject: entry: Move enter_from_user_mode() to header file To allow inlining of enter_from_user_mode(), move it to entry-common.h. 
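The mechanics, boiled down to a userspace sketch (illustrative, not the kernel code): a function defined out of line in a .c file always costs a call from other translation units, while a static always-inline definition visible in the shared header lets every includer expand the body in place:

  #include <stdio.h>

  /* As in a shared header: the definition is visible to every includer,
   * so the compiler can expand it at each call site. */
  static inline __attribute__((always_inline)) long exit_prep(long work)
  {
          return work + 1;       /* stands in for the real exit work */
  }

  int main(void)
  {
          /* Compiles to the body directly; no out-of-line call remains. */
          printf("%ld\n", exit_prep(41));
          return 0;
  }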
Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231218074520.1998026-3-svens@linux.ibm.com --- include/linux/entry-common.h | 15 ++++++++++++++- kernel/entry/common.c | 26 +++----------------------- 2 files changed, 17 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 6a6e98f3805f..c4205390448e 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -102,7 +103,19 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {} * done between establishing state and enabling interrupts. The caller must * enable interrupts before invoking syscall_enter_from_user_mode_work(). */ -void enter_from_user_mode(struct pt_regs *regs); +static __always_inline void enter_from_user_mode(struct pt_regs *regs) +{ + arch_enter_from_user_mode(regs); + lockdep_hardirqs_off(CALLER_ADDR0); + + CT_WARN_ON(__ct_state() != CONTEXT_USER); + user_exit_irqoff(); + + instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); + trace_hardirqs_off_finish(); + instrumentation_end(); +} /** * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 7f8f8c16140a..0616f239da4b 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -15,26 +15,6 @@ #define CREATE_TRACE_POINTS #include -/* See comment for enter_from_user_mode() in entry-common.h */ -static __always_inline void __enter_from_user_mode(struct pt_regs *regs) -{ - arch_enter_from_user_mode(regs); - lockdep_hardirqs_off(CALLER_ADDR0); - - CT_WARN_ON(__ct_state() != CONTEXT_USER); - user_exit_irqoff(); - - instrumentation_begin(); - kmsan_unpoison_entry_regs(regs); - trace_hardirqs_off_finish(); - instrumentation_end(); -} - -void noinstr enter_from_user_mode(struct pt_regs *regs) -{ - __enter_from_user_mode(regs); -} - static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) { if (unlikely(audit_context())) { @@ -105,7 +85,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) { long ret; - __enter_from_user_mode(regs); + enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); @@ -117,7 +97,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) { - __enter_from_user_mode(regs); + enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); instrumentation_end(); @@ -266,7 +246,7 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) { - __enter_from_user_mode(regs); + enter_from_user_mode(regs); } noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) -- cgit From 221a164035fd8b554a44bd7c4bf8e7715a497561 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 18 Dec 2023 08:45:20 +0100 Subject: entry: Move syscall_enter_from_user_mode() to header file To allow inlining of syscall_enter_from_user_mode(), move it to entry-common.h. 
Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231218074520.1998026-4-svens@linux.ibm.com --- include/linux/entry-common.h | 27 +++++++++++++++++++++++++-- kernel/entry/common.c | 32 +------------------------------- 2 files changed, 26 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index c4205390448e..b0fb775a600d 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -134,6 +134,9 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) */ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); +long syscall_trace_enter(struct pt_regs *regs, long syscall, + unsigned long work); + /** * syscall_enter_from_user_mode_work - Check and handle work before invoking * a syscall @@ -157,7 +160,15 @@ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); * ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter() * 2) Invocation of audit_syscall_entry() */ -long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall); +static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) +{ + unsigned long work = READ_ONCE(current_thread_info()->syscall_work); + + if (work & SYSCALL_WORK_ENTER) + syscall = syscall_trace_enter(regs, syscall, work); + + return syscall; +} /** * syscall_enter_from_user_mode - Establish state and check and handle work @@ -176,7 +187,19 @@ long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall); * Returns: The original or a modified syscall number. See * syscall_enter_from_user_mode_work() for further explanation. */ -long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall); +static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +{ + long ret; + + enter_from_user_mode(regs); + + instrumentation_begin(); + local_irq_enable(); + ret = syscall_enter_from_user_mode_work(regs, syscall); + instrumentation_end(); + + return ret; +} /** * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable() diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 0616f239da4b..88cb3c88aaa5 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -25,7 +25,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) } } -static long syscall_trace_enter(struct pt_regs *regs, long syscall, +long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work) { long ret = 0; @@ -65,36 +65,6 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret ? 
: syscall; } -static __always_inline long -__syscall_enter_from_user_work(struct pt_regs *regs, long syscall) -{ - unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - - if (work & SYSCALL_WORK_ENTER) - syscall = syscall_trace_enter(regs, syscall, work); - - return syscall; -} - -long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) -{ - return __syscall_enter_from_user_work(regs, syscall); -} - -noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) -{ - long ret; - - enter_from_user_mode(regs); - - instrumentation_begin(); - local_irq_enable(); - ret = __syscall_enter_from_user_work(regs, syscall); - instrumentation_end(); - - return ret; -} - noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) { enter_from_user_mode(regs); -- cgit From b3edde44e5d4504c23a176819865cd603fd16d6c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 11 Dec 2023 11:48:51 +0100 Subject: cpufreq/schedutil: Use a fixed reference frequency cpuinfo.max_freq can change at runtime because of boost as an example. This implies that the value could be different than the one that has been used when computing the capacity of a CPU. The new arch_scale_freq_ref() returns a fixed and coherent reference frequency that can be used when computing a frequency based on utilization. Use this arch_scale_freq_ref() when available and fallback to policy otherwise. Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Reviewed-by: Dietmar Eggemann Acked-by: Rafael J. Wysocki Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20231211104855.558096-4-vincent.guittot@linaro.org --- kernel/sched/cpufreq_schedutil.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 4ee8ad70be99..95c3c097083e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -114,6 +114,28 @@ static void sugov_deferred_update(struct sugov_policy *sg_policy) } } +/** + * get_capacity_ref_freq - get the reference frequency that has been used to + * correlate frequency and compute capacity for a given cpufreq policy. We use + * the CPU managing it for the arch_scale_freq_ref() call in the function. + * @policy: the cpufreq policy of the CPU in question. + * + * Return: the reference CPU frequency to compute a capacity. + */ +static __always_inline +unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy) +{ + unsigned int freq = arch_scale_freq_ref(policy->cpu); + + if (freq) + return freq; + + if (arch_scale_freq_invariant()) + return policy->cpuinfo.max_freq; + + return policy->cur; +} + /** * get_next_freq - Compute a new frequency for a given cpufreq policy. * @sg_policy: schedutil policy object to compute the new frequency for. @@ -140,9 +162,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned long util, unsigned long max) { struct cpufreq_policy *policy = sg_policy->policy; - unsigned int freq = arch_scale_freq_invariant() ? 
- policy->cpuinfo.max_freq : policy->cur; + unsigned int freq; + freq = get_capacity_ref_freq(policy); freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) -- cgit From 7736ae5572eb344c090fbef9621a228e7e3d6276 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 1 Dec 2023 17:16:51 +0100 Subject: sched/fair: Remove SCHED_FEAT(UTIL_EST_FASTUP, true) sched_feat(UTIL_EST_FASTUP) has been added to easily disable the feature in order to check for possibly related regressions. After 3 years, it has never been used and no regression has been reported. Let's remove it and make fast increase a permanent behavior. Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Reviewed-by: Dietmar Eggemann Reviewed-by: Hongyan Xia Reviewed-by: Tang Yizhou Reviewed-by: Yanteng Si [for the Chinese translation] Reviewed-by: Alex Shi Link: https://lore.kernel.org/r/20231201161652.1241695-2-vincent.guittot@linaro.org --- Documentation/scheduler/schedutil.rst | 7 +++---- Documentation/translations/zh_CN/scheduler/schedutil.rst | 7 +++---- kernel/sched/fair.c | 8 +++----- kernel/sched/features.h | 1 - 4 files changed, 9 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/Documentation/scheduler/schedutil.rst b/Documentation/scheduler/schedutil.rst index 32c7d69fc86c..803fba8fc714 100644 --- a/Documentation/scheduler/schedutil.rst +++ b/Documentation/scheduler/schedutil.rst @@ -90,8 +90,8 @@ For more detail see: - Documentation/scheduler/sched-capacity.rst:"1. CPU Capacity + 2. Task utilization" -UTIL_EST / UTIL_EST_FASTUP -========================== +UTIL_EST +======== Because periodic tasks have their averages decayed while they sleep, even though when running their expected utilization will be the same, they suffer a @@ -99,8 +99,7 @@ though when running their expected utilization will be the same, they suffer a To alleviate this (a default enabled option) UTIL_EST drives an Infinite Impulse Response (IIR) EWMA with the 'running' value on dequeue -- when it is -highest. A further default enabled option UTIL_EST_FASTUP modifies the IIR -filter to instantly increase and only decay on decrease. +highest. UTIL_EST filters to instantly increase and only decay on decrease. A further runqueue wide sum (of runnable tasks) is maintained of: diff --git a/Documentation/translations/zh_CN/scheduler/schedutil.rst b/Documentation/translations/zh_CN/scheduler/schedutil.rst index d1ea68007520..7c8d87f21c42 100644 --- a/Documentation/translations/zh_CN/scheduler/schedutil.rst +++ b/Documentation/translations/zh_CN/scheduler/schedutil.rst @@ -89,16 +89,15 @@ r_cpu被定义为当前CPU的最高性能水平与系统中任何其它CPU的最 - Documentation/translations/zh_CN/scheduler/sched-capacity.rst:"1. CPU Capacity + 2. Task utilization" -UTIL_EST / UTIL_EST_FASTUP -========================== +UTIL_EST +======== 由于周期性任务的平均数在睡眠时会衰减,而在运行时其预期利用率会和睡眠前相同, 因此它们在再次运行后会面临(DVFS)的上涨。 为了缓解这个问题,(一个默认使能的编译选项)UTIL_EST驱动一个无限脉冲响应 (Infinite Impulse Response,IIR)的EWMA,“运行”值在出队时是最高的。 -另一个默认使能的编译选项UTIL_EST_FASTUP修改了IIR滤波器,使其允许立即增加, -仅在利用率下降时衰减。 +UTIL_EST滤波使其在遇到更高值时立刻增加,而遇到低值时会缓慢衰减。 进一步,运行队列的(可运行任务的)利用率之和由下式计算: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bcea3d55d95d..e94d65da8d66 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4870,11 +4870,9 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, * to smooth utilization decreases. 
*/ ue.enqueued = task_util(p); - if (sched_feat(UTIL_EST_FASTUP)) { - if (ue.ewma < ue.enqueued) { - ue.ewma = ue.enqueued; - goto done; - } + if (ue.ewma < ue.enqueued) { + ue.ewma = ue.enqueued; + goto done; } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index a3ddf84de430..143f55df890b 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -83,7 +83,6 @@ SCHED_FEAT(WA_BIAS, true) * UtilEstimation. Use estimated CPU utilization. */ SCHED_FEAT(UTIL_EST, true) -SCHED_FEAT(UTIL_EST_FASTUP, true) SCHED_FEAT(LATENCY_WARN, false) -- cgit From 11137d384996bb05cf33c8163db271e1bac3f4bf Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 1 Dec 2023 17:16:52 +0100 Subject: sched/fair: Simplify util_est With UTIL_EST_FASTUP now being permanent, we can take advantage of the fact that the ewma jumps directly to a higher utilization at dequeue to simplify util_est and remove the enqueued field. Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Reviewed-by: Dietmar Eggemann Reviewed-by: Hongyan Xia Reviewed-by: Alex Shi Link: https://lore.kernel.org/r/20231201161652.1241695-3-vincent.guittot@linaro.org --- include/linux/sched.h | 49 ++++++++---------------------- kernel/sched/debug.c | 7 ++--- kernel/sched/fair.c | 82 +++++++++++++++++++-------------------------------- kernel/sched/pelt.h | 4 +-- 4 files changed, 48 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8d258162deb0..03bfe9ab2951 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -415,42 +415,6 @@ struct load_weight { u32 inv_weight; }; -/** - * struct util_est - Estimation utilization of FAIR tasks - * @enqueued: instantaneous estimated utilization of a task/cpu - * @ewma: the Exponential Weighted Moving Average (EWMA) - * utilization of a task - * - * Support data structure to track an Exponential Weighted Moving Average - * (EWMA) of a FAIR task's utilization. New samples are added to the moving - * average each time a task completes an activation. Sample's weight is chosen - * so that the EWMA will be relatively insensitive to transient changes to the - * task's workload. - * - * The enqueued attribute has a slightly different meaning for tasks and cpus: - * - task: the task's util_avg at last task dequeue time - * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU - * Thus, the util_est.enqueued of a task represents the contribution on the - * estimated utilization of the CPU where that task is currently enqueued. - * - * Only for tasks we track a moving average of the past instantaneous - * estimated utilization. This allows to absorb sporadic drops in utilization - * of an otherwise almost periodic task. - * - * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg - * updates. When a task is dequeued, its util_est should not be updated if its - * util_avg has not been updated in the meantime. - * This information is mapped into the MSB bit of util_est.enqueued at dequeue - * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg - * for a task) it is safe to use MSB. - */ -struct util_est { - unsigned int enqueued; - unsigned int ewma; -#define UTIL_EST_WEIGHT_SHIFT 2 -#define UTIL_AVG_UNCHANGED 0x80000000 -} __attribute__((__aligned__(sizeof(u64)))); - /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). 
@@ -505,9 +469,20 @@ struct sched_avg { unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; - struct util_est util_est; + unsigned int util_est; } ____cacheline_aligned; +/* + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg + * updates. When a task is dequeued, its util_est should not be updated if its + * util_avg has not been updated in the meantime. + * This information is mapped into the MSB bit of util_est at dequeue time. + * Since max value of util_est for a task is 1024 (PELT util_avg for a task) + * it is safe to use MSB. + */ +#define UTIL_EST_WEIGHT_SHIFT 2 +#define UTIL_AVG_UNCHANGED 0x80000000 + struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 168eecc209b4..8d5d98a5834d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -684,8 +684,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->avg.runnable_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); - SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", - cfs_rq->avg.util_est.enqueued); + SEQ_printf(m, " .%-30s: %u\n", "util_est", + cfs_rq->avg.util_est); SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", @@ -1075,8 +1075,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.runnable_avg); P(se.avg.util_avg); P(se.avg.last_update_time); - P(se.avg.util_est.ewma); - PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED); + PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED); #endif #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e94d65da8d66..823dd76d0546 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4781,9 +4781,7 @@ static inline unsigned long task_runnable(struct task_struct *p) static inline unsigned long _task_util_est(struct task_struct *p) { - struct util_est ue = READ_ONCE(p->se.avg.util_est); - - return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)); + return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED; } static inline unsigned long task_util_est(struct task_struct *p) @@ -4800,9 +4798,9 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, return; /* Update root cfs_rq's estimated utilization */ - enqueued = cfs_rq->avg.util_est.enqueued; + enqueued = cfs_rq->avg.util_est; enqueued += _task_util_est(p); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); trace_sched_util_est_cfs_tp(cfs_rq); } @@ -4816,34 +4814,20 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq, return; /* Update root cfs_rq's estimated utilization */ - enqueued = cfs_rq->avg.util_est.enqueued; + enqueued = cfs_rq->avg.util_est; enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); trace_sched_util_est_cfs_tp(cfs_rq); } #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) -/* - * Check if a (signed) value is within a specified (unsigned) margin, - * based on the observation that: - * - * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) - * - * NOTE: this only works when value + margin < INT_MAX. 
- */ -static inline bool within_margin(int value, int margin) -{ - return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); -} - static inline void util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) { - long last_ewma_diff, last_enqueued_diff; - struct util_est ue; + unsigned int ewma, dequeued, last_ewma_diff; if (!sched_feat(UTIL_EST)) return; @@ -4855,23 +4839,25 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, if (!task_sleep) return; + /* Get current estimate of utilization */ + ewma = READ_ONCE(p->se.avg.util_est); + /* * If the PELT values haven't changed since enqueue time, * skip the util_est update. */ - ue = p->se.avg.util_est; - if (ue.enqueued & UTIL_AVG_UNCHANGED) + if (ewma & UTIL_AVG_UNCHANGED) return; - last_enqueued_diff = ue.enqueued; + /* Get utilization at dequeue */ + dequeued = task_util(p); /* * Reset EWMA on utilization increases, the moving average is used only * to smooth utilization decreases. */ - ue.enqueued = task_util(p); - if (ue.ewma < ue.enqueued) { - ue.ewma = ue.enqueued; + if (ewma <= dequeued) { + ewma = dequeued; goto done; } @@ -4879,27 +4865,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, * Skip update of task's estimated utilization when its members are * already ~1% close to its last activation value. */ - last_ewma_diff = ue.enqueued - ue.ewma; - last_enqueued_diff -= ue.enqueued; - if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) { - if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN)) - goto done; - - return; - } + last_ewma_diff = ewma - dequeued; + if (last_ewma_diff < UTIL_EST_MARGIN) + goto done; /* * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) + if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) return; /* * To avoid underestimate of task utilization, skip updates of EWMA if * we cannot grant that thread got all CPU time it wanted. */ - if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p)) + if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p)) goto done; @@ -4907,25 +4888,24 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, * Update Task's estimated utilization * * When *p completes an activation we can consolidate another sample - * of the task size. This is done by storing the current PELT value - * as ue.enqueued and by using this value to update the Exponential - * Weighted Moving Average (EWMA): + * of the task size. 
This is done by using this value to update the + * Exponential Weighted Moving Average (EWMA): * * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) - * = w * ( last_ewma_diff ) + ewma(t-1) - * = w * (last_ewma_diff + ewma(t-1) / w) + * = w * ( -last_ewma_diff ) + ewma(t-1) + * = w * (-last_ewma_diff + ewma(t-1) / w) * * Where 'w' is the weight of new samples, which is configured to be * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) */ - ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; - ue.ewma += last_ewma_diff; - ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; + ewma <<= UTIL_EST_WEIGHT_SHIFT; + ewma -= last_ewma_diff; + ewma >>= UTIL_EST_WEIGHT_SHIFT; done: - ue.enqueued |= UTIL_AVG_UNCHANGED; - WRITE_ONCE(p->se.avg.util_est, ue); + ewma |= UTIL_AVG_UNCHANGED; + WRITE_ONCE(p->se.avg.util_est, ewma); trace_sched_util_est_se_tp(&p->se); } @@ -7653,16 +7633,16 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) if (sched_feat(UTIL_EST)) { unsigned long util_est; - util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + util_est = READ_ONCE(cfs_rq->avg.util_est); /* * During wake-up @p isn't enqueued yet and doesn't contribute - * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. + * to any cpu_rq(cpu)->cfs.avg.util_est. * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p * has been enqueued. * * During exec (@dst_cpu = -1) @p is enqueued and does - * contribute to cpu_rq(cpu)->cfs.util_est.enqueued. + * contribute to cpu_rq(cpu)->cfs.util_est. * Remove it to "simulate" cpu_util without @p's contribution. * * Despite the task_on_rq_queued(@p) check there is still a diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 3a0e0dc28721..9e1083465fbc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -52,13 +52,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg) return; /* Avoid store if the flag has been already reset */ - enqueued = avg->util_est.enqueued; + enqueued = avg->util_est; if (!(enqueued & UTIL_AVG_UNCHANGED)) return; /* Reset flag to report util_avg has been updated */ enqueued &= ~UTIL_AVG_UNCHANGED; - WRITE_ONCE(avg->util_est.enqueued, enqueued); + WRITE_ONCE(avg->util_est, enqueued); } static inline u64 rq_clock_pelt(struct rq *rq) -- cgit From 3af7524b14198f5159a86692d57a9f28ec9375ce Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Wed, 6 Dec 2023 10:00:43 +0100 Subject: sched/fair: Use all little CPUs for CPU-bound workloads Running N CPU-bound tasks on an N CPUs platform: - with asymmetric CPU capacity - not being a DynamIq system (i.e. having a PKG level sched domain without the SD_SHARE_PKG_RESOURCES flag set) .. might result in a task placement where two tasks run on a big CPU and none on a little CPU. This placement could be more optimal by using all CPUs. Testing platform: Juno-r2: - 2 big CPUs (1-2), maximum capacity of 1024 - 4 little CPUs (0,3-5), maximum capacity of 383 Testing workload ([1]): Spawn 6 CPU-bound tasks. During the first 100ms (step 1), each tasks is affine to a CPU, except for: - one little CPU which is left idle. - one big CPU which has 2 tasks affine. After the 100ms (step 2), remove the cpumask affinity. Behavior before the patch: During step 2, the load balancer running from the idle CPU tags sched domains as: - little CPUs: 'group_has_spare'. Cf. 
group_has_capacity() and group_is_overloaded(), 3 CPU-bound tasks run on a 4 CPUs sched-domain, and the idle CPU provides enough spare capacity regarding the imbalance_pct - big CPUs: 'group_overloaded'. Indeed, 3 tasks run on a 2 CPUs sched-domain, so the following path is used: group_is_overloaded() \-if (sgs->sum_nr_running <= sgs->group_weight) return true; The following path which would change the migration type to 'migrate_task' is not taken: calculate_imbalance() \-if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) as the local group has some spare capacity, so the imbalance is not 0. The migration type requested is 'migrate_util' and the busiest runqueue is the big CPU's runqueue having 2 tasks (each having a utilization of 512). The idle little CPU cannot pull one of these task as its capacity is too small for the task. The following path is used: detach_tasks() \-case migrate_util: \-if (util > env->imbalance) goto next; After the patch: As the number of failed balancing attempts grows (with 'nr_balance_failed'), progressively make it easier to migrate a big task to the idling little CPU. A similar mechanism is used for the 'migrate_load' migration type. Improvement: Running the testing workload [1] with the step 2 representing a ~10s load for a big CPU: Before patch: ~19.3s After patch: ~18s (-6.7%) Similar issue reported at: https://lore.kernel.org/lkml/20230716014125.139577-1-qyousef@layalina.io/ Suggested-by: Vincent Guittot Signed-off-by: Pierre Gondois Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Acked-by: Qais Yousef Link: https://lore.kernel.org/r/20231206090043.634697-1-pierre.gondois@arm.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 823dd76d0546..1d561b5aae2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9071,7 +9071,7 @@ static int detach_tasks(struct lb_env *env) case migrate_util: util = task_util_est(p); - if (util > env->imbalance) + if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= util; -- cgit From fbb66ce0b1d670c72def736a13ac9176b860df4e Mon Sep 17 00:00:00 2001 From: Wang Jinchao Date: Thu, 14 Dec 2023 13:20:29 +0800 Subject: sched/fair: Remove unused 'next_buddy_marked' local variable in check_preempt_wakeup_fair() This variable became unused in: 5e963f2bd465 ("sched/fair: Commit to EEVDF") Signed-off-by: Wang Jinchao Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/202312141319+0800-wangjinchao@xfusion.com --- kernel/sched/fair.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1d561b5aae2a..9cc20855dc2b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8221,7 +8221,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int next_buddy_marked = 0; int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) @@ -8238,7 +8237,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) { set_next_buddy(pse); - next_buddy_marked = 1; } /* -- cgit From 932562a6045ed613d45bd100db37114273c22077 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Dec 2023 15:58:20 -0500 Subject: 
rseq: Split out rseq.h from sched.h We're trying to get sched.h down to more or less just types only, not code - rseq can live in its own header. This helps us kill the dependency on preempt.h in sched.h. Signed-off-by: Kent Overstreet --- arch/arm64/kernel/ptrace.c | 1 + arch/powerpc/kernel/interrupt.c | 1 + arch/s390/kernel/signal.c | 1 + arch/x86/kernel/signal.c | 1 + fs/exec.c | 1 + include/linux/resume_user_mode.h | 1 + include/linux/rseq.h | 131 +++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 125 +------------------------------------ kernel/fork.c | 1 + kernel/sched/core.c | 1 + 10 files changed, 140 insertions(+), 124 deletions(-) create mode 100644 include/linux/rseq.h (limited to 'kernel') diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 20d7ef82de90..09bb7fc7d3c2 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index c4f6d3c69ba9..eca293794a1e 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -3,6 +3,7 @@ #include #include #include +#include #include /* for show_regs */ #include diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index d63557d3868c..48e46809ee27 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 65fe2094da59..31b6f5dddfc2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/fs/exec.c b/fs/exec.c index 4aa19b24f281..41773af7e3dc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index f8f3e958e9cf..e0135e0adae0 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -6,6 +6,7 @@ #include #include #include +#include #include /** diff --git a/include/linux/rseq.h b/include/linux/rseq.h new file mode 100644 index 000000000000..bc8af3eb5598 --- /dev/null +++ b/include/linux/rseq.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _LINUX_RSEQ_H +#define _LINUX_RSEQ_H + +#ifdef CONFIG_RSEQ + +#include +#include + +/* + * Map the event mask on the user-space ABI enum rseq_cs_flags + * for direct mask checks. 
+ */ +enum rseq_event_mask_bits { + RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, + RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, + RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, +}; + +enum rseq_event_mask { + RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), + RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), + RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), +}; + +static inline void rseq_set_notify_resume(struct task_struct *t) +{ + if (t->rseq) + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); +} + +void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); + +static inline void rseq_handle_notify_resume(struct ksignal *ksig, + struct pt_regs *regs) +{ + if (current->rseq) + __rseq_handle_notify_resume(ksig, regs); +} + +static inline void rseq_signal_deliver(struct ksignal *ksig, + struct pt_regs *regs) +{ + preempt_disable(); + __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); + preempt_enable(); + rseq_handle_notify_resume(ksig, regs); +} + +/* rseq_preempt() requires preemption to be disabled. */ +static inline void rseq_preempt(struct task_struct *t) +{ + __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); + rseq_set_notify_resume(t); +} + +/* rseq_migrate() requires preemption to be disabled. */ +static inline void rseq_migrate(struct task_struct *t) +{ + __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); + rseq_set_notify_resume(t); +} + +/* + * If parent process has a registered restartable sequences area, the + * child inherits. Unregister rseq for a clone with CLONE_VM set. + */ +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ + if (clone_flags & CLONE_VM) { + t->rseq = NULL; + t->rseq_len = 0; + t->rseq_sig = 0; + t->rseq_event_mask = 0; + } else { + t->rseq = current->rseq; + t->rseq_len = current->rseq_len; + t->rseq_sig = current->rseq_sig; + t->rseq_event_mask = current->rseq_event_mask; + } +} + +static inline void rseq_execve(struct task_struct *t) +{ + t->rseq = NULL; + t->rseq_len = 0; + t->rseq_sig = 0; + t->rseq_event_mask = 0; +} + +#else + +static inline void rseq_set_notify_resume(struct task_struct *t) +{ +} +static inline void rseq_handle_notify_resume(struct ksignal *ksig, + struct pt_regs *regs) +{ +} +static inline void rseq_signal_deliver(struct ksignal *ksig, + struct pt_regs *regs) +{ +} +static inline void rseq_preempt(struct task_struct *t) +{ +} +static inline void rseq_migrate(struct task_struct *t) +{ +} +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ +} +static inline void rseq_execve(struct task_struct *t) +{ +} + +#endif + +#ifdef CONFIG_DEBUG_RSEQ + +void rseq_syscall(struct pt_regs *regs); + +#else + +static inline void rseq_syscall(struct pt_regs *regs) +{ +} + +#endif + +#endif /* _LINUX_RSEQ_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index dd002d193726..a588b94988bc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include @@ -2181,129 +2181,6 @@ static inline bool owner_on_cpu(struct task_struct *owner) unsigned long sched_cpu_util(int cpu); #endif /* CONFIG_SMP */ -#ifdef CONFIG_RSEQ - -/* - * Map the event mask on the user-space ABI enum rseq_cs_flags - * for direct mask checks. 
- */ -enum rseq_event_mask_bits { - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, -}; - -enum rseq_event_mask { - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), -}; - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ - if (t->rseq) - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); -} - -void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); - -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) -{ - if (current->rseq) - __rseq_handle_notify_resume(ksig, regs); -} - -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ - preempt_disable(); - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); - preempt_enable(); - rseq_handle_notify_resume(ksig, regs); -} - -/* rseq_preempt() requires preemption to be disabled. */ -static inline void rseq_preempt(struct task_struct *t) -{ - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); -} - -/* rseq_migrate() requires preemption to be disabled. */ -static inline void rseq_migrate(struct task_struct *t) -{ - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); -} - -/* - * If parent process has a registered restartable sequences area, the - * child inherits. Unregister rseq for a clone with CLONE_VM set. - */ -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) -{ - if (clone_flags & CLONE_VM) { - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_mask = 0; - } else { - t->rseq = current->rseq; - t->rseq_len = current->rseq_len; - t->rseq_sig = current->rseq_sig; - t->rseq_event_mask = current->rseq_event_mask; - } -} - -static inline void rseq_execve(struct task_struct *t) -{ - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_mask = 0; -} - -#else - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ -} -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_preempt(struct task_struct *t) -{ -} -static inline void rseq_migrate(struct task_struct *t) -{ -} -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) -{ -} -static inline void rseq_execve(struct task_struct *t) -{ -} - -#endif - -#ifdef CONFIG_DEBUG_RSEQ - -void rseq_syscall(struct pt_regs *regs); - -#else - -static inline void rseq_syscall(struct pt_regs *regs) -{ -} - -#endif - #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); diff --git a/kernel/fork.c b/kernel/fork.c index 319e61297bfb..53816393995b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,6 +100,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a708d225c28e..d04cf3c47899 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include -- cgit From 1e2f2d31997a9496f99e2b43255d6a48b06fbcc2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Dec 2023 15:51:54 -0500 Subject: Kill sched.h dependency 
on rcupdate.h by moving cond_resched_rcu() to rcupdate_wait.h, we can kill another big sched.h dependency. Signed-off-by: Kent Overstreet --- arch/powerpc/kvm/book3s_64_vio.c | 1 + include/linux/rcupdate_wait.h | 10 ++++++++++ include/linux/sched.h | 15 +++++---------- include/linux/sched/task.h | 1 + kernel/bpf/hashtab.c | 1 + lib/test_rhashtable.c | 1 + mm/filemap.c | 1 + mm/khugepaged.c | 1 + mm/shmem.c | 1 + net/ipv4/fib_trie.c | 1 + net/netfilter/ipset/ip_set_bitmap_gen.h | 2 ++ net/netfilter/ipset/ip_set_hash_gen.h | 1 + net/netfilter/ipvs/ip_vs_conn.c | 1 + net/netfilter/ipvs/ip_vs_est.c | 1 + 14 files changed, 28 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 14c6d7e318da..b569ebaa590e 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index 5e0f74f2f8ca..d07f0848802e 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -8,6 +8,7 @@ #include #include +#include /* * Structure allowing asynchronous waiting on RCU. @@ -55,4 +56,13 @@ do { \ #define synchronize_rcu_mult(...) \ _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) +static inline void cond_resched_rcu(void) +{ +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); +#endif +} + #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a588b94988bc..814bfdafbc1c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -10,9 +10,14 @@ #include #include +#include +#include +#include +#include #include #include +#include #include #include #include @@ -23,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -2059,15 +2063,6 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock); __cond_resched_rwlock_write(lock); \ }) -static inline void cond_resched_rcu(void) -{ -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); -#endif -} - #ifdef CONFIG_PREEMPT_DYNAMIC extern bool preempt_model_none(void); diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 1880ae21a9cb..538cdfbe895f 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -7,6 +7,7 @@ * functionality: */ +#include #include #include #include diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fd8d4b0addfc..3ba98ed2b3f9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index c20f6cb4bf55..42b585208249 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/filemap.c b/mm/filemap.c index f1c8c278310f..1219ffc04a26 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include "internal.h" diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 064654717843..47a20a4ece09 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/shmem.c b/mm/shmem.c index 91e2620148b2..98f6ca7bdae1 100644 --- 
a/mm/shmem.c +++ b/mm/shmem.c @@ -79,6 +79,7 @@ static struct vfsmount *shm_mnt __ro_after_init; #include #include #include +#include #include diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9bdfdab906fe..3ff35f811765 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 26ab0e9612d8..21f7860e8fa1 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -4,6 +4,8 @@ #ifndef __IP_SET_BITMAP_IP_GEN_H #define __IP_SET_BITMAP_IP_GEN_H +#include + #define mtype_do_test IPSET_TOKEN(MTYPE, _do_test) #define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test) #define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 7c2399541771..cbf80da9a01c 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -5,6 +5,7 @@ #define _IP_SET_HASH_GEN_H #include +#include #include #include #include diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 9065da3cdd12..a743db073887 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index c5970ba416ae..f821ad2e19b3 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -21,6 +21,7 @@ #include #include #include +#include #include -- cgit From f60a631ab9ed5df15e446269ea515f2b8948ba0c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 21 Dec 2023 17:40:14 +0100 Subject: sched/fair: Fix tg->load when offlining a CPU When a CPU is taken offline, the contribution of its cfs_rqs to task_groups' load may remain and will negatively impact the calculation of the share of the online CPUs. To fix this bug, clear the contribution of an offlining CPU to task groups' load and skip its contribution while it is inactive. Here's the reproducer of the anomaly, by Imran Khan: "So far I have encountered only one rather lengthy way of reproducing this issue, which is as follows: 1. Take a KVM guest (booted with 4 CPUs and can be scaled up to 124 CPUs) and create 2 custom cgroups: /sys/fs/cgroup/cpu/test_group_1 and /sys/fs/cgroup/ cpu/test_group_2 2. Assign a CPU intensive workload to each of these cgroups and start the workload. For my tests I am using following app: int main(int argc, char *argv[]) { unsigned long count, i, val; if (argc != 2) { printf("usage: ./a.out \n"); return 0; } count = strtoul(argv[1], NULL, 10); printf("Generating %lu random numbers \n", count); for (i = 0; i < count; i++) { val = rand(); val = val % 2; //usleep(1); } printf("Generated %lu random numbers \n", count); return 0; } Also since the system is booted with 4 CPUs, in order to completely load the system I am also launching 4 instances of same test app under: /sys/fs/cgroup/cpu/ 3. 
We can see that both of the cgroups get similar CPU time: # systemd-cgtop --depth 1 Path Tasks %CPU Memory Input/s Output/s / 659 - 5.5G - - /system.slice - - 5.7G - - /test_group_1 4 - - - - /test_group_2 3 - - - - /user.slice 31 - 56.5M - - Path Tasks %CPU Memory Input/s Output/s / 659 394.6 5.5G - - /test_group_2 3 65.7 - - - /user.slice 29 55.1 48.0M - - /test_group_1 4 47.3 - - - /system.slice - 2.2 5.7G - - Path Tasks %CPU Memory Input/s Output/s / 659 394.8 5.5G - - /test_group_1 4 62.9 - - - /user.slice 28 44.9 54.2M - - /test_group_2 3 44.7 - - - /system.slice - 0.9 5.7G - - Path Tasks %CPU Memory Input/s Output/s / 659 394.4 5.5G - - /test_group_2 3 58.8 - - - /test_group_1 4 51.9 - - - /user.slice 30 39.3 59.6M - - /system.slice - 1.9 5.7G - - Path Tasks %CPU Memory Input/s Output/s / 659 394.7 5.5G - - /test_group_1 4 60.9 - - - /test_group_2 3 57.9 - - - /user.slice 28 43.5 36.9M - - /system.slice - 3.0 5.7G - - Path Tasks %CPU Memory Input/s Output/s / 659 395.0 5.5G - - /test_group_1 4 66.8 - - - /test_group_2 3 56.3 - - - /user.slice 29 43.1 51.8M - - /system.slice - 0.7 5.7G - - 4. Now move systemd-udevd to one of these test groups, say test_group_1, and perform scale up to 124 CPUs followed by scale down back to 4 CPUs from the host side. 5. Run the same workload i.e 4 instances of CPU hogger under /sys/fs/cgroup/cpu and one instance of CPU hogger each in /sys/fs/cgroup/cpu/test_group_1 and /sys/fs/cgroup/test_group_2. It can be seen that test_group_1 (the one where systemd-udevd was moved) is getting much less CPU time than the test_group_2, even though at this point of time both of these groups have only CPU hogger running: # systemd-cgtop --depth 1 Path Tasks %CPU Memory Input/s Output/s / 1219 - 5.4G - - /system.slice - - 5.6G - - /test_group_1 4 - - - - /test_group_2 3 - - - - /user.slice 26 - 91.3M - - Path Tasks %CPU Memory Input/s Output/s / 1221 394.3 5.4G - - /test_group_2 3 82.7 - - - /test_group_1 4 14.3 - - - /system.slice - 0.8 5.6G - - /user.slice 26 0.4 91.2M - - Path Tasks %CPU Memory Input/s Output/s / 1221 394.6 5.4G - - /test_group_2 3 67.4 - - - /system.slice - 24.6 5.6G - - /test_group_1 4 12.5 - - - /user.slice 26 0.4 91.2M - - Path Tasks %CPU Memory Input/s Output/s / 1221 395.2 5.4G - - /test_group_2 3 60.9 - - - /system.slice - 27.9 5.6G - - /test_group_1 4 12.2 - - - /user.slice 26 0.4 91.2M - - Path Tasks %CPU Memory Input/s Output/s / 1221 395.2 5.4G - - /test_group_2 3 69.4 - - - /test_group_1 4 13.9 - - - /user.slice 28 1.6 92.0M - - /system.slice - 1.0 5.6G - - Path Tasks %CPU Memory Input/s Output/s / 1221 395.6 5.4G - - /test_group_2 3 59.3 - - - /test_group_1 4 14.1 - - - /user.slice 28 1.3 92.2M - - /system.slice - 0.7 5.6G - - Path Tasks %CPU Memory Input/s Output/s / 1221 395.5 5.4G - - /test_group_2 3 67.2 - - - /test_group_1 4 11.5 - - - /user.slice 28 1.3 92.5M - - /system.slice - 0.6 5.6G - - Path Tasks %CPU Memory Input/s Output/s / 1221 395.1 5.4G - - /test_group_2 3 76.8 - - - /test_group_1 4 12.9 - - - /user.slice 28 1.3 92.8M - - /system.slice - 1.2 5.6G - - From sched_debug data it can be seen that in bad case the load.weight of per-CPU sched entities corresponding to test_group_1 has reduced significantly and also load_avg of test_group_1 remains much higher than that of test_group_2, even though systemd-udevd stopped running long time back and at this point of time both cgroups just have the CPU hogger app as running entity." [ mingo: Added details from the original discussion, plus minor edits to the patch. 
] Reported-by: Imran Khan Tested-by: Imran Khan Tested-by: Aaron Lu Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Reviewed-by: Imran Khan Cc: Peter Zijlstra Cc: Borislav Petkov Link: https://lore.kernel.org/r/20231223111545.62135-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d7a3c63a2171..43c1216898cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4096,6 +4096,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) if (cfs_rq->tg == &root_task_group) return; + /* rq has been offline and doesn't contribute to the share anymore: */ + if (!cpu_active(cpu_of(rq_of(cfs_rq)))) + return; + /* * For migration heavy workloads, access to tg->load_avg can be * unbound. Limit the update rate to at most once per ms. @@ -4112,6 +4116,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) } } +static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq) +{ + long delta; + u64 now; + + /* + * No need to update load_avg for root_task_group, as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + + now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + delta = 0 - cfs_rq->tg_load_avg_contrib; + atomic_long_add(delta, &cfs_rq->tg->load_avg); + cfs_rq->tg_load_avg_contrib = 0; + cfs_rq->last_update_tg_load_avg = now; +} + +/* CPU offline callback: */ +static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) +{ + struct task_group *tg; + + lockdep_assert_rq_held(rq); + + /* + * The rq clock has already been updated in + * set_rq_offline(), so we should skip updating + * the rq clock again in unthrottle_cfs_rq(). + */ + rq_clock_start_loop_update(rq); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + clear_tg_load_avg(cfs_rq); + } + rcu_read_unlock(); + + rq_clock_stop_loop_update(rq); +} + /* * Called within set_task_rq() right before setting a task's CPU. The * caller only guarantees p->pi_lock is held; no other assumptions, @@ -4408,6 +4455,8 @@ static inline bool skip_blocked_update(struct sched_entity *se) static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} +static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {} + static inline int propagate_entity_load_avg(struct sched_entity *se) { return 0; @@ -12413,6 +12462,9 @@ static void rq_offline_fair(struct rq *rq) /* Ensure any throttled groups are reachable by pick_next_task */ unthrottle_offline_cfs_rqs(rq); + + /* Ensure that we remove rq contribution to group share: */ + clear_tg_offline_cfs_rqs(rq); } #endif /* CONFIG_SMP */ -- cgit From 623b1f896fa8a669a277ee5a258307a16c7377a3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 26 Dec 2023 12:59:02 -0500 Subject: ring-buffer: Fix wake ups when buffer_percent is set to 100 The tracefs file "buffer_percent" is to allow user space to set a water-mark on how much of the tracing ring buffer needs to be filled in order to wake up a blocked reader. 0 - is to wait until any data is in the buffer 1 - is to wait for 1% of the sub buffers to be filled 50 - would be half of the sub buffers are filled with data 100 - is not to wake the waiter until the ring buffer is completely full Unfortunately the test for being full was: dirty = ring_buffer_nr_dirty_pages(buffer, cpu); return (dirty * 100) > (full * nr_pages); Where "full" is the value for "buffer_percent". 
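To make the failure mode concrete before walking through it, here is a small userspace re-creation of the old and fixed checks; the numbers are illustrative and not taken from the commit:

	#include <stdbool.h>
	#include <stdio.h>

	/* Old and fixed forms of the watermark test, lifted out for illustration. */
	static bool full_hit_old(int dirty, int full, int nr_pages)
	{
		return (dirty * 100) > (full * nr_pages);
	}

	static bool full_hit_fixed(int dirty, int full, int nr_pages)
	{
		return ((dirty + 1) * 100) >= (full * nr_pages);
	}

	int main(void)
	{
		int nr_pages = 8, full = 100;
		int dirty = nr_pages - 1;	/* writer's sub-buffer never counts as dirty */

		/* old: 700 > 800 -> never wakes; fixed: 800 >= 800 -> wakes */
		printf("old=%d fixed=%d\n", full_hit_old(dirty, full, nr_pages),
		       full_hit_fixed(dirty, full, nr_pages));
		return 0;
	}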
There are two issues with the above when full == 100. 1. dirty * 100 > 100 * nr_pages will never be true That is, the above is basically saying that if the user sets buffer_percent to 100, more pages need to be dirty than exist in the ring buffer! 2. The page that the writer is on is never considered dirty, as dirty pages are only those that are full. When the writer goes to a new sub-buffer, it clears the contents of that sub-buffer. That is, even if the check were ">=", it would still never pass, as at most nr_pages - 1 pages can be considered "dirty". To fix this, add one to dirty and use ">=" in the compare. Link: https://lore.kernel.org/linux-trace-kernel/20231226125902.4a057f1d@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland Cc: Mathieu Desnoyers Acked-by: Masami Hiramatsu (Google) Fixes: 03329f9939781 ("tracing: Add tracefs file buffer_percentage") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 83eab547f1d1..32c0dd2fd1c3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -881,9 +881,14 @@ static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int f if (!nr_pages || !full) return true; - dirty = ring_buffer_nr_dirty_pages(buffer, cpu); + /* + * Add one as dirty will never equal nr_pages, as the sub-buffer + * that the writer is on is not counted as dirty. + * This is needed if "buffer_percent" is set to 100. + */ + dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1; - return (dirty * 100) > (full * nr_pages); + return (dirty * 100) >= (full * nr_pages); } /* -- cgit From 39a7dc23a1ed0fe81141792a09449d124c5953bd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 28 Dec 2023 09:51:49 -0500 Subject: tracing: Fix blocked reader of snapshot buffer If an application blocks on the snapshot or snapshot_raw files, expecting to be woken up when a snapshot occurs, it will not happen. Or it may happen with an unexpected result. That result is that the application will be reading the main buffer instead of the snapshot buffer. That is because when the snapshot occurs, the main and snapshot buffers are swapped. But the reader has a descriptor still pointing to the buffer that it originally connected to. This is fine for the main buffer readers, as they may be blocked waiting for a watermark to be hit, and when a snapshot occurs, the data that the main readers want is now on the snapshot buffer. But waiters of the snapshot buffer are waiting for an event that will trigger the snapshot, so that they can quickly consume it and save the snapshot before the next one occurs. But to do this, they need to read the new snapshot buffer, not the old one that is now receiving new data. Also, it does not make sense to have a watermark "buffer_percent" on the snapshot buffer, as the snapshot buffer is static and does not receive new data except all at once.
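A userspace mock of the stale-descriptor problem described above (illustrative only; the real buffers and the swap live in kernel/trace and carry far more state):

	#include <stdio.h>

	/* Toy model: a snapshot swaps the two buffers under a sleeping reader. */
	struct trace_array_mock { const char *main_buf, *snap_buf; };

	static void snapshot_swap(struct trace_array_mock *tr)
	{
		const char *tmp = tr->main_buf;

		tr->main_buf = tr->snap_buf;
		tr->snap_buf = tmp;
	}

	int main(void)
	{
		struct trace_array_mock tr = { .main_buf = "live", .snap_buf = "snap" };
		const char *cached = tr.snap_buf;	/* descriptor taken before blocking */

		snapshot_swap(&tr);			/* snapshot fires while the reader sleeps */

		/* cached now names the main buffer: the reader consumes the wrong one */
		printf("cached=%s main=%s snap=%s\n", cached, tr.main_buf, tr.snap_buf);
		return 0;
	}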
Link: https://lore.kernel.org/linux-trace-kernel/20231228095149.77f5b45d@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Cc: Mark Rutland Acked-by: Masami Hiramatsu (Google) Fixes: debdd57f5145f ("tracing: Make a snapshot feature available from userspace") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 3 ++- kernel/trace/trace.c | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 32c0dd2fd1c3..9286f88fcd32 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -949,7 +949,8 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) /* make sure the waiters see the new index */ smp_wmb(); - rb_wake_up_waiters(&rbwork->work); + /* This can be called in any context */ + irq_work_queue(&rbwork->work); } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 199df497db07..a0defe156b57 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1894,6 +1894,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); + + /* Any waiters on the old snapshot buffer need to wake up */ + ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); } /** @@ -1945,12 +1948,23 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) static int wait_on_pipe(struct trace_iterator *iter, int full) { + int ret; + /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, - full); + ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full); + +#ifdef CONFIG_TRACER_MAX_TRACE + /* + * Make sure this is still the snapshot buffer, as if a snapshot were + * to happen, this would now be the main buffer. + */ + if (iter->snapshot) + iter->array_buffer = &iter->tr->max_buffer; +#endif + return ret; } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -8517,7 +8531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, wait_index = READ_ONCE(iter->wait_index); - ret = wait_on_pipe(iter, iter->tr->buffer_percent); + ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent); if (ret) goto out; -- cgit From 2853b66b601a265306be709b4d86aaff7d92a0fc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 11 Dec 2023 16:22:09 +0000 Subject: mm: remove some calls to page_add_new_anon_rmap() We already have the folio in these functions, we just need to use it. folio_add_new_anon_rmap() didn't exist at the time they were converted to folios. 
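The shape of each conversion, shown schematically (a kernel fragment, assuming the folio is already in hand at the call site, which the diffs below confirm):

	/* Before: the page-based call had to re-derive the folio internally. */
	page_add_new_anon_rmap(new_page, vma, addr);

	/* After: operate on the folio directly and skip the lookup. */
	folio_add_new_anon_rmap(new_folio, vma, addr);
	folio_add_lru_vma(new_folio, vma);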
Link: https://lkml.kernel.org/r/20231211162214.2146080-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 2 +- mm/memory.c | 2 +- mm/userfaultfd.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 435aac1d8c27..8b115fc43f04 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { folio_get(new_folio); - page_add_new_anon_rmap(new_page, vma, addr); + folio_add_new_anon_rmap(new_folio, vma, addr); folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/mm/memory.c b/mm/memory.c index 7649cb9eb7f5..5c023bef2adb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4071,7 +4071,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { - page_add_new_anon_rmap(page, vma, vmf->address); + folio_add_new_anon_rmap(folio, vma, vmf->address); folio_add_lru_vma(folio, vma); } else { page_add_anon_rmap(page, vma, vmf->address, rmap_flags); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9ec814e47e99..203cda9192c2 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -116,7 +116,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, folio_add_lru(folio); page_add_file_rmap(page, dst_vma, false); } else { - page_add_new_anon_rmap(page, dst_vma, dst_addr); + folio_add_new_anon_rmap(folio, dst_vma, dst_addr); folio_add_lru_vma(folio, dst_vma); } -- cgit From 5cc9695f06b065168f5c893c8e006b6a8a2c9c91 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:48 +0100 Subject: kernel/events/uprobes: page_remove_rmap() -> folio_remove_rmap_pte() Let's convert __replace_page(). Link: https://lkml.kernel.org/r/20231220224504.646757-25-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8b115fc43f04..485bb0389b48 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -198,7 +198,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, vma, false); + folio_remove_rmap_pte(old_folio, old_page, vma); if (!folio_mapped(old_folio)) folio_free_swap(old_folio); page_vma_mapped_walk_done(&pvmw); -- cgit From 816d334afa85c836080b41bb6238aea845615ad9 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Sun, 17 Dec 2023 11:35:26 +0800 Subject: kexec: modify the meaning of the end parameter in kimage_is_destination_range() The end parameter received by kimage_is_destination_range() should be the last valid byte address of the target memory segment plus 1. However, in the locate_mem_hole_bottom_up() and locate_mem_hole_top_down() functions, the corresponding value passed to kimage_is_destination_range() is the last valid byte address of the target memory segment, which is 1 less. There are two ways to fix this bug. 
We can either correct the logic of the locate_mem_hole_bottom_up() and locate_mem_hole_top_down() functions, or we can fix kimage_is_destination_range() by making the end parameter represent the last valid byte address of the target memory segment. Here, we choose the second approach. Due to the modification to kimage_is_destination_range(), we also need to adjust its callers, such as kimage_alloc_normal_control_pages() and kimage_alloc_page(). Link: https://lkml.kernel.org/r/20231217033528.303333-2-ytcoode@gmail.com Signed-off-by: Yuntao Wang Acked-by: Baoquan He Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "Eric W. Biederman" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 6e0f022987ff..2f039a7d9af9 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -278,8 +278,8 @@ int kimage_is_destination_range(struct kimage *image, unsigned long mstart, mend; mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((end > mstart) && (start < mend)) + mend = mstart + image->segment[i].memsz - 1; + if ((end >= mstart) && (start <= mend)) return 1; } @@ -372,7 +372,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; - eaddr = epfn << PAGE_SHIFT; + eaddr = (epfn << PAGE_SHIFT) - 1; if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || kimage_is_destination_range(image, addr, eaddr)) { list_add(&pages->lru, &extra_pages); @@ -718,7 +718,7 @@ static struct page *kimage_alloc_page(struct kimage *image, /* If the page is not a destination page use it */ if (!kimage_is_destination_range(image, addr, - addr + PAGE_SIZE)) + addr + PAGE_SIZE - 1)) break; /* -- cgit From 18d565ea95fe553f442c5bbc5050415bab3c3fa4 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Sun, 17 Dec 2023 11:35:27 +0800 Subject: kexec_file: fix incorrect temp_start value in locate_mem_hole_top_down() temp_end represents the address of the last available byte. Therefore, the starting address of the memory segment with temp_end as its last available byte and a size of `kbuf->memsz`, that is, the value of temp_start, should be `temp_end - kbuf->memsz + 1` instead of `temp_end - kbuf->memsz`. Additionally, use the ALIGN_DOWN macro instead of open-coding it directly in locate_mem_hole_top_down() to improve code readability. Link: https://lkml.kernel.org/r/20231217033528.303333-3-ytcoode@gmail.com Signed-off-by: Yuntao Wang Acked-by: Baoquan He Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "Eric W. Biederman" Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index aca5f3668f4c..bef2f6f2571b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -434,11 +434,11 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, unsigned long temp_start, temp_end; temp_end = min(end, kbuf->buf_max); - temp_start = temp_end - kbuf->memsz; + temp_start = temp_end - kbuf->memsz + 1; do { /* align down start */ - temp_start = temp_start & (~(kbuf->buf_align - 1)); + temp_start = ALIGN_DOWN(temp_start, kbuf->buf_align); if (temp_start < start || temp_start < kbuf->buf_min) return 0; -- cgit From d391615618e8b2c30ef1e09c1705a7b1751f74f0 Mon Sep 17 00:00:00 2001 From: Ahelenia Ziemiańska Date: Tue, 19 Dec 2023 23:24:14 +0100 Subject: kernel: relay: remove relay_file_splice_read dead code, doesn't work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Documentation/filesystems/relay.rst says to use return debugfs_create_file(filename, mode, parent, buf, &relay_file_operations); and this is the only way relay_file_operations is used. Thus: debugfs_create_file(&relay_file_operations) -> __debugfs_create_file(&debugfs_full_proxy_file_operations, &relay_file_operations) -> dentry{inode: {i_fop: &debugfs_full_proxy_file_operations}, d_fsdata: &relay_file_operations | DEBUGFS_FSDATA_IS_REAL_FOPS_BIT} debugfs_full_proxy_file_operations.open is full_proxy_open, which extracts the &relay_file_operations from the dentry, and allocates via __full_proxy_fops_init() new fops, with trivial wrappers around release, llseek, read, write, poll, and unlocked_ioctl, then replaces the fops on the opened file therewith. Naturally, all thusly-created debugfs files have .splice_read = NULL. This was introduced in commit 49d200deaa68 ("debugfs: prevent access to removed files' private data") from 2016-03-22. AFAICT, relay_file_operations is the only struct file_operations used for debugfs which defines a .splice_read callback. Hooking it up with > diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c > index 5063434be0fc..952fcf5b2afa 100644 > --- a/fs/debugfs/file.c > +++ b/fs/debugfs/file.c > @@ -328,6 +328,11 @@ FULL_PROXY_FUNC(write, ssize_t, filp, > loff_t *ppos), > ARGS(filp, buf, size, ppos)); > > +FULL_PROXY_FUNC(splice_read, long, in, > + PROTO(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, > + size_t len, unsigned int flags), > + ARGS(in, ppos, pipe, len, flags)); > + > FULL_PROXY_FUNC(unlocked_ioctl, long, filp, > PROTO(struct file *filp, unsigned int cmd, unsigned long arg), > ARGS(filp, cmd, arg)); > @@ -382,6 +387,8 @@ static void __full_proxy_fops_init(struct file_operations *proxy_fops, > proxy_fops->write = full_proxy_write; > if (real_fops->poll) > proxy_fops->poll = full_proxy_poll; > + if (real_fops->splice_read) > + proxy_fops->splice_read = full_proxy_splice_read; > if (real_fops->unlocked_ioctl) > proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; > } shows it just doesn't work, and splicing always instantly returns empty (subsequent reads actually return the contents). No-one noticed it became dead code in 2016, who knows if it worked back then. Clearly no-one cares; just delete it. 
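For reference, the only prescribed way to expose relay buffers is the relay.rst pattern below (a sketch; the callback name is the one the documentation uses), and it unconditionally routes every open through the debugfs proxy:

	static struct dentry *create_buf_file_handler(const char *filename,
						      struct dentry *parent,
						      umode_t mode,
						      struct rchan_buf *buf,
						      int *is_global)
	{
		/*
		 * The full proxy installed by debugfs only forwards the
		 * operations it knows about, so relay's .splice_read was
		 * never reachable through files created this way.
		 */
		return debugfs_create_file(filename, mode, parent, buf,
					   &relay_file_operations);
	}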
Link: https://lkml.kernel.org/r/dtexwpw6zcdx7dkx3xj5gyjp5syxmyretdcbcdtvrnukd4vvuh@tarta.nabijaczleweli.xyz Signed-off-by: Ahelenia Ziemiańska Cc: Greg Kroah-Hartman Cc: Li kunyu Cc: Mike Rapoport (IBM) Cc: Rafael J. Wysocki Cc: Suren Baghdasaryan Cc: Zhang Zhengming Cc: Zhao Lei Signed-off-by: Andrew Morton --- kernel/relay.c | 162 --------------------------------------------------------- 1 file changed, 162 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 83fe0325cde1..a8e90e98bf2c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1073,167 +1073,6 @@ static ssize_t relay_file_read(struct file *filp, return written; } -static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) -{ - rbuf->bytes_consumed += bytes_consumed; - - if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) { - relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1); - rbuf->bytes_consumed %= rbuf->chan->subbuf_size; - } -} - -static void relay_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - struct rchan_buf *rbuf; - - rbuf = (struct rchan_buf *)page_private(buf->page); - relay_consume_bytes(rbuf, buf->private); -} - -static const struct pipe_buf_operations relay_pipe_buf_ops = { - .release = relay_pipe_buf_release, - .try_steal = generic_pipe_buf_try_steal, - .get = generic_pipe_buf_get, -}; - -static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) -{ -} - -/* - * subbuf_splice_actor - splice up to one subbuf's worth of data - */ -static ssize_t subbuf_splice_actor(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags, - int *nonpad_ret) -{ - unsigned int pidx, poff, total_len, subbuf_pages, nr_pages; - struct rchan_buf *rbuf = in->private_data; - unsigned int subbuf_size = rbuf->chan->subbuf_size; - uint64_t pos = (uint64_t) *ppos; - uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size; - size_t read_start = (size_t) do_div(pos, alloc_size); - size_t read_subbuf = read_start / subbuf_size; - size_t padding = rbuf->padding[read_subbuf]; - size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct splice_pipe_desc spd = { - .pages = pages, - .nr_pages = 0, - .nr_pages_max = PIPE_DEF_BUFFERS, - .partial = partial, - .ops = &relay_pipe_buf_ops, - .spd_release = relay_page_release, - }; - ssize_t ret; - - if (rbuf->subbufs_produced == rbuf->subbufs_consumed) - return 0; - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - - /* - * Adjust read len, if longer than what is available - */ - if (len > (subbuf_size - read_start % subbuf_size)) - len = subbuf_size - read_start % subbuf_size; - - subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; - pidx = (read_start / PAGE_SIZE) % subbuf_pages; - poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max); - - for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { - unsigned int this_len, this_end, private; - unsigned int cur_pos = read_start + total_len; - - if (!len) - break; - - this_len = min_t(unsigned long, len, PAGE_SIZE - poff); - private = this_len; - - spd.pages[spd.nr_pages] = rbuf->page_array[pidx]; - spd.partial[spd.nr_pages].offset = poff; - - this_end = cur_pos + this_len; - if (this_end >= nonpad_end) { - this_len = nonpad_end - cur_pos; - private = this_len + padding; - } - spd.partial[spd.nr_pages].len = this_len; - spd.partial[spd.nr_pages].private = 
private; - - len -= this_len; - total_len += this_len; - poff = 0; - pidx = (pidx + 1) % subbuf_pages; - - if (this_end >= nonpad_end) { - spd.nr_pages++; - break; - } - } - - ret = 0; - if (!spd.nr_pages) - goto out; - - ret = *nonpad_ret = splice_to_pipe(pipe, &spd); - if (ret < 0 || ret < total_len) - goto out; - - if (read_start + ret == nonpad_end) - ret += padding; - -out: - splice_shrink_spd(&spd); - return ret; -} - -static ssize_t relay_file_splice_read(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - ssize_t spliced; - int ret; - int nonpad_ret = 0; - - ret = 0; - spliced = 0; - - while (len && !spliced) { - ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); - if (ret < 0) - break; - else if (!ret) { - if (flags & SPLICE_F_NONBLOCK) - ret = -EAGAIN; - break; - } - - *ppos += ret; - if (ret > len) - len = 0; - else - len -= ret; - spliced += nonpad_ret; - nonpad_ret = 0; - } - - if (spliced) - return spliced; - - return ret; -} const struct file_operations relay_file_operations = { .open = relay_file_open, @@ -1242,6 +1081,5 @@ const struct file_operations relay_file_operations = { .read = relay_file_read, .llseek = no_llseek, .release = relay_file_release, - .splice_read = relay_file_splice_read, }; EXPORT_SYMBOL_GPL(relay_file_operations); -- cgit From 5f981878c71eaa8dc5563805152bd129a73a90be Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Dec 2023 21:49:45 -0800 Subject: stacktrace: fix kernel-doc typo Change @task to @tsk to prevent kernel-doc warnings: kernel/stacktrace.c:138: warning: Excess function parameter 'task' description in 'stack_trace_save_tsk' kernel/stacktrace.c:138: warning: Function parameter or member 'tsk' not described in 'stack_trace_save_tsk' Link: https://lkml.kernel.org/r/20231220054945.17663-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton --- kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 4f65824879ab..afb3c116da91 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(stack_trace_save); /** * stack_trace_save_tsk - Save a task stack trace into a storage array - * @task: The task to examine + * @tsk: The task to examine * @store: Pointer to storage array * @size: Size of the storage array * @skipnr: Number of entries to skip at the start of the stack trace -- cgit From 2861b37732627d7d115d77585ce4853f25cf332d Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 21 Dec 2023 12:23:08 +0800 Subject: kexec_core: fix the assignment to kimage->control_page image->control_page represents the starting address for allocating the next control page, while hole_end represents the address of the last valid byte of the currently allocated control page. This bug actually does not affect the correctness of allocating control pages, because image->control_page is currently only used in kimage_alloc_crash_control_pages(), and this function, when allocating control pages, will first align image->control_page up to the nearest `(1 << order) << PAGE_SHIFT` boundary, then use this value as the starting address of the next control page. This ensures that the newly allocated control page will use the correct starting address and not overlap with previously allocated control pages. 
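In sketch form (locals as in kimage_alloc_crash_control_pages(); ALIGN() is the generic kernel macro, and the segment-overlap checks are elided), the hole search that makes the stale value harmless is:

	size = (1 << order) << PAGE_SHIFT;
	/* Realigned on every call, so a stale control_page is corrected here. */
	hole_start = ALIGN(image->control_page, size);
	hole_end = hole_start + size - 1;
	/* ... reject holes that overlap image->segment[] ... */
	image->control_page = hole_end + 1;	/* was: image->control_page = hole_end; */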
Although it does not affect the correctness of the final result, it is better for us to set image->control_page to the correct value, in case it might be used elsewhere in the future, potentially causing errors. Therefore, after successfully allocating a control page, image->control_page should be updated to `hole_end + 1`, rather than hole_end. Link: https://lkml.kernel.org/r/20231221042308.11076-1-ytcoode@gmail.com Signed-off-by: Yuntao Wang Cc: Baoquan He Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 2f039a7d9af9..a08031b57a61 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -457,7 +457,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, /* If I don't overlap any segments I have found my hole! */ if (i == image->nr_segments) { pages = pfn_to_page(hole_start >> PAGE_SHIFT); - image->control_page = hole_end; + image->control_page = hole_end + 1; break; } } -- cgit From 6dcde5d5f248291c5ff6cbe00a7fa6ae400d1aa9 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 20 Dec 2023 13:15:34 -0800 Subject: watchdog/hardlockup: adopt softlockup logic avoiding double-dumps Patch series "watchdog: Better handling of concurrent lockups". When we get multiple lockups at roughly the same time, the output in the kernel logs can be very confusing since the reports about the lockups end up interleaved in the logs. There is some code in the kernel to try to handle this but it wasn't that complete. Li Zhe recently made this a bit better for softlockups (specifically for the case where `kernel.softlockup_all_cpu_backtrace` is not set) in commit 9d02330abd3e ("softlockup: serialized softlockup's log"), but that only handled softlockup reports. Hardlockup reports still had similar issues. This series also has a small fix to avoid dumping all stacks a second time in the case of a panic. This is a bit unrelated to the interleaving fixes but it does also improve the clarity of lockup reports. This patch (of 4): The hardlockup detector and softlockup detector both have the ability to dump the stack of all CPUs (`kernel.hardlockup_all_cpu_backtrace` and `kernel.softlockup_all_cpu_backtrace`). Both detectors also have some logic to attempt to avoid interleaving printouts if two CPUs were trying to do dumps of all CPUs at the same time. However: - The hardlockup detector's logic still allowed interleaving some information. Specifically another CPU could print modules and dump the stack of the locked CPU at the same time we were dumping all CPUs. - In the case where `kernel.hardlockup_panic` was set in addition to `kernel.hardlockup_all_cpu_backtrace`, when two CPUs both detected hardlockups at the same time the second CPU could call panic() while the first was still dumping stacks. This was especially bad if the locked up CPU wasn't responding to the request for a backtrace since the function nmi_trigger_cpumask_backtrace() can wait up to 10 seconds. Let's resolve this by adopting the softlockup logic in the hardlockup handler. NOTES: - As part of this, one might think that we should make a helper function that both the hard and softlockup detectors call. This turns out not to be super trivial since it would have to be parameterized quite a bit since there are separate global variables controlling each lockup detector and they print log messages that are just different enough that it would be a pain. 
We probably don't want to change the messages that are printed without good reason to avoid throwing log parsers for a loop. - One might also think that it would be a good idea to have the hardlockup and softlockup detector use the same global variable to prevent interleaving. This would make sure that softlockups and hardlockups can't interleave each other. That _almost_ works but has a dangerous flaw if `kernel.hardlockup_panic` is not the same as `kernel.softlockup_panic` because we might skip a call to panic() if one type of lockup was detected at the same time as another. Link: https://lkml.kernel.org/r/20231220211640.2023645-1-dianders@chromium.org Link: https://lkml.kernel.org/r/20231220131534.1.I4f35a69fbb124b5f0c71f75c631e11fabbe188ff@changeid Signed-off-by: Douglas Anderson Cc: John Ogness Cc: Lecopzer Chen Cc: Li Zhe Cc: Petr Mladek Cc: Pingfan Liu Signed-off-by: Andrew Morton --- kernel/watchdog.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index bf30a6fac665..b4fd2f12137f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -91,7 +91,7 @@ static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts); static DEFINE_PER_CPU(int, hrtimer_interrupts_saved); static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned); static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched); -static unsigned long watchdog_hardlockup_all_cpu_dumped; +static unsigned long hard_lockup_nmi_warn; notrace void arch_touch_nmi_watchdog(void) { @@ -156,6 +156,15 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) if (per_cpu(watchdog_hardlockup_warned, cpu)) return; + /* + * Prevent multiple hard-lockup reports if one cpu is already + * engaged in dumping all cpu back traces. + */ + if (sysctl_hardlockup_all_cpu_backtrace) { + if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) + return; + } + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu); print_modules(); print_irqtrace_events(current); @@ -168,13 +177,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) trigger_single_cpu_backtrace(cpu); } - /* - * Perform multi-CPU dump only once to avoid multiple - * hardlockups generating interleaving traces - */ - if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped)) + if (sysctl_hardlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(cpu); + clear_bit_unlock(0, &hard_lockup_nmi_warn); + } if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); -- cgit From 896260a6d69d01ceb1aad8cfe4298bd837a67432 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 20 Dec 2023 13:15:35 -0800 Subject: watchdog/softlockup: use printk_cpu_sync_get_irqsave() to serialize reporting Instead of introducing a spinlock, use printk_cpu_sync_get_irqsave() and printk_cpu_sync_put_irqrestore() to serialize softlockup reporting. Alone this doesn't have any real advantage over the spinlock, but this will allow us to use the same function in a future change to also serialize hardlockup crawls. NOTE: for the most part this serialization is important because we often end up in the show_regs() path and that has no built-in serialization if there are multiple callers at once. However, even in the case where we end up in the dump_stack() path this still has some advantages because the stack will be guaranteed to be together in the logs with the lockup message with no interleaving. 
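Condensed from the change below (with the unchanged lines between the two hunks filled in), the reporting section of watchdog_timer_fn() becomes:

	unsigned long flags;

	printk_cpu_sync_get_irqsave(flags);
	pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
		 smp_processor_id(), duration, current->comm,
		 task_pid_nr(current));
	print_modules();
	print_irqtrace_events(current);
	if (regs)
		show_regs(regs);
	else
		dump_stack();	/* re-acquires the same "lock"; that is allowed */
	printk_cpu_sync_put_irqrestore(flags);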
NOTE: the fact that printk_cpu_sync_get_irqsave() is allowed to be called multiple times on the same CPU is important here. Specifically we hold the "lock" while calling dump_stack() which also gets the same "lock". This is explicitly documented to be OK and means we don't need to introduce a variant of dump_stack() that doesn't grab the lock. Link: https://lkml.kernel.org/r/20231220131534.2.Ia5906525d440d8e8383cde31b7c61c2aadc8f907@changeid Signed-off-by: Douglas Anderson Reviewed-by: John Ogness Reviewed-by: Li Zhe Cc: Lecopzer Chen Cc: Petr Mladek Cc: Pingfan Liu Signed-off-by: Andrew Morton --- kernel/watchdog.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b4fd2f12137f..526041a1100a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -454,7 +454,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; - static DEFINE_SPINLOCK(watchdog_output_lock); + unsigned long flags; if (!watchdog_enabled) return HRTIMER_NORESTART; @@ -521,7 +521,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* Start period for the next softlockup warning. */ update_report_ts(); - spin_lock(&watchdog_output_lock); + printk_cpu_sync_get_irqsave(flags); pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -531,7 +531,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) show_regs(regs); else dump_stack(); - spin_unlock(&watchdog_output_lock); + printk_cpu_sync_put_irqrestore(flags); if (softlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(smp_processor_id()); -- cgit From ee6bdb3f4bf046ff7878c6103b8c88bb4ccfb11d Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 20 Dec 2023 13:15:36 -0800 Subject: watchdog/hardlockup: use printk_cpu_sync_get_irqsave() to serialize reporting If two CPUs end up reporting a hardlockup at the same time then their logs could get interleaved which is hard to read. The interleaving problem was especially bad with the "perf" hardlockup detector where the locked up CPU is always the same as the running CPU and we end up in show_regs(). show_regs() has no inherent serialization so we could mix together two crawls if two hardlockups happened at the same time (and if we didn't have `sysctl_hardlockup_all_cpu_backtrace` set). With this change we'll fully serialize hardlockups when using the "perf" hardlockup detector. The interleaving problem was less bad with the "buddy" hardlockup detector. With "buddy" we always end up calling `trigger_single_cpu_backtrace(cpu)` on some CPU other than the running one. trigger_single_cpu_backtrace() always at least serializes the individual stack crawls because it eventually uses printk_cpu_sync_get_irqsave(). Unfortunately the fact that trigger_single_cpu_backtrace() eventually calls printk_cpu_sync_get_irqsave() (on a different CPU) means that we have to drop the "lock" before calling it and we can't fully serialize all printouts associated with a given hardlockup. However, we still do get the advantage of serializing the output of print_modules() and print_irqtrace_events(). 
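Condensed, the resulting remote-CPU ("buddy") path looks like this; the comment marks the ordering constraint just described:

	printk_cpu_sync_get_irqsave(flags);
	print_modules();
	print_irqtrace_events(current);
	/*
	 * Drop the "lock" before asking the remote CPU to dump: that CPU
	 * takes the same "lock" when printing its own stack.
	 */
	printk_cpu_sync_put_irqrestore(flags);
	trigger_single_cpu_backtrace(cpu);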
Aside from serializing hardlockups from each other, this change also has the advantage of serializing hardlockups and softlockups from each other if they happen to happen at the same time since they are both using the same "lock". Even though nobody is expected to hang while holding the lock associated with printk_cpu_sync_get_irqsave(), out of an abundance of caution, we don't call printk_cpu_sync_get_irqsave() until after we print out about the hardlockup. This makes extra sure that, even if printk_cpu_sync_get_irqsave() somehow never runs we at least print that we saw the hardlockup. This is different than the choice made for softlockup because hardlockup is really our last resort. Link: https://lkml.kernel.org/r/20231220131534.3.I6ff691b3b40f0379bc860f80c6e729a0485b5247@changeid Signed-off-by: Douglas Anderson Reviewed-by: John Ogness Cc: Lecopzer Chen Cc: Li Zhe Cc: Petr Mladek Cc: Pingfan Liu Signed-off-by: Andrew Morton --- kernel/watchdog.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 526041a1100a..11f9577accca 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -151,6 +151,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) */ if (is_hardlockup(cpu)) { unsigned int this_cpu = smp_processor_id(); + unsigned long flags; /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) @@ -165,7 +166,17 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) return; } + /* + * NOTE: we call printk_cpu_sync_get_irqsave() after printing + * the lockup message. While it would be nice to serialize + * that printout, we really want to make sure that if some + * other CPU somehow locked up while holding the lock associated + * with printk_cpu_sync_get_irqsave() that we can still at least + * get the message about the lockup out. + */ pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu); + printk_cpu_sync_get_irqsave(flags); + print_modules(); print_irqtrace_events(current); if (cpu == this_cpu) { @@ -173,7 +184,9 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) show_regs(regs); else dump_stack(); + printk_cpu_sync_put_irqrestore(flags); } else { + printk_cpu_sync_put_irqrestore(flags); trigger_single_cpu_backtrace(cpu); } -- cgit From 55efe4abf927aca3692870a1851067f309e9a374 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 20 Dec 2023 13:15:37 -0800 Subject: watchdog: if panicking and we dumped everything, don't re-enable dumping If, as part of handling a hardlockup or softlockup, we've already dumped all CPUs and we're just about to panic, don't reenable dumping and give some other CPU a chance to hop in there and add some confusing logs right as the panic is happening. 
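In sketch form (names as in kernel/watchdog.c, with the intervening printout elided), the hardlockup side of the change is:

	if (sysctl_hardlockup_all_cpu_backtrace) {
		if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
			return;			/* another CPU is already dumping */
	}

	/* ... print the lockup report ... */

	if (sysctl_hardlockup_all_cpu_backtrace) {
		trigger_allbutcpu_cpu_backtrace(cpu);
		/* When panicking, stay locked so nobody re-enables dumping. */
		if (!hardlockup_panic)
			clear_bit_unlock(0, &hard_lockup_nmi_warn);
	}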
Link: https://lkml.kernel.org/r/20231220131534.4.Id3a9c7ec2d7d83e4080da6f8662ba2226b40543f@changeid Signed-off-by: Douglas Anderson Cc: John Ogness Cc: Lecopzer Chen Cc: Li Zhe Cc: Petr Mladek Cc: Pingfan Liu Signed-off-by: Andrew Morton --- kernel/watchdog.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 11f9577accca..81a8862295d6 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -192,7 +192,8 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) if (sysctl_hardlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(cpu); - clear_bit_unlock(0, &hard_lockup_nmi_warn); + if (!hardlockup_panic) + clear_bit_unlock(0, &hard_lockup_nmi_warn); } if (hardlockup_panic) @@ -548,7 +549,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (softlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(smp_processor_id()); - clear_bit_unlock(0, &soft_lockup_nmi_warn); + if (!softlockup_panic) + clear_bit_unlock(0, &soft_lockup_nmi_warn); } add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); -- cgit From d05cb470663a2a1879277e544f69e660208f08f2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 29 Dec 2023 11:51:34 -0500 Subject: ftrace: Fix modification of direct_function hash while in use Masami Hiramatsu reported a memory leak in register_ftrace_direct() where, if the number of new entries added is large enough to cause two allocations in the loop: for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { new = ftrace_add_rec_direct(entry->ip, addr, &free_hash); if (!new) goto out_remove; entry->direct = addr; } } Where ftrace_add_rec_direct() has: if (ftrace_hash_empty(direct_functions) || direct_functions->count > 2 * (1 << direct_functions->size_bits)) { struct ftrace_hash *new_hash; int size = ftrace_hash_empty(direct_functions) ? 0 : direct_functions->count + 1; if (size < 32) size = 32; new_hash = dup_hash(direct_functions, size); if (!new_hash) return NULL; *free_hash = direct_functions; direct_functions = new_hash; } The "*free_hash = direct_functions;" can happen twice, losing the previous allocation of direct_functions. But this also exposed a more serious bug. The modification of direct_functions above is not safe. As direct_functions can be referenced at any time to find what direct caller it should call, the time between: new_hash = dup_hash(direct_functions, size); and direct_functions = new_hash; can have a race with another CPU (or even this one if it gets interrupted), and the entries being moved to the new hash are not referenced. That's because "dup_hash()" is misnamed: it is really a "move_hash()". It moves the entries from the old hash to the new one. Now even if that were changed, this code is not proper as direct_functions should not be updated until the end. That is the best way to handle function reference changes, and is the way other parts of ftrace handle this. The following is done: 1. Change add_hash_entry() to return the entry it created and inserted into the hash, and not just return success or not. 2. Replace ftrace_add_rec_direct() with add_hash_entry(), and remove the former. 3. Allocate a "new_hash" at the start that is made for holding both the new hash entries as well as the existing entries in direct_functions. 4. Copy (not move) the direct_function entries over to the new_hash. 5. Copy the entries of the added hash to the new_hash. 6.
If everything succeeds, then use rcu_assign_pointer() to update the direct_functions with the new_hash. This simplifies the code and fixes both the memory leak as well as the race condition mentioned above. Link: https://lore.kernel.org/all/170368070504.42064.8960569647118388081.stgit@devnote2/ Link: https://lore.kernel.org/linux-trace-kernel/20231229115134.08dd5174@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Jiri Olsa Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Masami Hiramatsu (Google) Fixes: 763e34e74bb7d ("ftrace: Add register_ftrace_direct()") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 100 ++++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8de8bec5f366..b01ae7d36021 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1183,18 +1183,19 @@ static void __add_hash_entry(struct ftrace_hash *hash, hash->count++; } -static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +static struct ftrace_func_entry * +add_hash_entry(struct ftrace_hash *hash, unsigned long ip) { struct ftrace_func_entry *entry; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) - return -ENOMEM; + return NULL; entry->ip = ip; __add_hash_entry(hash, entry); - return 0; + return entry; } static void @@ -1349,7 +1350,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) struct ftrace_func_entry *entry; struct ftrace_hash *new_hash; int size; - int ret; int i; new_hash = alloc_ftrace_hash(size_bits); @@ -1366,8 +1366,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - ret = add_hash_entry(new_hash, entry->ip); - if (ret < 0) + if (add_hash_entry(new_hash, entry->ip) == NULL) goto free_hash; } } @@ -2536,7 +2535,7 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS /* Protected by rcu_tasks for reading, and direct_mutex for writing */ -static struct ftrace_hash *direct_functions = EMPTY_HASH; +static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); int ftrace_direct_func_count; @@ -2555,39 +2554,6 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) return entry->direct; } -static struct ftrace_func_entry* -ftrace_add_rec_direct(unsigned long ip, unsigned long addr, - struct ftrace_hash **free_hash) -{ - struct ftrace_func_entry *entry; - - if (ftrace_hash_empty(direct_functions) || - direct_functions->count > 2 * (1 << direct_functions->size_bits)) { - struct ftrace_hash *new_hash; - int size = ftrace_hash_empty(direct_functions) ?
0 : - direct_functions->count + 1; - - if (size < 32) - size = 32; - - new_hash = dup_hash(direct_functions, size); - if (!new_hash) - return NULL; - - *free_hash = direct_functions; - direct_functions = new_hash; - } - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return NULL; - - entry->ip = ip; - entry->direct = addr; - __add_hash_entry(direct_functions, entry); - return entry; -} - static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { @@ -4223,8 +4189,8 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) /* Do nothing if it exists */ if (entry) return 0; - - ret = add_hash_entry(hash, rec->ip); + if (add_hash_entry(hash, rec->ip) == NULL) + ret = -ENOMEM; } return ret; } @@ -5266,7 +5232,8 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return 0; } - return add_hash_entry(hash, ip); + entry = add_hash_entry(hash, ip); + return entry ? 0 : -ENOMEM; } static int @@ -5410,7 +5377,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long */ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash, *free_hash = NULL; + struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL; struct ftrace_func_entry *entry, *new; int err = -EBUSY, size, i; @@ -5436,17 +5403,44 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) } } - /* ... and insert them to direct_functions hash. */ err = -ENOMEM; + + /* Make a copy hash to place the new and the old entries in */ + size = hash->count + direct_functions->count; + if (size > 32) + size = 32; + new_hash = alloc_ftrace_hash(fls(size)); + if (!new_hash) + goto out_unlock; + + /* Now copy over the existing direct entries */ + size = 1 << direct_functions->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) { + new = add_hash_entry(new_hash, entry->ip); + if (!new) + goto out_unlock; + new->direct = entry->direct; + } + } + + /* ... 
and add the new entries */ + size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - new = ftrace_add_rec_direct(entry->ip, addr, &free_hash); + new = add_hash_entry(new_hash, entry->ip); if (!new) - goto out_remove; + goto out_unlock; + /* Update both the copy and the hash entry */ + new->direct = addr; entry->direct = addr; } } + free_hash = direct_functions; + rcu_assign_pointer(direct_functions, new_hash); + new_hash = NULL; + ops->func = call_direct_funcs; ops->flags = MULTI_FLAGS; ops->trampoline = FTRACE_REGS_ADDR; @@ -5454,17 +5448,17 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) err = register_ftrace_function_nolock(ops); - out_remove: - if (err) - remove_direct_functions_hash(hash, addr); - out_unlock: mutex_unlock(&direct_mutex); - if (free_hash) { + if (free_hash && free_hash != EMPTY_HASH) { synchronize_rcu_tasks(); free_ftrace_hash(free_hash); } + + if (new_hash) + free_ftrace_hash(new_hash); + return err; } EXPORT_SYMBOL_GPL(register_ftrace_direct); @@ -6309,7 +6303,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) if (entry) continue; - if (add_hash_entry(hash, rec->ip) < 0) + if (add_hash_entry(hash, rec->ip) == NULL) goto out; } else { if (entry) { -- cgit From 7c223098212957a1ecd8768e8e747ae2cf88e880 Mon Sep 17 00:00:00 2001 From: David Laight Date: Fri, 29 Dec 2023 20:53:49 +0000 Subject: locking/osq_lock: Move the definition of optimistic_spin_node into osq_lock.c struct optimistic_spin_node is private to the implementation. Move it into the C file to ensure nothing is accessing it. Signed-off-by: David Laight Acked-by: Waiman Long Signed-off-by: Linus Torvalds --- include/linux/osq_lock.h | 5 ----- kernel/locking/osq_lock.c | 7 +++++++ 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h index 5581dbd3bd34..ea8fb31379e3 100644 --- a/include/linux/osq_lock.h +++ b/include/linux/osq_lock.h @@ -6,11 +6,6 @@ * An MCS like lock especially tailored for optimistic spinning for sleeping * lock implementations (mutex, rwsem, etc). */ -struct optimistic_spin_node { - struct optimistic_spin_node *next, *prev; - int locked; /* 1 if lock acquired */ - int cpu; /* encoded CPU # + 1 value */ -}; struct optimistic_spin_queue { /* diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index d5610ad52b92..d414eef4bec6 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -11,6 +11,13 @@ * called from interrupt context and we have preemption disabled while * spinning. */ + +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; + int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # + 1 value */ +}; + static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); /* -- cgit From 563adbfc351b2af9f1406b809ba60b9f1673a882 Mon Sep 17 00:00:00 2001 From: David Laight Date: Fri, 29 Dec 2023 20:56:03 +0000 Subject: locking/osq_lock: Clarify osq_wait_next() calling convention osq_wait_next() is passed 'prev' from osq_lock() and NULL from osq_unlock() but only needs the 'cpu' value to write to lock->tail. Just pass prev->cpu or OSQ_UNLOCKED_VAL instead. Should have no effect on the generated code since gcc manages to assume that 'prev != NULL' due to an earlier dereference. 
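For context, the encoding that lets a plain int stand in for the previous node (helpers as they appear in kernel/locking/osq_lock.c; OSQ_UNLOCKED_VAL is 0):

	/*
	 * Tail values are CPU numbers biased by one, so that zero can mean
	 * "queue empty" -- hence prev->cpu alone is enough to restore the
	 * tail, and the prev pointer itself is not needed.
	 */
	static inline int encode_cpu(int cpu_nr)
	{
		return cpu_nr + 1;
	}

	static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
	{
		int cpu_nr = encoded_cpu_val - 1;

		return per_cpu_ptr(&osq_node, cpu_nr);
	}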
Signed-off-by: David Laight [ Changed 'old' to 'old_cpu' by request from Waiman Long - Linus ] Signed-off-by: Linus Torvalds --- kernel/locking/osq_lock.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index d414eef4bec6..15955ce35c53 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -44,26 +44,23 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) /* * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. * Can return NULL in case we were the last queued and we updated @lock instead. + * + * If osq_lock() is being cancelled there must be a previous node + * and 'old_cpu' is its CPU #. + * For osq_unlock() there is never a previous node and old_cpu is + * set to OSQ_UNLOCKED_VAL. */ static inline struct optimistic_spin_node * osq_wait_next(struct optimistic_spin_queue *lock, struct optimistic_spin_node *node, - struct optimistic_spin_node *prev) + int old_cpu) { struct optimistic_spin_node *next = NULL; int curr = encode_cpu(smp_processor_id()); - int old; - - /* - * If there is a prev node in queue, then the 'old' value will be - * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if - * we're currently last in queue, then the queue will then become empty. - */ - old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; for (;;) { if (atomic_read(&lock->tail) == curr && - atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) { + atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) { /* * We were the last queued, we moved @lock back. @prev * will now observe @lock and will complete its @@ -193,7 +190,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) * back to @prev. */ - next = osq_wait_next(lock, node, prev); + next = osq_wait_next(lock, node, prev->cpu); if (!next) return false; @@ -233,7 +230,7 @@ void osq_unlock(struct optimistic_spin_queue *lock) return; } - next = osq_wait_next(lock, node, NULL); + next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL); if (next) WRITE_ONCE(next->locked, 1); } -- cgit From b106bcf0f99ae0459f3c8c2f0af575ef9f5d9bde Mon Sep 17 00:00:00 2001 From: David Laight Date: Fri, 29 Dec 2023 20:56:03 +0000 Subject: locking/osq_lock: Clarify osq_wait_next() Directly return NULL or 'next' instead of breaking out of the loop. Signed-off-by: David Laight [ Split original patch into two independent parts - Linus ] Link: https://lore.kernel.org/lkml/7c8828aec72e42eeb841ca0ee3397e9a@AcuMS.aculab.com/ Signed-off-by: Linus Torvalds --- kernel/locking/osq_lock.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 15955ce35c53..75a6f6133866 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -55,7 +55,6 @@ osq_wait_next(struct optimistic_spin_queue *lock, struct optimistic_spin_node *node, int old_cpu) { - struct optimistic_spin_node *next = NULL; int curr = encode_cpu(smp_processor_id()); for (;;) { @@ -66,7 +65,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, * will now observe @lock and will complete its * unlock()/unqueue(). */ - break; + return NULL; } /* @@ -80,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock, * wait for a new @node->next from its Step-C. 
*/ if (node->next) { + struct optimistic_spin_node *next; + next = xchg(&node->next, NULL); if (next) - break; + return next; } cpu_relax(); } - - return next; } bool osq_lock(struct optimistic_spin_queue *lock) -- cgit From 5a0e241003b80247de59727c945bc94c848f893d Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 29 Nov 2023 09:43:28 -0300 Subject: thermal/core: Prepare for introduction of thermal reboot Add some helper functions to make it easier to introduce support for thermal reboot. No functional change. Signed-off-by: Fabio Estevam Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20231129124330.519423-2-festevam@gmail.com --- drivers/thermal/thermal_core.c | 14 ++++++++++---- include/linux/reboot.h | 7 ++++++- kernel/reboot.c | 8 ++++---- 3 files changed, 20 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 5e5fcbd81dda..859f62e9d779 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -311,18 +311,24 @@ static void handle_non_critical_trips(struct thermal_zone_device *tz, def_governor->throttle(tz, trip); } -void thermal_zone_device_critical(struct thermal_zone_device *tz) +static void thermal_zone_device_halt(struct thermal_zone_device *tz, bool shutdown) { /* * poweroff_delay_ms must be a carefully profiled positive value. * Its a must for forced_emergency_poweroff_work to be scheduled. */ int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS; + const char *msg = "Temperature too high"; + + dev_emerg(&tz->device, "%s: critical temperature reached\n", tz->type); - dev_emerg(&tz->device, "%s: critical temperature reached, " "shutting down\n", tz->type); + if (shutdown) + hw_protection_shutdown(msg, poweroff_delay_ms); } -void thermal_zone_device_critical(struct thermal_zone_device *tz) +{ + thermal_zone_device_halt(tz, true); } EXPORT_SYMBOL(thermal_zone_device_critical); diff --git a/include/linux/reboot.h b/include/linux/reboot.h index c4cc3b89ced1..4586c663884e 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -177,7 +177,12 @@ void ctrl_alt_del(void); extern void orderly_poweroff(bool force); extern void orderly_reboot(void); -void hw_protection_shutdown(const char *reason, int ms_until_forced); +void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown); + +static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) +{ + __hw_protection_shutdown(reason, ms_until_forced, true); +} /* * Emergency restart, callable from an interrupt handler. diff --git a/kernel/reboot.c b/kernel/reboot.c index 395a0ea3c7a8..b236c4c06bb3 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -957,7 +957,7 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms) } /** - * hw_protection_shutdown - Trigger an emergency system poweroff + * __hw_protection_shutdown - Trigger an emergency system poweroff * * @reason: Reason of emergency shutdown to be printed. * @ms_until_forced: Time to wait for orderly shutdown before tiggering a * forced shudown. Negative value disables the forced * shutdown. * * Initiate an emergency system shutdown in order to protect hardware from * further damage. Usage examples include a thermal protection or a voltage or * current regulator failures. * NOTE: The request is ignored if protection shutdown is already pending even * if the previous request has given a large timeout for forced shutdown. * Can be called from any context.
*/ -void hw_protection_shutdown(const char *reason, int ms_until_forced) +void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown) { static atomic_t allow_proceed = ATOMIC_INIT(1); @@ -986,9 +986,9 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) * orderly_poweroff failure */ hw_failure_emergency_poweroff(ms_until_forced); - orderly_poweroff(true); + if (shutdown) + orderly_poweroff(true); } -EXPORT_SYMBOL_GPL(hw_protection_shutdown); static int __init reboot_setup(char *str) { -- cgit From 79fa723ba84c2b1b3124c72df8a3b07b851a5477 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 29 Nov 2023 09:43:29 -0300 Subject: reboot: Introduce thermal_zone_device_critical_reboot() Introduce thermal_zone_device_critical_reboot() to trigger an emergency reboot. It is a counterpart of thermal_zone_device_critical() with the difference that it will force a reboot instead of shutdown. The motivation for doing this is to allow the thermal subsystem to trigger a reboot when the temperature reaches the critical temperature. Signed-off-by: Fabio Estevam Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20231129124330.519423-3-festevam@gmail.com --- drivers/thermal/thermal_core.c | 7 +++++++ drivers/thermal/thermal_core.h | 1 + include/linux/reboot.h | 5 +++++ kernel/reboot.c | 28 +++++++++++++++++----------- 4 files changed, 30 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 859f62e9d779..0d761afb7cbc 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -324,6 +324,8 @@ static void thermal_zone_device_halt(struct thermal_zone_device *tz, bool shutdo if (shutdown) hw_protection_shutdown(msg, poweroff_delay_ms); + else + hw_protection_reboot(msg, poweroff_delay_ms); } void thermal_zone_device_critical(struct thermal_zone_device *tz) @@ -332,6 +334,11 @@ void thermal_zone_device_critical(struct thermal_zone_device *tz) } EXPORT_SYMBOL(thermal_zone_device_critical); +void thermal_zone_device_critical_reboot(struct thermal_zone_device *tz) +{ + thermal_zone_device_halt(tz, false); +} + static void handle_critical_trips(struct thermal_zone_device *tz, const struct thermal_trip *trip) { diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index fe2917a74054..b5e6743bd157 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -114,6 +114,7 @@ int thermal_zone_device_set_policy(struct thermal_zone_device *, char *); int thermal_build_list_of_policies(char *buf); void __thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event); +void thermal_zone_device_critical_reboot(struct thermal_zone_device *tz); /* Helpers */ #define for_each_trip(__tz, __trip) \ diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 4586c663884e..abcdde4df697 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -179,6 +179,11 @@ extern void orderly_poweroff(bool force); extern void orderly_reboot(void); void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown); +static inline void hw_protection_reboot(const char *reason, int ms_until_forced) +{ + __hw_protection_shutdown(reason, ms_until_forced, false); +} + static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) { __hw_protection_shutdown(reason, ms_until_forced, true); } diff --git a/kernel/reboot.c b/kernel/reboot.c index
b236c4c06bb3..35d5e0b67993 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -957,19 +957,22 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms) } /** - * __hw_protection_shutdown - Trigger an emergency system poweroff + * __hw_protection_shutdown - Trigger an emergency system shutdown or reboot * - * @reason: Reason of emergency shutdown to be printed. - * @ms_until_forced: Time to wait for orderly shutdown before tiggering a - * forced shudown. Negative value disables the forced - * shutdown. + * @reason: Reason of emergency shutdown or reboot to be printed. + * @ms_until_forced: Time to wait for orderly shutdown or reboot before + * triggering it. Negative value disables the forced + * shutdown or reboot. + * @shutdown: If true, indicates that a shutdown will happen + * after the critical tempeature is reached. + * If false, indicates that a reboot will happen + * after the critical tempeature is reached. * - * Initiate an emergency system shutdown in order to protect hardware from - * further damage. Usage examples include a thermal protection or a voltage or - * current regulator failures. - * NOTE: The request is ignored if protection shutdown is already pending even - * if the previous request has given a large timeout for forced shutdown. - * Can be called from any context. + * Initiate an emergency system shutdown or reboot in order to protect + * hardware from further damage. Usage examples include a thermal protection. + * NOTE: The request is ignored if protection shutdown or reboot is already + * pending even if the previous request has given a large timeout for forced + * shutdown/reboot. */ void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown) { @@ -988,7 +991,10 @@ void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shut hw_failure_emergency_poweroff(ms_until_forced); if (shutdown) orderly_poweroff(true); + else + orderly_reboot(); } +EXPORT_SYMBOL_GPL(__hw_protection_shutdown); static int __init reboot_setup(char *str) { -- cgit From 6aa09a5bccd8e224d917afdb4c278fc66aacde4d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 27 Dec 2023 21:37:02 +0100 Subject: async: Split async_schedule_node_domain() In preparation for subsequent changes, split async_schedule_node_domain() in two pieces so as to allow the bottom part of it to be called from a somewhat different code path. No functional impact. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Stanislaw Gruszka Tested-by: Youngmin Nam Reviewed-by: Ulf Hansson --- kernel/async.c | 56 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index b2c4ba5686ee..cffe6b4cff9f 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -145,6 +145,39 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } +static async_cookie_t __async_schedule_node_domain(async_func_t func, + void *data, int node, + struct async_domain *domain, + struct async_entry *entry) +{ + async_cookie_t newcookie; + unsigned long flags; + + INIT_LIST_HEAD(&entry->domain_list); + INIT_LIST_HEAD(&entry->global_list); + INIT_WORK(&entry->work, async_run_entry_fn); + entry->func = func; + entry->data = data; + entry->domain = domain; + + spin_lock_irqsave(&async_lock, flags); + + /* allocate cookie and queue */ + newcookie = entry->cookie = next_cookie++; + + list_add_tail(&entry->domain_list, &domain->pending); + if (domain->registered) + list_add_tail(&entry->global_list, &async_global_pending); + + atomic_inc(&entry_count); + spin_unlock_irqrestore(&async_lock, flags); + + /* schedule for execution */ + queue_work_node(node, system_unbound_wq, &entry->work); + + return newcookie; +} + /** * async_schedule_node_domain - NUMA specific version of async_schedule_domain * @func: function to execute asynchronously @@ -186,29 +219,8 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, func(data, newcookie); return newcookie; } - INIT_LIST_HEAD(&entry->domain_list); - INIT_LIST_HEAD(&entry->global_list); - INIT_WORK(&entry->work, async_run_entry_fn); - entry->func = func; - entry->data = data; - entry->domain = domain; - - spin_lock_irqsave(&async_lock, flags); - /* allocate cookie and queue */ - newcookie = entry->cookie = next_cookie++; - - list_add_tail(&entry->domain_list, &domain->pending); - if (domain->registered) - list_add_tail(&entry->global_list, &async_global_pending); - - atomic_inc(&entry_count); - spin_unlock_irqrestore(&async_lock, flags); - - /* schedule for execution */ - queue_work_node(node, system_unbound_wq, &entry->work); - - return newcookie; + return __async_schedule_node_domain(func, data, node, domain, entry); } EXPORT_SYMBOL_GPL(async_schedule_node_domain); -- cgit From 7d4b5d7a37bdd63a5a3371b988744b060d5bb86f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 27 Dec 2023 21:38:23 +0100 Subject: async: Introduce async_schedule_dev_nocall() In preparation for subsequent changes, introduce a specialized variant of async_schedule_dev() that will not invoke the argument function synchronously when it cannot be scheduled for asynchronous execution. The new function, async_schedule_dev_nocall(), will be used for fixing possible deadlocks in the system-wide power management core code. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka for the series. 
Tested-by: Youngmin Nam Reviewed-by: Ulf Hansson --- include/linux/async.h | 2 ++ kernel/async.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'kernel') diff --git a/include/linux/async.h b/include/linux/async.h index cce4ad31e8fc..33c9ff4afb49 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -90,6 +90,8 @@ async_schedule_dev(async_func_t func, struct device *dev) return async_schedule_node(func, dev, dev_to_node(dev)); } +bool async_schedule_dev_nocall(async_func_t func, struct device *dev); + /** * async_schedule_dev_domain - A device specific version of async_schedule_domain * @func: function to execute asynchronously diff --git a/kernel/async.c b/kernel/async.c index cffe6b4cff9f..673bba6bdf3a 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -243,6 +243,35 @@ async_cookie_t async_schedule_node(async_func_t func, void *data, int node) } EXPORT_SYMBOL_GPL(async_schedule_node); +/** + * async_schedule_dev_nocall - A simplified variant of async_schedule_dev() + * @func: function to execute asynchronously + * @dev: device argument to be passed to function + * + * @dev is used as both the argument for the function and to provide NUMA + * context for where to run the function. + * + * If the asynchronous execution of @func is scheduled successfully, return + * true. Otherwise, do nothing and return false, unlike async_schedule_dev() + * that will run the function synchronously then. + */ +bool async_schedule_dev_nocall(async_func_t func, struct device *dev) +{ + struct async_entry *entry; + + entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL); + + /* Give up if there is no memory or too much work. */ + if (!entry || atomic_read(&entry_count) > MAX_WORK) { + kfree(entry); + return false; + } + + __async_schedule_node_domain(func, dev, dev_to_node(dev), + &async_dfl_domain, entry); + return true; +} + /** * async_synchronize_full - synchronize all asynchronous function calls * -- cgit From 8a021e7fa10576eeb3938328f39bbf98fe7d4715 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Thu, 21 Dec 2023 18:22:24 -0500 Subject: bpf: Simplify checking size of helper accesses This patch simplifies the verification of size arguments associated with pointer arguments to helpers and kfuncs. Many helpers take a pointer argument followed by the size of the memory access to be performed through that pointer. Before this patch, the handling of the size argument in check_mem_size_reg() was confusing and wasteful: if the size register's lower bound was 0, then the verification was done twice: once considering the size of the access to be the lower-bound of the respective argument, and once considering the upper bound (even if the two are the same). The upper bound checking is a super-set of the lower-bound checking(*), except: the only point of the lower-bound check is to handle the case where zero-sized accesses are explicitly not allowed and the lower-bound is zero. This static condition is now checked explicitly, replacing a much more complex, expensive and confusing verification call to check_helper_mem_access(). Error messages change in this patch.
Before, messages about illegal zero-size accesses depended on the type of the pointer and on other conditions, and sometimes the message was plain wrong: in some tests that changed you'll see that the old message was something like "R1 min value is outside of the allowed memory range", where R1 is the pointer register; the error was wrongly claiming that the pointer was bad instead of the size being bad. Other times the information that the size came from a register with a possible range of values was wrong, and the error presented the size as a fixed zero. Now the errors refer to the right register. However, the old error messages did contain useful information about the pointer register which is now lost; recovering this information was deemed not important enough. (*) Besides standing to reason that the checks for a bigger size access are a super-set of the checks for a smaller size access, I have also mechanically verified this by reading the code for all types of pointers. I could convince myself that it's true for all but PTR_TO_BTF_ID (check_ptr_to_btf_access). There, simply looking line-by-line does not immediately prove what we want. If anyone has any qualms, let me know. Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231221232225.568730-2-andreimatei1@gmail.com --- kernel/bpf/verifier.c | 10 ++++------ .../testing/selftests/bpf/progs/verifier_helper_value_access.c | 8 ++++---- tools/testing/selftests/bpf/progs/verifier_raw_stack.c | 2 +- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a376eb609c41..d4e31f61de0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7279,12 +7279,10 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, return -EACCES; } - if (reg->umin_value == 0) { - err = check_helper_mem_access(env, regno - 1, 0, - zero_size_allowed, - meta); - if (err) - return err; + if (reg->umin_value == 0 && !zero_size_allowed) { + verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n", + regno, reg->umin_value, reg->umax_value); + return -EACCES; } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c b/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c index 692216c0ad3d..3e8340c2408f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c @@ -91,7 +91,7 @@ l0_%=: exit; \ SEC("tracepoint") __description("helper access to map: empty range") -__failure __msg("invalid access to map value, value_size=48 off=0 size=0") +__failure __msg("R2 invalid zero-sized read") __naked void access_to_map_empty_range(void) { asm volatile (" \ @@ -221,7 +221,7 @@ l0_%=: exit; \ SEC("tracepoint") __description("helper access to adjusted map (via const imm): empty range") -__failure __msg("invalid access to map value, value_size=48 off=4 size=0") +__failure __msg("R2 invalid zero-sized read") __naked void via_const_imm_empty_range(void) { asm volatile (" \ @@ -386,7 +386,7 @@ l0_%=: exit; \ SEC("tracepoint") __description("helper access to adjusted map (via const reg): empty range") -__failure __msg("R1 min value is outside of the allowed memory range") +__failure __msg("R2 invalid zero-sized read") __naked void via_const_reg_empty_range(void) { asm volatile (" \ @@ -556,7 +556,7 @@ l0_%=: exit; \ SEC("tracepoint") __description("helper 
access to adjusted map (via variable): empty range") -__failure __msg("R1 min value is outside of the allowed memory range") +__failure __msg("R2 invalid zero-sized read") __naked void map_via_variable_empty_range(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_raw_stack.c b/tools/testing/selftests/bpf/progs/verifier_raw_stack.c index f67390224a9c..7cc83acac727 100644 --- a/tools/testing/selftests/bpf/progs/verifier_raw_stack.c +++ b/tools/testing/selftests/bpf/progs/verifier_raw_stack.c @@ -64,7 +64,7 @@ __naked void load_bytes_negative_len_2(void) SEC("tc") __description("raw_stack: skb_load_bytes, zero len") -__failure __msg("invalid zero-sized read") +__failure __msg("R4 invalid zero-sized read: u64=[0,0]") __naked void skb_load_bytes_zero_len(void) { asm volatile (" \ -- cgit From 9beda16c257d55213f70adee2f16d7f13a8502e1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:34 -0800 Subject: bpf: Avoid unnecessary extra percpu memory allocation Currently, for percpu memory allocation, say if the user requests the allocation size to be 32 bytes, the internally calculated size will be 40 bytes, which is then rounded up to 64 bytes, so eventually 64 bytes are allocated, wasting 32 bytes of memory. Change bpf_mem_alloc() to calculate the cache index based on the user-provided allocation size so unnecessary extra memory can be avoided. Suggested-by: Hou Tao Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031734.1288400-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index aa0fbf000a12..288ec4a967d0 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -833,7 +833,9 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) if (!size) return NULL; - idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ); + if (!ma->percpu) + size += LLIST_NODE_SZ; + idx = bpf_mem_cache_idx(size); if (idx < 0) return NULL; -- cgit From 9fc8e802048ad150e8032c4f3dbf40112160cfe9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:39 -0800 Subject: bpf: Add objcg to bpf_mem_alloc The objcg is a bpf_mem_alloc-level property since all bpf_mem_caches share the same objcg. This patch makes that property explicit. The next patch will use this property to save and restore the objcg for the percpu unit allocator.
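A hedged sketch of the resulting ownership model (example_teardown is an invented name; the real changes are in the diff below): the obj_cgroup reference is taken once per allocator and dropped once at destroy time, rather than being reached through the objcg field of an arbitrary per-cpu cache.

/* Sketch only: objcg as an allocator-level property. */
static void example_teardown(struct bpf_mem_alloc *ma)
{
	/* ... drain and free the per-cpu caches first ... */

	/* One put per allocator, no matter how many per-cpu
	 * bpf_mem_cache instances shared the reference.
	 */
	if (ma->objcg)
		obj_cgroup_put(ma->objcg);
}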
Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031739.1288590-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 1 + kernel/bpf/memalloc.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index bb1223b21308..acef8c808599 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -11,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + struct obj_cgroup *objcg; bool percpu; struct work_struct work; }; diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 288ec4a967d0..4a21050f0359 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -523,6 +523,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) if (memcg_bpf_enabled()) objcg = get_obj_cgroup_from_current(); #endif + ma->objcg = objcg; for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); c->unit_size = unit_size; @@ -542,6 +543,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) #ifdef CONFIG_MEMCG_KMEM objcg = get_obj_cgroup_from_current(); #endif + ma->objcg = objcg; for_each_possible_cpu(cpu) { cc = per_cpu_ptr(pcc, cpu); for (i = 0; i < NUM_CACHES; i++) { @@ -691,9 +693,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } - /* objcg is the same across cpus */ - if (c->objcg) - obj_cgroup_put(c->objcg); + if (ma->objcg) + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { @@ -709,8 +710,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } - if (c->objcg) - obj_cgroup_put(c->objcg); + if (ma->objcg) + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } } -- cgit From c39aa3b289e9c10d0d246cd919b06809f13b72b8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:45 -0800 Subject: bpf: Allow per unit prefill for non-fix-size percpu memory allocator Commit 41a5db8d8161 ("Add support for non-fix-size percpu mem allocation") added support for non-fix-size percpu memory allocation. Such an allocation allocates percpu memory for all buckets on all cpus, and the memory consumption grows quadratically with the number of cpus. For example, with 4 cpus and a unit size of 16 bytes, each cpu holds 16 * 4 = 64 bytes, so the total across 4 cpus is 64 * 4 = 256 bytes. With 8 cpus and the same unit size, each cpu holds 16 * 8 = 128 bytes, for a total of 128 * 8 = 1024 bytes. So if the number of cpus doubles, the memory consumption quadruples, and for a system with a large number of cpus, the memory consumption goes up quickly. For example, for a 4KB percpu allocation on 128 cpus, the total memory consumption will be 4KB * 128 * 128 = 64MB. Things become worse as the number of cpus grows (e.g., 512, 1024, etc.). In Commit 41a5db8d8161, the non-fix-size percpu memory allocation is done at boot time, so for a system with a large number of cpus, the initial percpu memory consumption is very visible. For example, for a 128-cpu system, the total percpu memory allocation will be at least (16 + 32 + 64 + 96 + 128 + 196 + 256 + 512 + 1024 + 2048 + 4096) * 128 * 128 = ~138MB,
which is pretty big. It will be even bigger for a larger number of cpus. Note that the current prefill also allocates 4 entries if the unit size is no more than 256 bytes. So on top of the 138MB memory consumption, this adds roughly 3 * (16 + 32 + 64 + 96 + 128 + 196 + 256) * 128 * 128 = ~38MB more. The next patch will try to reduce this memory consumption. Later on, Commit 1fda5bb66ad8 ("bpf: Do not allocate percpu memory at init stage") moved the non-fix-size percpu memory allocation to the bpf verification stage. Once a particular bpf_percpu_obj_new() is called by a bpf program, the memory allocator will try to fill in the cache with all sizes, causing the same amount of percpu memory consumption as in the boot stage. To reduce the initial percpu memory consumption for non-fix-size percpu memory allocation, instead of filling the cache with all supported allocation sizes, this patch intends to fill the cache only for the requested size. As users typically will not use large percpu data structures, this can save memory significantly. For example, with an allocation size of 64 bytes and 128 cpus, the total percpu memory amount will be 64 * 128 * 128 = 1MB, much less than the previous 138MB. Signed-off-by: Yonghong Song Acked-by: Hou Tao Link: https://lore.kernel.org/r/20231222031745.1289082-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 7 ++++++ kernel/bpf/memalloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 37 +++++++++++++++++----------- 3 files changed, 86 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index acef8c808599..aaf004d94322 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -22,8 +22,15 @@ struct bpf_mem_alloc { * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects. * Alloc and free are done with bpf_mem_{alloc,free}() and the size of * the returned object is given by the size argument of bpf_mem_alloc(). + * If percpu equals true, error will be returned in order to avoid + * large memory consumption and the below bpf_mem_alloc_percpu_unit_init() + * should be used to do on-demand per-cpu allocation for each size. */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); +/* Initialize a non-fix-size percpu memory allocator */ +int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg); +/* The percpu allocation with a specific unit size.
*/ +int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 4a21050f0359..f71da07eb8a0 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -121,6 +121,8 @@ struct bpf_mem_caches { struct bpf_mem_cache cache[NUM_CACHES]; }; +static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + static struct llist_node notrace *__llist_del_first(struct llist_head *head) { struct llist_node *entry, *next; @@ -499,12 +501,14 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { - static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; struct bpf_mem_caches *cc, __percpu *pcc; struct bpf_mem_cache *c, __percpu *pc; struct obj_cgroup *objcg = NULL; int cpu, i, unit_size, percpu_size = 0; + if (percpu && size == 0) + return -EINVAL; + /* room for llist_node and per-cpu pointer */ if (percpu) percpu_size = LLIST_NODE_SZ + sizeof(void *); @@ -524,6 +528,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) objcg = get_obj_cgroup_from_current(); #endif ma->objcg = objcg; + for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); c->unit_size = unit_size; @@ -562,6 +567,56 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) return 0; } +int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) +{ + struct bpf_mem_caches __percpu *pcc; + + pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL); + if (!pcc) + return -ENOMEM; + + ma->caches = pcc; + ma->objcg = objcg; + ma->percpu = true; + return 0; +} + +int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) +{ + struct bpf_mem_caches *cc, __percpu *pcc; + int cpu, i, unit_size, percpu_size; + struct obj_cgroup *objcg; + struct bpf_mem_cache *c; + + i = bpf_mem_cache_idx(size); + if (i < 0) + return -EINVAL; + + /* room for llist_node and per-cpu pointer */ + percpu_size = LLIST_NODE_SZ + sizeof(void *); + + unit_size = sizes[i]; + objcg = ma->objcg; + pcc = ma->caches; + + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(pcc, cpu); + c = &cc->cache[i]; + if (cpu == 0 && c->unit_size) + break; + + c->unit_size = unit_size; + c->objcg = objcg; + c->percpu_size = percpu_size; + c->tgt = c; + + init_refill_work(c); + prefill_mem_cache(c, cpu); + } + + return 0; +} + static void drain_mem_cache(struct bpf_mem_cache *c) { bool percpu = !!c->percpu_size; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4e31f61de0e..e9699a2cfe4f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12139,20 +12139,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) return -ENOMEM; - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (!bpf_global_percpu_ma_set) { - mutex_lock(&bpf_percpu_ma_lock); - if (!bpf_global_percpu_ma_set) { - err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); - if (!err) - bpf_global_percpu_ma_set = true; - } - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - } - if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); return -EINVAL; @@ -12173,6 +12159,29 @@ 
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + /* Charge memory allocated with bpf_global_percpu_ma to + * root memcg. The obj_cgroup for root memcg is NULL. + */ + err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + mutex_lock(&bpf_percpu_ma_lock); + err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { -- cgit From 5b95e638f134e552b5ba2976326c02babe248615 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:50 -0800 Subject: bpf: Refill only one percpu element in memalloc Typically, for a percpu map element or data structure, once allocated, most operations are lookups or in-place updates. Deletions are really rare. Currently, for percpu data structures, 4 elements will be refilled if the size is <= 256. Let us just do one element for percpu data. For example, for size 256 and 128 cpus, the potential saving will be 3 * 256 * 128 * 128 = 12MB. Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031750.1289290-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index f71da07eb8a0..a8ee6fb8401c 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -485,11 +485,16 @@ static void init_refill_work(struct bpf_mem_cache *c) static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) { - /* To avoid consuming memory assume that 1st run of bpf - * prog won't be doing more than 4 map_update_elem from - * irq disabled region + int cnt = 1; + + /* To avoid consuming memory, for non-percpu allocation, assume that + * 1st run of bpf prog won't be doing more than 4 map_update_elem from + * irq disabled region if unit size is less than or equal to 256. + * For all other cases, let us just do one allocation. */ - alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false); + if (!c->percpu_size && c->unit_size <= 256) + cnt = 4; + alloc_bulk(c, cnt, cpu_to_node(cpu), false); } /* When size != 0 bpf_mem_cache for each cpu. -- cgit From 0e2ba9f96f9b82893ba19170ae48d46003f8ef44 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:55 -0800 Subject: bpf: Use smaller low/high marks for percpu allocation Currently, the refill low/high marks are set with the assumption of normal non-percpu memory allocation. For example, for an allocation size of 256 bytes with non-percpu memory allocation, the low mark is 32 and the high mark is 96, resulting in a batch allocation of 48 elements; the allocated memory will be 48 * 256 = 12KB for this particular cpu. Assuming a 128-cpu system, the total memory consumption across all cpus will be 12K * 128 = 1.5MB of memory.
This might be okay for non-percpu allocation, but it may not be good for percpu allocation, which will consume 1.5MB * 128 = 192MB of memory in the worst case if every cpu gets a chance to allocate. In practice, percpu allocation is very rare compared to non-percpu allocation. So let us have smaller low/high marks, which can avoid unnecessary memory consumption. Signed-off-by: Yonghong Song Acked-by: Hou Tao Link: https://lore.kernel.org/r/20231222031755.1289671-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index a8ee6fb8401c..460c8f38fed6 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -464,11 +464,17 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c) * consume ~ 11 Kbyte per cpu. * Typical case will be between 11K and 116K closer to 11K. * bpf progs can and should share bpf_mem_cache when possible. + * + * Percpu allocation is typically rare. To avoid potential unnecessary large + * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1. */ static void init_refill_work(struct bpf_mem_cache *c) { init_irq_work(&c->refill_work, bpf_mem_refill); - if (c->unit_size <= 256) { + if (c->percpu_size) { + c->low_watermark = 1; + c->high_watermark = 3; + } else if (c->unit_size <= 256) { c->low_watermark = 32; c->high_watermark = 96; } else { -- cgit From 5c1a37653260ed5d9c8b26fb7fe7b99629612982 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:18:01 -0800 Subject: bpf: Limit up to 512 bytes for bpf_global_percpu_ma allocation For percpu data structure allocation with bpf_global_percpu_ma, the maximum data size is 4K. But for a system with a large number of cpus, bigger data sizes (e.g., 2K, 4K) might consume a lot of memory. For example, the percpu memory consumption with a unit size of 2K and 1024 cpus will be 2K * 1K * 1K = 2GB of memory. We should discourage such usage. Let us limit the maximum data size to 512 bytes for bpf_global_percpu_ma allocations.
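As a hedged, program-level illustration of the new limit (the struct and section names are invented; bpf_percpu_obj_new() is the selftests' bpf_experimental.h wrapper, and the error text is taken from the diff below): a percpu object type larger than 512 bytes is now rejected at load time.

#include "bpf_experimental.h"	/* selftests helper header (assumption) */

struct big_val {
	u64 data[128];		/* 1024 bytes: exceeds the 512-byte cap */
};

SEC("tc")
int alloc_big_percpu(void *ctx)
{
	struct big_val __percpu_kptr *p;

	/* Fails verification with:
	 * "bpf_percpu_obj_new type size (1024) is greater than 512"
	 */
	p = bpf_percpu_obj_new(struct big_val);
	if (p)
		bpf_percpu_obj_drop(p);
	return 0;
}

char _license[] SEC("license") = "GPL";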
Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031801.1290841-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e9699a2cfe4f..d5f4ff1eb235 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -195,6 +195,8 @@ struct bpf_verifier_stack_elem { POISON_POINTER_DELTA)) #define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) +#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 + static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); @@ -12160,6 +12162,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { + verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", + ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); + return -EINVAL; + } + if (!bpf_global_percpu_ma_set) { mutex_lock(&bpf_percpu_ma_lock); if (!bpf_global_percpu_ma_set) { -- cgit From 86438841e48f6361f0a6a04805b7d7813738761f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 14:41:42 +0100 Subject: dma-debug: make dma_debug_add_bus take a const pointer The driver core can now handle a const struct bus_type pointer, and the dma_debug_add_bus() call just passes on the pointer given to it to the driver core, so make this pointer const as well to allow everyone to use read-only struct bus_type pointers going forward. Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/2023121941-dejected-nugget-681e@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/dma-map-ops.h | 4 ++-- kernel/dma/debug.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index f2fc203fb8a1..e401f824a007 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -443,10 +443,10 @@ static inline void arch_teardown_dma_ops(struct device *dev) #endif /* CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS */ #ifdef CONFIG_DMA_API_DEBUG -void dma_debug_add_bus(struct bus_type *bus); +void dma_debug_add_bus(const struct bus_type *bus); void debug_dma_dump_mappings(struct device *dev); #else -static inline void dma_debug_add_bus(struct bus_type *bus) +static inline void dma_debug_add_bus(const struct bus_type *bus) { } static inline void debug_dma_dump_mappings(struct device *dev) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 3de494375b7b..1a5c86dd87d5 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -876,7 +876,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti return 0; } -void dma_debug_add_bus(struct bus_type *bus) +void dma_debug_add_bus(const struct bus_type *bus) { struct notifier_block *nb; -- cgit From 9ddf872b47e3ac8f27dbfc4a4737a976c7588de6 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 4 Jan 2024 08:57:44 -0800 Subject: bpf: Remove unnecessary cpu == 0 check in memalloc After merging the patch set [1] to reduce memory usage for bpf_global_percpu_ma, Alexei found a redundant check (cpu == 0) in function bpf_mem_alloc_percpu_unit_init() ([2]).
Indeed, the check is unnecessary since c->unit_size will be all zero or all non-zero across cpus before the for_each_possible_cpu() loop. Removing the check makes the code less confusing. [1] https://lore.kernel.org/all/20231222031729.1287957-1-yonghong.song@linux.dev/ [2] https://lore.kernel.org/all/20231222031745.1289082-1-yonghong.song@linux.dev/ Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240104165744.702239-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 460c8f38fed6..550f02e2cb13 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -613,7 +613,7 @@ int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) for_each_possible_cpu(cpu) { cc = per_cpu_ptr(pcc, cpu); c = &cc->cache[i]; - if (cpu == 0 && c->unit_size) + if (c->unit_size) break; c->unit_size = unit_size; -- cgit From 19bfcdf9498aa968ea293417fbbc39e523527ca8 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Wed, 3 Jan 2024 20:05:44 +0100 Subject: bpf: Relax tracing prog recursive attach rules Currently, it's not allowed to attach an fentry/fexit prog to another fentry/fexit prog. At the same time it's not uncommon to see a tracing program with lots of logic in use, and the attachment limitation prevents usage of fentry/fexit for performance analysis (e.g. with the "bpftool prog profile" command) in this case. An example could be the falcosecurity libs project, which uses tp_btf tracing programs. Following the corresponding discussion [1], the reason for that restriction is to avoid call cycles between tracing progs without introducing more complex solutions. But currently it seems impossible to load and attach tracing programs in a way that will form such a cycle. The limitation comes from the fact that attach_prog_fd is specified at prog load time (thus making it impossible to attach to a program loaded after it in this way), as well as from tracing progs not implementing link_detach. Replace the "no same type" requirement with verification that no more than one level of attachment nesting is allowed. In this way only one fentry/fexit program could be attached to another fentry/fexit to cover the profiling use case, and still no cycle could be formed. To implement this, add a new field to bpf_prog_aux to track nested attachment for tracing programs. [1]: https://lore.kernel.org/bpf/20191108064039.2041889-16-ast@kernel.org/ Acked-by: Jiri Olsa Acked-by: Song Liu Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Link: https://lore.kernel.org/r/20240103190559.14750-2-9erthalion6@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 23 ++++++++++++++++++++++- kernel/bpf/verifier.c | 39 +++++++++++++++++++++++++-------------- 3 files changed, 48 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7671530d6e4e..e30100597d0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1449,6 +1449,7 @@ struct bpf_prog_aux { bool dev_bound; /* Program is bound to the netdev. */ bool offload_requested; /* Program is bound and offloaded to the netdev. 
*/ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + bool attach_tracing_prog; /* true if tracing another tracing program */ bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1bf9805ee185..d15f9998bc20 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2738,6 +2738,22 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) goto free_prog_sec; } + /* + * Bookkeeping for managing the program attachment chain. + * + * It might be tempting to set attach_tracing_prog flag at the attachment + * time, but this will not prevent from loading bunch of tracing prog + * first, then attach them one to another. + * + * The flag attach_tracing_prog is set for the whole program lifecycle, and + * doesn't have to be cleared in bpf_tracing_link_release, since tracing + * programs cannot change attachment target. + */ + if (type == BPF_PROG_TYPE_TRACING && dst_prog && + dst_prog->type == BPF_PROG_TYPE_TRACING) { + prog->aux->attach_tracing_prog = true; + } + /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) @@ -3171,7 +3187,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, } if (tgt_prog_fd) { - /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ + /* + * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this + * part would be changed to implement the same for + * BPF_PROG_TYPE_TRACING, do not forget to update the way how + * attach_tracing_prog flag is set. + */ if (prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d5f4ff1eb235..adbf330d364b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20317,6 +20317,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, struct bpf_attach_target_info *tgt_info) { bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; + bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING; const char prefix[] = "btf_trace_"; int ret = 0, subprog = -1, i; const struct btf_type *t; @@ -20387,10 +20388,21 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "Can attach to only JITed progs\n"); return -EINVAL; } - if (tgt_prog->type == prog->type) { - /* Cannot fentry/fexit another fentry/fexit program. - * Cannot attach program extension to another extension. - * It's ok to attach fentry/fexit to extension program. + if (prog_tracing) { + if (aux->attach_tracing_prog) { + /* + * Target program is an fentry/fexit which is already attached + * to another tracing program. More levels of nesting + * attachment are not allowed. + */ + bpf_log(log, "Cannot nest tracing program attach more than once\n"); + return -EINVAL; + } + } else if (tgt_prog->type == prog->type) { + /* + * To avoid potential call chain cycles, prevent attaching of a + * program extension to another extension. It's ok to attach + * fentry/fexit to extension program. */ bpf_log(log, "Cannot recursively attach\n"); return -EINVAL; @@ -20403,16 +20415,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance * analysis, stats and can be attached to any program - * type except themselves. When extension program is - * replacing XDP function it is necessary to allow - * performance analysis of all functions. 
Both original - * XDP program and its program extension. Hence - * attaching fentry/fexit to BPF_PROG_TYPE_EXT is - * allowed. If extending of fentry/fexit was allowed it - * would be possible to create long call chain - * fentry->extension->fentry->extension beyond - * reasonable stack size. Hence extending fentry is not - * allowed. + * type. When extension program is replacing XDP function + * it is necessary to allow performance analysis of all + * functions. Both original XDP program and its program + * extension. Hence attaching fentry/fexit to + * BPF_PROG_TYPE_EXT is allowed. If extending of + * fentry/fexit was allowed it would be possible to create + * long call chain fentry->extension->fentry->extension + * beyond reasonable stack size. Hence extending fentry + * is not allowed. */ bpf_log(log, "Cannot extend fentry/fexit\n"); return -EINVAL; -- cgit From 715d82ba636cb3629a6e18a33bb9dbe53f9936ee Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 3 Jan 2024 20:05:46 +0100 Subject: bpf: Fix re-attachment branch in bpf_tracing_prog_attach The following case can cause a crash due to missing attach_btf: 1) load rawtp program 2) load fentry program with rawtp as target_fd 3) create tracing link for fentry program with target_fd = 0 4) repeat 3 In the end we have: - prog->aux->dst_trampoline == NULL - tgt_prog == NULL (because we did not provide target_fd to link_create) - prog->aux->attach_btf == NULL (the program was loaded with attach_prog_fd=X) - the program was loaded for tgt_prog but we have no way to find out which one BUG: kernel NULL pointer dereference, address: 0000000000000058 Call Trace: ? __die+0x20/0x70 ? page_fault_oops+0x15b/0x430 ? fixup_exception+0x22/0x330 ? exc_page_fault+0x6f/0x170 ? asm_exc_page_fault+0x22/0x30 ? bpf_tracing_prog_attach+0x279/0x560 ? btf_obj_id+0x5/0x10 bpf_tracing_prog_attach+0x439/0x560 __sys_bpf+0x1cf4/0x2de0 __x64_sys_bpf+0x1c/0x30 do_syscall_64+0x41/0xf0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Return -EINVAL in this situation. Fixes: f3a95075549e0 ("bpf: Allow trampoline re-attach for tracing and lsm programs") Cc: stable@vger.kernel.org Signed-off-by: Jiri Olsa Acked-by: Jiri Olsa Acked-by: Song Liu Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Link: https://lore.kernel.org/r/20240103190559.14750-4-9erthalion6@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d15f9998bc20..a1f18681721c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3237,6 +3237,10 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, * * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program * was detached and is going for re-attachment. + * + * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf + * are NULL, then program was already attached and user did not provide + * tgt_prog_fd so we have no way to find out or create trampoline */ if (!prog->aux->dst_trampoline && !tgt_prog) { /* @@ -3250,6 +3254,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, err = -EINVAL; goto out_unlock; } + /* We can allow re-attach only if we have valid attach_btf. 
*/ + if (!prog->aux->attach_btf) { + err = -EINVAL; + goto out_unlock; + } btf_id = prog->aux->attach_btf_id; key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); } -- cgit From 61dd3f246b3adaabff3241c586f2210ac91b05a4 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 27 Dec 2023 14:12:02 +0000 Subject: mm/mglru: add CONFIG_LRU_GEN_WALKS_MMU Add CONFIG_LRU_GEN_WALKS_MMU such that if disabled, the code that walks page tables to promote pages into the youngest generation will not be built. Also improves code readability by adding two helper functions get_mm_state() and get_next_mm(). Link: https://lkml.kernel.org/r/20231227141205.2200125-3-kinseyho@google.com Signed-off-by: Kinsey Ho Co-developed-by: Aneesh Kumar K.V Signed-off-by: Aneesh Kumar K.V Tested-by: Donet Tom Acked-by: Yu Zhao Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- include/linux/mm_types.h | 12 ++- include/linux/mmzone.h | 2 + kernel/fork.c | 2 +- mm/Kconfig | 4 + mm/vmscan.c | 192 +++++++++++++++++++++++++++++---------------- 6 files changed, 139 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5de775e6cdd9..20ff87f8e001 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -330,7 +330,7 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif -#ifdef CONFIG_LRU_GEN +#ifdef CONFIG_LRU_GEN_WALKS_MMU /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a66534c78c4d..552fa2d11c57 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -958,7 +958,7 @@ struct mm_struct { */ unsigned long ksm_zero_pages; #endif /* CONFIG_KSM */ -#ifdef CONFIG_LRU_GEN +#ifdef CONFIG_LRU_GEN_WALKS_MMU struct { /* this mm_struct is on lru_gen_mm_list */ struct list_head list; @@ -973,7 +973,7 @@ struct mm_struct { struct mem_cgroup *memcg; #endif } lru_gen; -#endif /* CONFIG_LRU_GEN */ +#endif /* CONFIG_LRU_GEN_WALKS_MMU */ } __randomize_layout; /* @@ -1011,6 +1011,10 @@ struct lru_gen_mm_list { spinlock_t lock; }; +#endif /* CONFIG_LRU_GEN */ + +#ifdef CONFIG_LRU_GEN_WALKS_MMU + void lru_gen_add_mm(struct mm_struct *mm); void lru_gen_del_mm(struct mm_struct *mm); #ifdef CONFIG_MEMCG @@ -1036,7 +1040,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm) WRITE_ONCE(mm->lru_gen.bitmap, -1); } -#else /* !CONFIG_LRU_GEN */ +#else /* !CONFIG_LRU_GEN_WALKS_MMU */ static inline void lru_gen_add_mm(struct mm_struct *mm) { @@ -1060,7 +1064,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm) { } -#endif /* CONFIG_LRU_GEN */ +#endif /* CONFIG_LRU_GEN_WALKS_MMU */ struct vma_iterator { struct ma_state mas; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2efd3be484fd..bc3f63ec4291 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -640,9 +640,11 @@ struct lruvec { #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_folio lrugen; +#ifdef CONFIG_LRU_GEN_WALKS_MMU /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif +#endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif diff --git a/kernel/fork.c b/kernel/fork.c index 93924392a5c3..56cf276432c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2946,7 +2946,7 @@ pid_t kernel_clone(struct kernel_clone_args *args) get_task_struct(p); } - if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & 
CLONE_VM)) { + if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) { /* lock the task to synchronize with memcg migration */ task_lock(p); lru_gen_add_mm(p->mm); diff --git a/mm/Kconfig b/mm/Kconfig index b072664b889a..79d563d8f9e0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1274,6 +1274,10 @@ config LRU_GEN_STATS from evicted generations for debugging purpose. This option has a per-memcg and per-node memory overhead. + +config LRU_GEN_WALKS_MMU + def_bool y + depends on LRU_GEN && ARCH_HAS_HW_PTE_YOUNG # } config ARCH_SUPPORTS_PER_VMA_LOCK diff --git a/mm/vmscan.c b/mm/vmscan.c index b4ca3563bcf4..aa7ea09ffb4c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2671,13 +2671,14 @@ static void get_item_key(void *item, int *key) key[1] = hash >> BLOOM_FILTER_SHIFT; } -static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, + void *item) { int key[2]; unsigned long *filter; int gen = filter_gen_from_seq(seq); - filter = READ_ONCE(lruvec->mm_state.filters[gen]); + filter = READ_ONCE(mm_state->filters[gen]); if (!filter) return true; @@ -2686,13 +2687,14 @@ static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *it return test_bit(key[0], filter) && test_bit(key[1], filter); } -static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, + void *item) { int key[2]; unsigned long *filter; int gen = filter_gen_from_seq(seq); - filter = READ_ONCE(lruvec->mm_state.filters[gen]); + filter = READ_ONCE(mm_state->filters[gen]); if (!filter) return; @@ -2704,12 +2706,12 @@ static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void * set_bit(key[1], filter); } -static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq) { unsigned long *filter; int gen = filter_gen_from_seq(seq); - filter = lruvec->mm_state.filters[gen]; + filter = mm_state->filters[gen]; if (filter) { bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); return; @@ -2717,13 +2719,15 @@ static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); - WRITE_ONCE(lruvec->mm_state.filters[gen], filter); + WRITE_ONCE(mm_state->filters[gen], filter); } /****************************************************************************** * mm_struct list ******************************************************************************/ +#ifdef CONFIG_LRU_GEN_WALKS_MMU + static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) { static struct lru_gen_mm_list mm_list = { @@ -2740,6 +2744,29 @@ static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) return &mm_list; } +static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) +{ + return &lruvec->mm_state; +} + +static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) +{ + int key; + struct mm_struct *mm; + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); + + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); + + if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) + return NULL; + + clear_bit(key, &mm->lru_gen.bitmap); + + return 
mmget_not_zero(mm) ? mm : NULL; +} + void lru_gen_add_mm(struct mm_struct *mm) { int nid; @@ -2755,10 +2782,11 @@ void lru_gen_add_mm(struct mm_struct *mm) for_each_node_state(nid, N_MEMORY) { struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* the first addition since the last iteration */ - if (lruvec->mm_state.tail == &mm_list->fifo) - lruvec->mm_state.tail = &mm->lru_gen.list; + if (mm_state->tail == &mm_list->fifo) + mm_state->tail = &mm->lru_gen.list; } list_add_tail(&mm->lru_gen.list, &mm_list->fifo); @@ -2784,14 +2812,15 @@ void lru_gen_del_mm(struct mm_struct *mm) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* where the current iteration continues after */ - if (lruvec->mm_state.head == &mm->lru_gen.list) - lruvec->mm_state.head = lruvec->mm_state.head->prev; + if (mm_state->head == &mm->lru_gen.list) + mm_state->head = mm_state->head->prev; /* where the last iteration ended before */ - if (lruvec->mm_state.tail == &mm->lru_gen.list) - lruvec->mm_state.tail = lruvec->mm_state.tail->next; + if (mm_state->tail == &mm->lru_gen.list) + mm_state->tail = mm_state->tail->next; } list_del_init(&mm->lru_gen.list); @@ -2834,10 +2863,30 @@ void lru_gen_migrate_mm(struct mm_struct *mm) } #endif +#else /* !CONFIG_LRU_GEN_WALKS_MMU */ + +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ + return NULL; +} + +static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) +{ + return NULL; +} + +static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) +{ + return NULL; +} + +#endif + static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) { int i; int hist; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); @@ -2845,44 +2894,20 @@ static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, hist = lru_hist_from_seq(walk->max_seq); for (i = 0; i < NR_MM_STATS; i++) { - WRITE_ONCE(lruvec->mm_state.stats[hist][i], - lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); + WRITE_ONCE(mm_state->stats[hist][i], + mm_state->stats[hist][i] + walk->mm_stats[i]); walk->mm_stats[i] = 0; } } if (NR_HIST_GENS > 1 && last) { - hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); + hist = lru_hist_from_seq(mm_state->seq + 1); for (i = 0; i < NR_MM_STATS; i++) - WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); + WRITE_ONCE(mm_state->stats[hist][i], 0); } } -static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) -{ - int type; - unsigned long size = 0; - struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); - int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); - - if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) - return true; - - clear_bit(key, &mm->lru_gen.bitmap); - - for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { - size += type ? 
get_mm_counter(mm, MM_FILEPAGES) : - get_mm_counter(mm, MM_ANONPAGES) + - get_mm_counter(mm, MM_SHMEMPAGES); - } - - if (size < MIN_LRU_BATCH) - return true; - - return !mmget_not_zero(mm); -} - static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, struct mm_struct **iter) { @@ -2891,7 +2916,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, struct mm_struct *mm = NULL; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* * mm_state->seq is incremented after each iteration of mm_list. There @@ -2929,11 +2954,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, mm_state->tail = mm_state->head->next; walk->force_scan = true; } - - mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); - if (should_skip_mm(mm, walk)) - mm = NULL; - } while (!mm); + } while (!(mm = get_next_mm(walk))); done: if (*iter || last) reset_mm_stats(lruvec, walk, last); @@ -2941,7 +2962,7 @@ done: spin_unlock(&mm_list->lock); if (mm && first) - reset_bloom_filter(lruvec, walk->max_seq + 1); + reset_bloom_filter(mm_state, walk->max_seq + 1); if (*iter) mmput_async(*iter); @@ -2956,7 +2977,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) bool success = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); spin_lock(&mm_list->lock); @@ -3469,6 +3490,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, DECLARE_BITMAP(bitmap, MIN_LRU_BATCH); unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; + struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -3520,7 +3542,7 @@ restart: walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } - if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) + if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i)) continue; walk->mm_stats[MM_NONLEAF_FOUND]++; @@ -3531,7 +3553,7 @@ restart: walk->mm_stats[MM_NONLEAF_ADDED]++; /* carry over to the next generation */ - update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); + update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i); } walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); @@ -3738,16 +3760,25 @@ next: return success; } -static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) +static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + bool can_swap, bool force_scan) { + bool success; int prev, next; int type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; restart: + if (max_seq < READ_ONCE(lrugen->max_seq)) + return false; + spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + success = max_seq == lrugen->max_seq; + if (!success) + goto unlock; + for (type = ANON_AND_FILE - 1; type >= 0; type--) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) continue; @@ -3791,8 +3822,10 @@ restart: WRITE_ONCE(lrugen->timestamps[next], jiffies); /* make sure preceding modifications appear */ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); - +unlock: spin_unlock_irq(&lruvec->lru_lock); + + return success; } static bool 
try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, @@ -3802,14 +3835,16 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, struct lru_gen_mm_walk *walk; struct mm_struct *mm = NULL; struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + if (!mm_state) + return inc_max_seq(lruvec, max_seq, can_swap, force_scan); + /* see the comment in iterate_mm_list() */ - if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { - success = false; - goto done; - } + if (max_seq <= READ_ONCE(mm_state->seq)) + return false; /* * If the hardware doesn't automatically set the accessed bit, fallback @@ -3839,8 +3874,10 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, walk_mm(lruvec, mm, walk); } while (mm); done: - if (success) - inc_max_seq(lruvec, can_swap, force_scan); + if (success) { + success = inc_max_seq(lruvec, max_seq, can_swap, force_scan); + WARN_ON_ONCE(!success); + } return success; } @@ -3964,6 +4001,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); DEFINE_MAX_SEQ(lruvec); int old_gen, new_gen = lru_gen_from_seq(max_seq); @@ -4042,8 +4080,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) mem_cgroup_unlock_pages(); /* feedback from rmap walkers to page table walkers */ - if (suitable_to_scan(i, young)) - update_bloom_filter(lruvec, max_seq, pvmw->pmd); + if (mm_state && suitable_to_scan(i, young)) + update_bloom_filter(mm_state, max_seq, pvmw->pmd); } /****************************************************************************** @@ -5219,6 +5257,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, int type, tier; int hist = lru_hist_from_seq(seq); struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); @@ -5244,6 +5283,9 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, seq_putc(m, '\n'); } + if (!mm_state) + return; + seq_puts(m, " "); for (i = 0; i < NR_MM_STATS; i++) { const char *s = " "; @@ -5251,10 +5293,10 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, if (seq == max_seq && NR_HIST_GENS == 1) { s = "LOYNFA"; - n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + n = READ_ONCE(mm_state->stats[hist][i]); } else if (seq != max_seq && NR_HIST_GENS > 1) { s = "loynfa"; - n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + n = READ_ONCE(mm_state->stats[hist][i]); } seq_printf(m, " %10lu%c", n, s[i]); @@ -5523,6 +5565,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) int i; int gen, type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); @@ -5533,7 +5576,8 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) for_each_gen_type_zone(gen, type, zone) INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); - lruvec->mm_state.seq = MIN_NR_GENS; + if (mm_state) + mm_state->seq = MIN_NR_GENS; } #ifdef CONFIG_MEMCG @@ -5552,28 +5596,38 @@ void lru_gen_init_pgdat(struct pglist_data *pgdat) void lru_gen_init_memcg(struct mem_cgroup *memcg) { - 
INIT_LIST_HEAD(&memcg->mm_list.fifo); - spin_lock_init(&memcg->mm_list.lock); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + if (!mm_list) + return; + + INIT_LIST_HEAD(&mm_list->fifo); + spin_lock_init(&mm_list->lock); } void lru_gen_exit_memcg(struct mem_cgroup *memcg) { int i; int nid; + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); + VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo)); for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, sizeof(lruvec->lrugen.nr_pages))); lruvec->lrugen.list.next = LIST_POISON1; + if (!mm_state) + continue; + for (i = 0; i < NR_BLOOM_FILTERS; i++) { - bitmap_free(lruvec->mm_state.filters[i]); - lruvec->mm_state.filters[i] = NULL; + bitmap_free(mm_state->filters[i]); + mm_state->filters[i] = NULL; } } } -- cgit From 6dff315972640bfe542e2d044933751afd8e6c4a Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Tue, 2 Jan 2024 22:49:05 +0800 Subject: crash_core: fix and simplify the logic of crash_exclude_mem_range() The purpose of crash_exclude_mem_range() is to remove all memory ranges that overlap with [mstart-mend]. However, the current logic only removes the first overlapping memory range. Commit a2e9a95d2190 ("kexec: Improve & fix crash_exclude_mem_range() to handle overlapping ranges") attempted to address this issue, but it did not fix all error cases. Let's fix and simplify the logic of crash_exclude_mem_range(). Link: https://lkml.kernel.org/r/20240102144905.110047-4-ytcoode@gmail.com Signed-off-by: Yuntao Wang Acked-by: Baoquan He Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Dave Young Cc: Hari Bathini Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Sourabh Jain Cc: Takashi Iwai Cc: Thomas Gleixner Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_core.c | 80 +++++++++++++++++++---------------------------------- 1 file changed, 29 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 6f074e112c1e..62e0227d390e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -566,9 +566,8 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend) { - int i, j; + int i; unsigned long long start, end, p_start, p_end; - struct range temp_range = {0, 0}; for (i = 0; i < mem->nr_ranges; i++) { start = mem->ranges[i].start; @@ -576,72 +575,51 @@ int crash_exclude_mem_range(struct crash_mem *mem, p_start = mstart; p_end = mend; - if (mstart > end || mend < start) + if (p_start > end) continue; + /* + * Because the memory ranges in mem->ranges are stored in + * ascending order, when we detect `p_end < start`, we can + * immediately exit the for loop, as the subsequent memory + * ranges will definitely be outside the range we are looking + * for. 
+ */ + if (p_end < start) + break; + /* Truncate any area outside of range */ - if (mstart < start) + if (p_start < start) p_start = start; - if (mend > end) + if (p_end > end) p_end = end; /* Found completely overlapping range */ if (p_start == start && p_end == end) { - mem->ranges[i].start = 0; - mem->ranges[i].end = 0; - if (i < mem->nr_ranges - 1) { - /* Shift rest of the ranges to left */ - for (j = i; j < mem->nr_ranges - 1; j++) { - mem->ranges[j].start = - mem->ranges[j+1].start; - mem->ranges[j].end = - mem->ranges[j+1].end; - } - - /* - * Continue to check if there are another overlapping ranges - * from the current position because of shifting the above - * mem ranges. - */ - i--; - mem->nr_ranges--; - continue; - } + memmove(&mem->ranges[i], &mem->ranges[i + 1], + (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i])); + i--; mem->nr_ranges--; - return 0; - } - - if (p_start > start && p_end < end) { + } else if (p_start > start && p_end < end) { /* Split original range */ + if (mem->nr_ranges >= mem->max_nr_ranges) + return -ENOMEM; + + memmove(&mem->ranges[i + 2], &mem->ranges[i + 1], + (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i])); + mem->ranges[i].end = p_start - 1; - temp_range.start = p_end + 1; - temp_range.end = end; + mem->ranges[i + 1].start = p_end + 1; + mem->ranges[i + 1].end = end; + + i++; + mem->nr_ranges++; } else if (p_start != start) mem->ranges[i].end = p_start - 1; else mem->ranges[i].start = p_end + 1; - break; - } - - /* If a split happened, add the split to array */ - if (!temp_range.end) - return 0; - - /* Split happened */ - if (i == mem->max_nr_ranges - 1) - return -ENOMEM; - - /* Location where new range should go */ - j = i + 1; - if (j < mem->nr_ranges) { - /* Move over all ranges one slot towards the end */ - for (i = mem->nr_ranges - 1; i >= j; i--) - mem->ranges[i + 1] = mem->ranges[i]; } - mem->ranges[j].start = temp_range.start; - mem->ranges[j].end = temp_range.end; - mem->nr_ranges++; return 0; } -- cgit From 4f1991a92cfe89096b2d1f5583a2e093bdd55c37 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 7 Jan 2024 20:32:58 -0500 Subject: tracing histograms: Simplify parse_actions() function The parse_actions() function uses 'len = str_has_prefix()' to test which action is in the string being parsed. But then it goes and repeats the logic for each different action. This logic can be simplified and duplicate code can be removed as 'len' contains the length of the found prefix which should be used for all actions. 
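A minimal userspace sketch of the pattern the rewrite converges on (the stand-in reimplements the kernel's str_has_prefix() semantics for illustration): str_has_prefix() returns the length of the matched prefix, so a single 'len' locates the action body for every handler.

#include <stdio.h>
#include <string.h>

/* Userspace stand-in for the kernel's str_has_prefix(): returns the
 * prefix length on a match, 0 otherwise.
 */
static size_t str_has_prefix(const char *str, const char *prefix)
{
	size_t len = strlen(prefix);

	return strncmp(str, prefix, len) == 0 ? len : 0;
}

int main(void)
{
	const char *str = "onmax(common_timestamp).save(pid)";
	size_t len = str_has_prefix(str, "onmax(");

	if (len)	/* one 'len' works for any matched action prefix */
		printf("action body: %s\n", str + len);
	return 0;
}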
Link: https://lore.kernel.org/all/20240107112044.6702cb66@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20240107203258.37e26d2b@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Andy Shevchenko Cc: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 49 ++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5ecf3c8bde20..6ece1308d36a 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4805,36 +4805,35 @@ static int parse_actions(struct hist_trigger_data *hist_data) int len; for (i = 0; i < hist_data->attrs->n_actions; i++) { + enum handler_id hid = 0; + char *action_str; + str = hist_data->attrs->action_str[i]; - if ((len = str_has_prefix(str, "onmatch("))) { - char *action_str = str + len; + if ((len = str_has_prefix(str, "onmatch("))) + hid = HANDLER_ONMATCH; + else if ((len = str_has_prefix(str, "onmax("))) + hid = HANDLER_ONMAX; + else if ((len = str_has_prefix(str, "onchange("))) + hid = HANDLER_ONCHANGE; - data = onmatch_parse(tr, action_str); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } - } else if ((len = str_has_prefix(str, "onmax("))) { - char *action_str = str + len; + action_str = str + len; - data = track_data_parse(hist_data, action_str, - HANDLER_ONMAX); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } - } else if ((len = str_has_prefix(str, "onchange("))) { - char *action_str = str + len; + switch (hid) { + case HANDLER_ONMATCH: + data = onmatch_parse(tr, action_str); + break; + case HANDLER_ONMAX: + case HANDLER_ONCHANGE: + data = track_data_parse(hist_data, action_str, hid); + break; + default: + data = ERR_PTR(-EINVAL); + break; + } - data = track_data_parse(hist_data, action_str, - HANDLER_ONCHANGE); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } - } else { - ret = -EINVAL; + if (IS_ERR(data)) { + ret = PTR_ERR(data); break; } -- cgit From fd37721803c6e73619108f76ad2e12a9aa5fafaf Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 28 Dec 2023 17:47:03 +0300 Subject: mm, treewide: introduce NR_PAGE_ORDERS NR_PAGE_ORDERS defines the number of page orders supported by the page allocator, ranging from 0 to MAX_ORDER, MAX_ORDER + 1 in total. NR_PAGE_ORDERS assists in defining arrays of page orders and allows for more natural iteration over them. [kirill.shutemov@linux.intel.com: fixup for kerneldoc warning] Link: https://lkml.kernel.org/r/20240101111512.7empzyifq7kxtzk3@box Link: https://lkml.kernel.org/r/20231228144704.14033-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Reviewed-by: Zi Yan Cc: Linus Torvalds Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 6 +++--- arch/arm64/kvm/hyp/include/nvhe/gfp.h | 2 +- arch/sparc/kernel/traps_64.c | 2 +- drivers/gpu/drm/ttm/tests/ttm_device_test.c | 2 +- drivers/gpu/drm/ttm/ttm_pool.c | 20 ++++++++++---------- include/drm/ttm/ttm_pool.h | 2 +- include/linux/mmzone.h | 6 ++++-- kernel/crash_core.c | 2 +- lib/test_meminit.c | 2 +- mm/compaction.c | 2 +- mm/kmsan/init.c | 2 +- mm/page_alloc.c | 13 ++++++------- mm/page_reporting.c | 2 +- mm/show_mem.c | 8 ++++---- mm/vmstat.c | 12 ++++++------ 15 files changed, 42 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 78e4d2e7ba14..3f8769e46b07 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -172,7 +172,7 @@ variables. Offset of the free_list's member. This value is used to compute the number of free pages. -Each zone has a free_area structure array called free_area[MAX_ORDER + 1]. +Each zone has a free_area structure array called free_area[NR_PAGE_ORDERS]. The free_list represents a linked list of free page blocks. (list_head, next|prev) @@ -189,8 +189,8 @@ Offsets of the vmap_area's members. They carry vmalloc-specific information. Makedumpfile gets the start address of the vmalloc region from this. -(zone.free_area, MAX_ORDER + 1) -------------------------------- +(zone.free_area, NR_PAGE_ORDERS) +-------------------------------- Free areas descriptor. User-space tools use this value to iterate the free_area ranges. MAX_ORDER is used by the zone buddy allocator. diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index fe5472a184a3..97c527ef53c2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -16,7 +16,7 @@ struct hyp_pool { * API at EL2. */ hyp_spinlock_t lock; - struct list_head free_area[MAX_ORDER + 1]; + struct list_head free_area[NR_PAGE_ORDERS]; phys_addr_t range_start; phys_addr_t range_end; unsigned short max_order; diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 08ffd17d5ec3..523a6e5ee925 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -897,7 +897,7 @@ void __init cheetah_ecache_flush_init(void) /* Now allocate error trap reporting scoreboard. 
*/ sz = NR_CPUS * (2 * sizeof(struct cheetah_err_info)); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { if ((PAGE_SIZE << order) >= sz) break; } diff --git a/drivers/gpu/drm/ttm/tests/ttm_device_test.c b/drivers/gpu/drm/ttm/tests/ttm_device_test.c index b1b423b68cdf..19eaff22e6ae 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_device_test.c +++ b/drivers/gpu/drm/ttm/tests/ttm_device_test.c @@ -175,7 +175,7 @@ static void ttm_device_init_pools(struct kunit *test) if (params->pools_init_expected) { for (int i = 0; i < TTM_NUM_CACHING_TYPES; ++i) { - for (int j = 0; j <= MAX_ORDER; ++j) { + for (int j = 0; j < NR_PAGE_ORDERS; ++j) { pt = pool->caching[i].orders[j]; KUNIT_EXPECT_PTR_EQ(test, pt.pool, pool); KUNIT_EXPECT_EQ(test, pt.caching, i); diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index fe610a3cace0..d183bb97c526 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -65,11 +65,11 @@ module_param(page_pool_size, ulong, 0644); static atomic_long_t allocated_pages; -static struct ttm_pool_type global_write_combined[MAX_ORDER + 1]; -static struct ttm_pool_type global_uncached[MAX_ORDER + 1]; +static struct ttm_pool_type global_write_combined[NR_PAGE_ORDERS]; +static struct ttm_pool_type global_uncached[NR_PAGE_ORDERS]; -static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER + 1]; -static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1]; +static struct ttm_pool_type global_dma32_write_combined[NR_PAGE_ORDERS]; +static struct ttm_pool_type global_dma32_uncached[NR_PAGE_ORDERS]; static spinlock_t shrinker_lock; static struct list_head shrinker_list; @@ -568,7 +568,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev, if (use_dma_alloc || nid != NUMA_NO_NODE) { for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j <= MAX_ORDER; ++j) + for (j = 0; j < NR_PAGE_ORDERS; ++j) ttm_pool_type_init(&pool->caching[i].orders[j], pool, i, j); } @@ -601,7 +601,7 @@ void ttm_pool_fini(struct ttm_pool *pool) if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) { for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j <= MAX_ORDER; ++j) + for (j = 0; j < NR_PAGE_ORDERS; ++j) ttm_pool_type_fini(&pool->caching[i].orders[j]); } @@ -656,7 +656,7 @@ static void ttm_pool_debugfs_header(struct seq_file *m) unsigned int i; seq_puts(m, "\t "); - for (i = 0; i <= MAX_ORDER; ++i) + for (i = 0; i < NR_PAGE_ORDERS; ++i) seq_printf(m, " ---%2u---", i); seq_puts(m, "\n"); } @@ -667,7 +667,7 @@ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt, { unsigned int i; - for (i = 0; i <= MAX_ORDER; ++i) + for (i = 0; i < NR_PAGE_ORDERS; ++i) seq_printf(m, " %8u", ttm_pool_type_count(&pt[i])); seq_puts(m, "\n"); } @@ -776,7 +776,7 @@ int ttm_pool_mgr_init(unsigned long num_pages) spin_lock_init(&shrinker_lock); INIT_LIST_HEAD(&shrinker_list); - for (i = 0; i <= MAX_ORDER; ++i) { + for (i = 0; i < NR_PAGE_ORDERS; ++i) { ttm_pool_type_init(&global_write_combined[i], NULL, ttm_write_combined, i); ttm_pool_type_init(&global_uncached[i], NULL, ttm_uncached, i); @@ -816,7 +816,7 @@ void ttm_pool_mgr_fini(void) { unsigned int i; - for (i = 0; i <= MAX_ORDER; ++i) { + for (i = 0; i < NR_PAGE_ORDERS; ++i) { ttm_pool_type_fini(&global_write_combined[i]); ttm_pool_type_fini(&global_uncached[i]); diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h index 30a347e5aa11..4490d43c63e3 100644 --- a/include/drm/ttm/ttm_pool.h +++ b/include/drm/ttm/ttm_pool.h @@ -74,7 
+74,7 @@ struct ttm_pool { bool use_dma32; struct { - struct ttm_pool_type orders[MAX_ORDER + 1]; + struct ttm_pool_type orders[NR_PAGE_ORDERS]; } caching[TTM_NUM_CACHING_TYPES]; }; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c18c53353b50..1ea7636dfb76 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -35,6 +35,8 @@ #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) +#define NR_PAGE_ORDERS (MAX_ORDER + 1) + /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should @@ -96,7 +98,7 @@ static inline bool migratetype_is_mergeable(int mt) } #define for_each_migratetype_order(order, type) \ - for (order = 0; order <= MAX_ORDER; order++) \ + for (order = 0; order < NR_PAGE_ORDERS; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; @@ -933,7 +935,7 @@ struct zone { CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ - struct free_area free_area[MAX_ORDER + 1]; + struct free_area free_area[NR_PAGE_ORDERS]; #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_ORDER */ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index d4313b53837e..56cf4ad7abbb 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -802,7 +802,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(list_head, prev); VMCOREINFO_OFFSET(vmap_area, va_start); VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1); + VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); diff --git a/lib/test_meminit.c b/lib/test_meminit.c index 0ae35223d773..0dc173849a54 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -93,7 +93,7 @@ static int __init test_pages(int *total_failures) int failures = 0, num_tests = 0; int i; - for (i = 0; i <= MAX_ORDER; i++) + for (i = 0; i < NR_PAGE_ORDERS; i++) num_tests += do_alloc_pages_order(i, &failures); REPORT_FAILURES_IN_FN(); diff --git a/mm/compaction.c b/mm/compaction.c index de15a2ef0af5..24f8eb4d6260 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2229,7 +2229,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) /* Direct compactor: Is a suitable page free? */ ret = COMPACT_NO_SUITABLE_PAGE; - for (order = cc->order; order <= MAX_ORDER; order++) { + for (order = cc->order; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &cc->zone->free_area[order]; bool can_steal; diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c index ffedf4dbc49d..103e2e88ea03 100644 --- a/mm/kmsan/init.c +++ b/mm/kmsan/init.c @@ -96,7 +96,7 @@ void __init kmsan_init_shadow(void) struct metadata_page_pair { struct page *shadow, *origin; }; -static struct metadata_page_pair held_back[MAX_ORDER + 1] __initdata; +static struct metadata_page_pair held_back[NR_PAGE_ORDERS] __initdata; /* * Eager metadata allocation. 
When the memblock allocator is freeing pages to diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5526797b7f96..ccecf6158ae4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1560,7 +1560,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, struct page *page; /* Find a page of the appropriate size in the preferred list */ - for (current_order = order; current_order <= MAX_ORDER; ++current_order) { + for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) { area = &(zone->free_area[current_order]); page = get_page_from_free_area(area, migratetype); if (!page) @@ -1934,7 +1934,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, continue; spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]); page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); @@ -2044,8 +2044,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, return false; find_smallest: - for (current_order = order; current_order <= MAX_ORDER; - current_order++) { + for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, start_migratetype, false, &can_steal); @@ -3000,7 +2999,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; /* For a high-order request, check at least one suitable page is free */ - for (o = order; o <= MAX_ORDER; o++) { + for (o = order; o < NR_PAGE_ORDERS; o++) { struct free_area *area = &z->free_area[o]; int mt; @@ -6628,7 +6627,7 @@ bool is_free_buddy_page(struct page *page) unsigned long pfn = page_to_pfn(page); unsigned int order; - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); if (PageBuddy(page_head) && @@ -6683,7 +6682,7 @@ bool take_page_off_buddy(struct page *page) bool ret = false; spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); int page_order = buddy_order(page_head); diff --git a/mm/page_reporting.c b/mm/page_reporting.c index b021f482a4cb..66369cc5279b 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -276,7 +276,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev, return err; /* Process each free list starting from lowest order/mt */ - for (order = page_reporting_order; order <= MAX_ORDER; order++) { + for (order = page_reporting_order; order < NR_PAGE_ORDERS; order++) { for (mt = 0; mt < MIGRATE_TYPES; mt++) { /* We do not pull pages from the isolate free list */ if (is_migrate_isolate(mt)) diff --git a/mm/show_mem.c b/mm/show_mem.c index ba0808d6917f..8dcfafbd283c 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -352,8 +352,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z for_each_populated_zone(zone) { unsigned int order; - unsigned long nr[MAX_ORDER + 1], flags, total = 0; - unsigned char types[MAX_ORDER + 1]; + unsigned long nr[NR_PAGE_ORDERS], flags, total = 0; + unsigned char types[NR_PAGE_ORDERS]; if (zone_idx(zone) > max_zone_idx) continue; @@ -363,7 +363,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z printk(KERN_CONT 
"%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &zone->free_area[order]; int type; @@ -377,7 +377,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z } } spin_unlock_irqrestore(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { printk(KERN_CONT "%lu*%lukB ", nr[order], K(1UL) << order); if (nr[order]) diff --git a/mm/vmstat.c b/mm/vmstat.c index 1437ca2f28c5..03ead31c46a0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1059,7 +1059,7 @@ static void fill_contig_page_info(struct zone *zone, info->free_blocks_total = 0; info->free_blocks_suitable = 0; - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { unsigned long blocks; /* @@ -1476,7 +1476,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, int order; seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order <= MAX_ORDER; ++order) + for (order = 0; order < NR_PAGE_ORDERS; ++order) /* * Access to nr_free is lockless as nr_free is used only for * printing purposes. Use data_race to avoid KCSAN warning. @@ -1505,7 +1505,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, pgdat->node_id, zone->name, migratetype_names[mtype]); - for (order = 0; order <= MAX_ORDER; ++order) { + for (order = 0; order < NR_PAGE_ORDERS; ++order) { unsigned long freecount = 0; struct free_area *area; struct list_head *curr; @@ -1545,7 +1545,7 @@ static void pagetypeinfo_showfree(struct seq_file *m, void *arg) /* Print header */ seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); - for (order = 0; order <= MAX_ORDER; ++order) + for (order = 0; order < NR_PAGE_ORDERS; ++order) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); @@ -2181,7 +2181,7 @@ static void unusable_show_print(struct seq_file *m, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order <= MAX_ORDER; ++order) { + for (order = 0; order < NR_PAGE_ORDERS; ++order) { fill_contig_page_info(zone, order, &info); index = unusable_free_index(order, &info); seq_printf(m, "%d.%03d ", index / 1000, index % 1000); @@ -2233,7 +2233,7 @@ static void extfrag_show_print(struct seq_file *m, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order <= MAX_ORDER; ++order) { + for (order = 0; order < NR_PAGE_ORDERS; ++order) { fill_contig_page_info(zone, order, &info); index = __fragmentation_index(order, &info); seq_printf(m, "%2d.%03d ", index / 1000, index % 1000); -- cgit From 5e0a760b44417f7cadd79de2204d6247109558a0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 28 Dec 2023 17:47:04 +0300 Subject: mm, treewide: rename MAX_ORDER to MAX_PAGE_ORDER commit 23baf831a32c ("mm, treewide: redefine MAX_ORDER sanely") has changed the definition of MAX_ORDER to be inclusive. This has caused issues with code that was not yet upstream and depended on the previous definition. To draw attention to the altered meaning of the define, rename MAX_ORDER to MAX_PAGE_ORDER. Link: https://lkml.kernel.org/r/20231228144704.14033-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Cc: Linus Torvalds Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 2 +- Documentation/admin-guide/kernel-parameters.txt | 24 ++++++++++++------------ Documentation/networking/packet_mmap.rst | 14 +++++++------- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 20 ++++++++++---------- arch/arm64/include/asm/sparsemem.h | 2 +- arch/arm64/kvm/hyp/nvhe/page_alloc.c | 3 ++- arch/arm64/mm/hugetlbpage.c | 2 +- arch/m68k/Kconfig.cpu | 2 +- arch/nios2/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/powerpc/mm/book3s64/iommu_api.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- arch/sh/mm/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/sparc/kernel/pci_sun4v.c | 2 +- arch/sparc/mm/tsb.c | 4 ++-- arch/um/kernel/um_arch.c | 4 ++-- arch/xtensa/Kconfig | 2 +- drivers/accel/qaic/qaic_data.c | 2 +- drivers/base/regmap/regmap-debugfs.c | 8 ++++---- drivers/block/floppy.c | 2 +- drivers/crypto/ccp/sev-dev.c | 2 +- drivers/crypto/hisilicon/sgl.c | 6 +++--- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +- drivers/gpu/drm/i915/gem/selftests/huge_pages.c | 2 +- drivers/gpu/drm/ttm/tests/ttm_pool_test.c | 8 ++++---- drivers/gpu/drm/ttm/ttm_pool.c | 2 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 +- drivers/iommu/dma-iommu.c | 2 +- drivers/irqchip/irq-gic-v3-its.c | 4 ++-- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-flakey.c | 2 +- drivers/misc/genwqe/card_dev.c | 2 +- drivers/misc/genwqe/card_utils.c | 4 ++-- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- drivers/net/ethernet/ibm/ibmvnic.h | 4 ++-- drivers/video/fbdev/hyperv_fb.c | 6 +++--- drivers/video/fbdev/vermilion/vermilion.c | 2 +- drivers/virtio/virtio_balloon.c | 2 +- drivers/virtio/virtio_mem.c | 8 ++++---- fs/ramfs/file-nommu.c | 2 +- include/linux/hugetlb.h | 2 +- include/linux/mmzone.h | 14 +++++++------- include/linux/pageblock-flags.h | 4 ++-- include/linux/slab.h | 4 ++-- kernel/dma/pool.c | 6 +++--- kernel/dma/swiotlb.c | 4 ++-- kernel/events/ring_buffer.c | 10 +++++----- mm/Kconfig | 6 +++--- mm/compaction.c | 4 ++-- mm/debug_page_alloc.c | 2 +- mm/debug_vm_pgtable.c | 4 ++-- mm/huge_memory.c | 2 +- mm/hugetlb.c | 4 ++-- mm/internal.h | 2 +- mm/kmsan/init.c | 6 +++--- mm/memblock.c | 7 ++++--- mm/memory_hotplug.c | 9 +++++---- mm/mm_init.c | 22 +++++++++++----------- mm/page_alloc.c | 24 ++++++++++++------------ mm/page_isolation.c | 17 +++++++++-------- mm/page_owner.c | 6 +++--- mm/page_reporting.c | 4 ++-- mm/shuffle.h | 2 +- mm/slab.c | 2 +- mm/slub.c | 4 ++-- mm/vmscan.c | 2 +- mm/vmstat.c | 2 +- net/smc/smc_ib.c | 2 +- security/integrity/ima/ima_crypto.c | 2 +- tools/perf/Documentation/perf-intel-pt.txt | 2 +- tools/testing/memblock/linux/mmzone.h | 6 +++--- tools/testing/selftests/mm/thuge-gen.c | 3 ++- 76 files changed, 186 insertions(+), 181 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 3f8769e46b07..bced9e4b6e08 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -193,7 +193,7 @@ from this. -------------------------------- Free areas descriptor. User-space tools use this value to iterate the -free_area ranges. MAX_ORDER is used by the zone buddy allocator. +free_area ranges. NR_PAGE_ORDERS is used by the zone buddy allocator. 
prb --- diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 65731b060e3f..8a01b8112f0b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -970,17 +970,17 @@ buddy allocator. Bigger value increase the probability of catching random memory corruption, but reduce the amount of memory for normal system use. The maximum - possible value is MAX_ORDER/2. Setting this parameter - to 1 or 2 should be enough to identify most random - memory corruption problems caused by bugs in kernel or - driver code when a CPU writes to (or reads from) a - random memory location. Note that there exists a class - of memory corruptions problems caused by buggy H/W or - F/W or by drivers badly programming DMA (basically when - memory is written at bus level and the CPU MMU is - bypassed) which are not detectable by - CONFIG_DEBUG_PAGEALLOC, hence this option will not help - tracking down these problems. + possible value is MAX_PAGE_ORDER/2. Setting this + parameter to 1 or 2 should be enough to identify most + random memory corruption problems caused by bugs in + kernel or driver code when a CPU writes to (or reads + from) a random memory location. Note that there exists + a class of memory corruptions problems caused by buggy + H/W or F/W or by drivers badly programming DMA + (basically when memory is written at bus level and the + CPU MMU is bypassed) which are not detectable by + CONFIG_DEBUG_PAGEALLOC, hence this option will not + help tracking down these problems. debug_pagealloc= [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter @@ -4136,7 +4136,7 @@ [KNL] Minimal page reporting order Format: Adjust the minimal page reporting order. The page - reporting is disabled when it exceeds MAX_ORDER. + reporting is disabled when it exceeds MAX_PAGE_ORDER. panic= [KNL] Kernel behaviour on panic: delay timeout > 0: seconds before rebooting diff --git a/Documentation/networking/packet_mmap.rst b/Documentation/networking/packet_mmap.rst index 30a3be3c48f3..dca15d15feaf 100644 --- a/Documentation/networking/packet_mmap.rst +++ b/Documentation/networking/packet_mmap.rst @@ -263,20 +263,20 @@ the name indicates, this function allocates pages of memory, and the second argument is "order" or a power of two number of pages, that is (for PAGE_SIZE == 4096) order=0 ==> 4096 bytes, order=1 ==> 8192 bytes, order=2 ==> 16384 bytes, etc. The maximum size of a -region allocated by __get_free_pages is determined by the MAX_ORDER macro. More -precisely the limit can be calculated as:: +region allocated by __get_free_pages is determined by the MAX_PAGE_ORDER macro. +More precisely the limit can be calculated as:: - PAGE_SIZE << MAX_ORDER + PAGE_SIZE << MAX_PAGE_ORDER In a i386 architecture PAGE_SIZE is 4096 bytes - In a 2.4/i386 kernel MAX_ORDER is 10 - In a 2.6/i386 kernel MAX_ORDER is 11 + In a 2.4/i386 kernel MAX_PAGE_ORDER is 10 + In a 2.6/i386 kernel MAX_PAGE_ORDER is 11 So get_free_pages can allocate as much as 4MB or 8MB in a 2.4/2.6 kernel respectively, with an i386 architecture. User space programs can include /usr/include/sys/user.h and -/usr/include/linux/mmzone.h to get PAGE_SIZE MAX_ORDER declarations. +/usr/include/linux/mmzone.h to get PAGE_SIZE MAX_PAGE_ORDER declarations. The pagesize can also be determined dynamically with the getpagesize (2) system call. 
@@ -324,7 +324,7 @@ Definitions: (see /proc/slabinfo) depends on the architecture -- ``sizeof(void *)`` depends on the architecture -- PAGE_SIZE or getpagesize (2) - is the value defined with MAX_ORDER + is the value defined with MAX_PAGE_ORDER it's an upper bound of frame's capture size (more on this later) ============== ================================================================ diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index f8567e95f98b..b2ab8db63c4b 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1362,7 +1362,7 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. This option allows overriding the default setting when ability to allocate very diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 12d611f3da5d..442539fd06fe 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1520,15 +1520,15 @@ config XEN # include/linux/mmzone.h requires the following to be true: # -# MAX_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS +# MAX_PAGE_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS # -# so the maximum value of MAX_ORDER is SECTION_SIZE_BITS - PAGE_SHIFT: +# so the maximum value of MAX_PAGE_ORDER is SECTION_SIZE_BITS - PAGE_SHIFT: # -# | SECTION_SIZE_BITS | PAGE_SHIFT | max MAX_ORDER | default MAX_ORDER | -# ----+-------------------+--------------+-----------------+--------------------+ -# 4K | 27 | 12 | 15 | 10 | -# 16K | 27 | 14 | 13 | 11 | -# 64K | 29 | 16 | 13 | 13 | +# | SECTION_SIZE_BITS | PAGE_SHIFT | max MAX_PAGE_ORDER | default MAX_PAGE_ORDER | +# ----+-------------------+--------------+----------------------+-------------------------+ +# 4K | 27 | 12 | 15 | 10 | +# 16K | 27 | 14 | 13 | 11 | +# 64K | 29 | 16 | 13 | 13 | config ARCH_FORCE_MAX_ORDER int default "13" if ARM64_64K_PAGES @@ -1536,16 +1536,16 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. This option allows overriding the default setting when ability to allocate very large blocks of physically contiguous memory is required. The maximal size of allocation cannot exceed the size of the - section, so the value of MAX_ORDER should satisfy + section, so the value of MAX_PAGE_ORDER should satisfy - MAX_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS + MAX_PAGE_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS Don't change if unsure. diff --git a/arch/arm64/include/asm/sparsemem.h b/arch/arm64/include/asm/sparsemem.h index 5f5437621029..8a8acc220371 100644 --- a/arch/arm64/include/asm/sparsemem.h +++ b/arch/arm64/include/asm/sparsemem.h @@ -10,7 +10,7 @@ /* * Section size must be at least 512MB for 64K base * page size config. Otherwise it will be less than - * MAX_ORDER and the build process will fail. + * MAX_PAGE_ORDER and the build process will fail. 
*/ #ifdef CONFIG_ARM64_64K_PAGES #define SECTION_SIZE_BITS 29 diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index b1e392186a0f..e691290d3765 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -228,7 +228,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, int i; hyp_spin_lock_init(&pool->lock); - pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT)); + pool->max_order = min(MAX_PAGE_ORDER, + get_order(nr_pages << PAGE_SHIFT)); for (i = 0; i <= pool->max_order; i++) INIT_LIST_HEAD(&pool->free_area[i]); pool->range_start = phys; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index f5aae342632c..8116ac599f80 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -51,7 +51,7 @@ void __init arm64_hugetlb_cma_reserve(void) * page allocator. Just warn if there is any change * breaking this assumption. */ - WARN_ON(order <= MAX_ORDER); + WARN_ON(order <= MAX_PAGE_ORDER); hugetlb_cma_reserve(order); } #endif /* CONFIG_CMA */ diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu index ad69b466a08b..9dcf245c9cbf 100644 --- a/arch/m68k/Kconfig.cpu +++ b/arch/m68k/Kconfig.cpu @@ -402,7 +402,7 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. This option allows overriding the default setting when ability to allocate very diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index d54464021a61..58d9565dc2c7 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -50,7 +50,7 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. This option allows overriding the default setting when ability to allocate very diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1f11a62809f2..52d7e3fad553 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -915,7 +915,7 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. 
This option allows overriding the default setting when ability to allocate very diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index d19fb1f3007d..c0e8d597e4cb 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -97,7 +97,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } mmap_read_lock(mm); - chunk = (1UL << (PAGE_SHIFT + MAX_ORDER)) / + chunk = (1UL << (PAGE_SHIFT + MAX_PAGE_ORDER)) / sizeof(struct vm_area_struct *); chunk = min(chunk, entries); for (entry = 0; entry < entries; entry += chunk) { diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f7c683b672c1..0a540b37aab6 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -615,7 +615,7 @@ void __init gigantic_hugetlb_cma_reserve(void) order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; if (order) { - VM_WARN_ON(order <= MAX_ORDER); + VM_WARN_ON(order <= MAX_PAGE_ORDER); hugetlb_cma_reserve(order); } } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 28fac4770073..23f5b5093ec1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1389,7 +1389,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) * DMA window can be larger than available memory, which will * cause errors later. */ - const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER); + const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_PAGE_ORDER); /* * We create the default window as big as we can. The constraint is diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 511c17aede4a..455311d9a5e9 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -26,7 +26,7 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. This option allows overriding the default setting when ability to allocate very diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 49849790e66d..204c43cb3d43 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -277,7 +277,7 @@ config ARCH_FORCE_MAX_ORDER default "12" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block.
This option allows overriding the default setting when ability to allocate very diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index c80b0a21d709..083e5f05a7f0 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -194,7 +194,7 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, size = IO_PAGE_ALIGN(size); order = get_order(size); - if (unlikely(order > MAX_ORDER)) + if (unlikely(order > MAX_PAGE_ORDER)) return NULL; npages = size >> IO_PAGE_SHIFT; diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index 5e2931a18409..6acd8a4c1e2a 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -402,8 +402,8 @@ void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss) unsigned long new_rss_limit; gfp_t gfp_flags; - if (max_tsb_size > PAGE_SIZE << MAX_ORDER) - max_tsb_size = PAGE_SIZE << MAX_ORDER; + if (max_tsb_size > PAGE_SIZE << MAX_PAGE_ORDER) + max_tsb_size = PAGE_SIZE << MAX_PAGE_ORDER; new_cache_index = 0; for (new_size = 8192; new_size < max_tsb_size; new_size <<= 1UL) { diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index b1bfed0c8528..7a9820797eae 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -373,10 +373,10 @@ int __init linux_main(int argc, char **argv) max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; /* - * Zones have to begin on a 1 << MAX_ORDER page boundary, + * Zones have to begin on a 1 << MAX_PAGE_ORDER page boundary, * so this makes sure that's true for highmem */ - max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER)) - 1); + max_physmem &= ~((1 << (PAGE_SHIFT + MAX_PAGE_ORDER)) - 1); if (physmem_size + iomem_size > max_physmem) { highmem = physmem_size + iomem_size - max_physmem; physmem_size -= highmem; diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 7d792077e5fd..e031eaf36c99 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -793,7 +793,7 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. 
This option allows overriding the default setting when ability to allocate very diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index 4a8e43a7a6a4..aaeb2c9c071a 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -451,7 +451,7 @@ static int create_sgt(struct qaic_device *qdev, struct sg_table **sgt_out, u64 s * later */ buf_extra = (PAGE_SIZE - size % PAGE_SIZE) % PAGE_SIZE; - max_order = min(MAX_ORDER - 1, get_order(size)); + max_order = min(MAX_PAGE_ORDER - 1, get_order(size)); } else { /* allocate a single page for book keeping */ nr_pages = 1; diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c index bdd80b73c3e6..fb84cda92a75 100644 --- a/drivers/base/regmap/regmap-debugfs.c +++ b/drivers/base/regmap/regmap-debugfs.c @@ -226,8 +226,8 @@ static ssize_t regmap_read_debugfs(struct regmap *map, unsigned int from, if (*ppos < 0 || !count) return -EINVAL; - if (count > (PAGE_SIZE << MAX_ORDER)) - count = PAGE_SIZE << MAX_ORDER; + if (count > (PAGE_SIZE << MAX_PAGE_ORDER)) + count = PAGE_SIZE << MAX_PAGE_ORDER; buf = kmalloc(count, GFP_KERNEL); if (!buf) @@ -373,8 +373,8 @@ static ssize_t regmap_reg_ranges_read_file(struct file *file, if (*ppos < 0 || !count) return -EINVAL; - if (count > (PAGE_SIZE << MAX_ORDER)) - count = PAGE_SIZE << MAX_ORDER; + if (count > (PAGE_SIZE << MAX_PAGE_ORDER)) + count = PAGE_SIZE << MAX_PAGE_ORDER; buf = kmalloc(count, GFP_KERNEL); if (!buf) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 11114a5d9e5c..d0e41d52d6a9 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3079,7 +3079,7 @@ static void raw_cmd_free(struct floppy_raw_cmd **ptr) } } -#define MAX_LEN (1UL << MAX_ORDER << PAGE_SHIFT) +#define MAX_LEN (1UL << MAX_PAGE_ORDER << PAGE_SHIFT) static int raw_cmd_copyin(int cmd, void __user *param, struct floppy_raw_cmd **rcmd) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index fcaccd0b5a65..e4d3f45242f6 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -906,7 +906,7 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp) /* * The length of the ID shouldn't be assumed by software since * it may change in the future. The allocation size is limited - * to 1 << (PAGE_SHIFT + MAX_ORDER) by the page allocator. + * to 1 << (PAGE_SHIFT + MAX_PAGE_ORDER) by the page allocator. * If the allocation fails, simply return ENOMEM rather than * warning in the kernel log. */ diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c index 3df7a256e919..5c1012d7ffa9 100644 --- a/drivers/crypto/hisilicon/sgl.c +++ b/drivers/crypto/hisilicon/sgl.c @@ -70,11 +70,11 @@ struct hisi_acc_sgl_pool *hisi_acc_create_sgl_pool(struct device *dev, HISI_ACC_SGL_ALIGN_SIZE); /* - * the pool may allocate a block of memory of size PAGE_SIZE * 2^MAX_ORDER, + * the pool may allocate a block of memory of size PAGE_SIZE * 2^MAX_PAGE_ORDER, * block size may exceed 2^31 on ia64, so the max of block size is 2^31 */ - block_size = 1 << (PAGE_SHIFT + MAX_ORDER < 32 ? - PAGE_SHIFT + MAX_ORDER : 31); + block_size = 1 << (PAGE_SHIFT + MAX_PAGE_ORDER < 32 ? 
+ PAGE_SHIFT + MAX_PAGE_ORDER : 31); sgl_num_per_block = block_size / sgl_size; block_num = count / sgl_num_per_block; remain_sgl = count % sgl_num_per_block; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c index 6bc26b4b06b8..ea7561ae6e13 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c @@ -36,7 +36,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj) struct sg_table *st; struct scatterlist *sg; unsigned int npages; /* restricted by sg_alloc_table */ - int max_order = MAX_ORDER; + int max_order = MAX_PAGE_ORDER; unsigned int max_segment; gfp_t gfp; diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c index 6b9f6cf50bf6..84c50c4c4af7 100644 --- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c +++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c @@ -115,7 +115,7 @@ static int get_huge_pages(struct drm_i915_gem_object *obj) do { struct page *page; - GEM_BUG_ON(order > MAX_ORDER); + GEM_BUG_ON(order > MAX_PAGE_ORDER); page = alloc_pages(GFP | __GFP_ZERO, order); if (!page) goto err; diff --git a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c index 2d9cae8cd984..cceaa18d4e46 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c +++ b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c @@ -109,7 +109,7 @@ static const struct ttm_pool_test_case ttm_pool_basic_cases[] = { }, { .description = "Above the allocation limit", - .order = MAX_ORDER + 1, + .order = MAX_PAGE_ORDER + 1, }, { .description = "One page, with coherent DMA mappings enabled", @@ -118,7 +118,7 @@ static const struct ttm_pool_test_case ttm_pool_basic_cases[] = { }, { .description = "Above the allocation limit, with coherent DMA mappings enabled", - .order = MAX_ORDER + 1, + .order = MAX_PAGE_ORDER + 1, .use_dma_alloc = true, }, }; @@ -165,7 +165,7 @@ static void ttm_pool_alloc_basic(struct kunit *test) fst_page = tt->pages[0]; last_page = tt->pages[tt->num_pages - 1]; - if (params->order <= MAX_ORDER) { + if (params->order <= MAX_PAGE_ORDER) { if (params->use_dma_alloc) { KUNIT_ASSERT_NOT_NULL(test, (void *)fst_page->private); KUNIT_ASSERT_NOT_NULL(test, (void *)last_page->private); @@ -182,7 +182,7 @@ static void ttm_pool_alloc_basic(struct kunit *test) * order 0 blocks */ KUNIT_ASSERT_EQ(test, fst_page->private, - min_t(unsigned int, MAX_ORDER, + min_t(unsigned int, MAX_PAGE_ORDER, params->order)); KUNIT_ASSERT_EQ(test, last_page->private, 0); } diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index d183bb97c526..b62f420a9f96 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -447,7 +447,7 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt, else gfp_flags |= GFP_HIGHUSER; - for (order = min_t(unsigned int, MAX_ORDER, __fls(num_pages)); + for (order = min_t(unsigned int, MAX_PAGE_ORDER, __fls(num_pages)); num_pages; order = min_t(unsigned int, order, __fls(num_pages))) { struct ttm_pool_type *pt; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 961205ba86d2..925ac6a47bce 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -188,7 +188,7 @@ #ifdef CONFIG_CMA_ALIGNMENT #define Q_MAX_SZ_SHIFT (PAGE_SHIFT + CONFIG_CMA_ALIGNMENT) #else -#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_ORDER) +#define Q_MAX_SZ_SHIFT 
(PAGE_SHIFT + MAX_PAGE_ORDER) #endif /* diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 85163a83df2f..e59f50e11ea8 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -884,7 +884,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, struct page **pages; unsigned int i = 0, nid = dev_to_node(dev); - order_mask &= GENMASK(MAX_ORDER, 0); + order_mask &= GENMASK(MAX_PAGE_ORDER, 0); if (!order_mask) return NULL; diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 9a7a74239eab..d097001c1e3e 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2465,8 +2465,8 @@ static bool its_parse_indirect_baser(struct its_node *its, * feature is not supported by hardware. */ new_order = max_t(u32, get_order(esz << ids), new_order); - if (new_order > MAX_ORDER) { - new_order = MAX_ORDER; + if (new_order > MAX_PAGE_ORDER) { + new_order = MAX_PAGE_ORDER; ids = ilog2(PAGE_ORDER_TO_SIZE(new_order) / (int)esz); pr_warn("ITS@%pa: %s Table too large, reduce ids %llu->%u\n", &its->phys_base, its_base_type_string[type], diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index f03d7dba270c..13c65b7e1ed6 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1170,7 +1170,7 @@ static void __cache_size_refresh(void) * If the allocation may fail we use __get_free_pages. Memory fragmentation * won't have a fatal effect here, but it just causes flushes of some other * buffers and more I/O will be performed. Don't use __get_free_pages if it - * always fails (i.e. order > MAX_ORDER). + * always fails (i.e. order > MAX_PAGE_ORDER). * * If the allocation shouldn't fail we use __vmalloc. This is only for the * initial reserve allocation, so there's no risk of wasting all vmalloc diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 2ae8560b6a14..855b482cbff1 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1673,7 +1673,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size) unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM; unsigned int remaining_size; - unsigned int order = MAX_ORDER; + unsigned int order = MAX_PAGE_ORDER; retry: if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index f57fb821528d..7916ed9f10e8 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -434,7 +434,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b remaining_size = size; - order = MAX_ORDER; + order = MAX_PAGE_ORDER; while (remaining_size) { struct page *pages; unsigned size_to_add, to_copy; diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c index 55fc5b80e649..4441aca2280a 100644 --- a/drivers/misc/genwqe/card_dev.c +++ b/drivers/misc/genwqe/card_dev.c @@ -443,7 +443,7 @@ static int genwqe_mmap(struct file *filp, struct vm_area_struct *vma) if (vsize == 0) return -EINVAL; - if (get_order(vsize) > MAX_ORDER) + if (get_order(vsize) > MAX_PAGE_ORDER) return -ENOMEM; dma_map = kzalloc(sizeof(struct dma_mapping), GFP_KERNEL); diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c index 1c798d6b2dfb..a2c4a9b4f871 100644 --- a/drivers/misc/genwqe/card_utils.c +++ b/drivers/misc/genwqe/card_utils.c @@ -210,7 +210,7 @@ u32 genwqe_crc32(u8 *buff, size_t len, u32 init) void *__genwqe_alloc_consistent(struct genwqe_dev *cd, size_t size, 
dma_addr_t *dma_handle) { - if (get_order(size) > MAX_ORDER) + if (get_order(size) > MAX_PAGE_ORDER) return NULL; return dma_alloc_coherent(&cd->pci_dev->dev, size, dma_handle, @@ -308,7 +308,7 @@ int genwqe_alloc_sync_sgl(struct genwqe_dev *cd, struct genwqe_sgl *sgl, sgl->write = write; sgl->sgl_size = genwqe_sgl_size(sgl->nr_pages); - if (get_order(sgl->sgl_size) > MAX_ORDER) { + if (get_order(sgl->sgl_size) > MAX_PAGE_ORDER) { dev_err(&pci_dev->dev, "[%s] err: too much memory requested!\n", __func__); return ret; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index b618797a7e8d..f1695c889d3a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1041,7 +1041,7 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) return; order = get_order(alloc_size); - if (order > MAX_ORDER) { + if (order > MAX_PAGE_ORDER) { if (net_ratelimit()) dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n"); return; diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 4e18b4cefa97..94ac36b1408b 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -48,7 +48,7 @@ * of 4096 jumbo frames (MTU=9000) we will need about 9K*4K = 36MB plus * some padding. * - * But the size of a single DMA region is limited by MAX_ORDER in the + * But the size of a single DMA region is limited by MAX_PAGE_ORDER in the * kernel (about 16MB currently). To support say 4K Jumbo frames, we * use a set of LTBs (struct ltb_set) per pool. * @@ -75,7 +75,7 @@ * pool for the 4MB. Thus the 16 Rx and Tx queues require 32 * 5 = 160 * plus 16 for the TSO pools for a total of 176 LTB mappings per VNIC. 
*/ -#define IBMVNIC_ONE_LTB_MAX ((u32)((1 << MAX_ORDER) * PAGE_SIZE)) +#define IBMVNIC_ONE_LTB_MAX ((u32)((1 << MAX_PAGE_ORDER) * PAGE_SIZE)) #define IBMVNIC_ONE_LTB_SIZE min((u32)(8 << 20), IBMVNIC_ONE_LTB_MAX) #define IBMVNIC_LTB_SET_SIZE (38 << 20) diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c index a80939fe2ee6..6a29d2594b91 100644 --- a/drivers/video/fbdev/hyperv_fb.c +++ b/drivers/video/fbdev/hyperv_fb.c @@ -927,8 +927,8 @@ static phys_addr_t hvfb_get_phymem(struct hv_device *hdev, if (request_size == 0) return -1; - if (order <= MAX_ORDER) { - /* Call alloc_pages if the size is less than 2^MAX_ORDER */ + if (order <= MAX_PAGE_ORDER) { + /* Call alloc_pages if the size is less than 2^MAX_PAGE_ORDER */ page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); if (!page) return -1; @@ -958,7 +958,7 @@ static void hvfb_release_phymem(struct hv_device *hdev, { unsigned int order = get_order(size); - if (order <= MAX_ORDER) + if (order <= MAX_PAGE_ORDER) __free_pages(pfn_to_page(paddr >> PAGE_SHIFT), order); else dma_free_coherent(&hdev->device, diff --git a/drivers/video/fbdev/vermilion/vermilion.c b/drivers/video/fbdev/vermilion/vermilion.c index 840ead69654b..a32e5b2924c9 100644 --- a/drivers/video/fbdev/vermilion/vermilion.c +++ b/drivers/video/fbdev/vermilion/vermilion.c @@ -197,7 +197,7 @@ static int vmlfb_alloc_vram(struct vml_info *vinfo, va = &vinfo->vram[i]; order = 0; - while (requested > (PAGE_SIZE << order) && order <= MAX_ORDER) + while (requested > (PAGE_SIZE << order) && order <= MAX_PAGE_ORDER) order++; err = vmlfb_alloc_vram_area(va, order, 0); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 1fe93e93f5bc..59cdc0292dce 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -33,7 +33,7 @@ #define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \ __GFP_NOMEMALLOC) /* The order of free page blocks to report to host */ -#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_ORDER +#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_PAGE_ORDER /* The size of a free page block in bytes */ #define VIRTIO_BALLOON_HINT_BLOCK_BYTES \ (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT)) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index fa5226c198cc..8e3223294442 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1154,13 +1154,13 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn, */ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) { - unsigned long order = MAX_ORDER; + unsigned long order = MAX_PAGE_ORDER; unsigned long i; /* * We might get called for ranges that don't cover properly aligned - * MAX_ORDER pages; however, we can only online properly aligned - * pages with an order of MAX_ORDER at maximum. + * MAX_PAGE_ORDER pages; however, we can only online properly aligned + * pages with an order of MAX_PAGE_ORDER at maximum. */ while (!IS_ALIGNED(pfn | nr_pages, 1 << order)) order--; @@ -1280,7 +1280,7 @@ static void virtio_mem_online_page(struct virtio_mem *vm, bool do_online; /* - * We can get called with any order up to MAX_ORDER. If our subblock + * We can get called with any order up to MAX_PAGE_ORDER. If our subblock * size is smaller than that and we have a mixture of plugged and * unplugged subblocks within such a page, we have to process in * smaller granularity. 
In that case we'll adjust the order exactly once diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index efb1b4c1a0a4..7a6d980e614d 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) /* make various checks */ order = get_order(newsize); - if (unlikely(order > MAX_ORDER)) + if (unlikely(order > MAX_PAGE_ORDER)) return -EFBIG; ret = inode_newsize_ok(inode, newsize); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 236ec7b63c54..c1ee640d87b1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -829,7 +829,7 @@ static inline unsigned huge_page_shift(struct hstate *h) static inline bool hstate_is_gigantic(struct hstate *h) { - return huge_page_order(h) > MAX_ORDER; + return huge_page_order(h) > MAX_PAGE_ORDER; } static inline unsigned int pages_per_huge_page(const struct hstate *h) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1ea7636dfb76..4ed33b127821 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -27,15 +27,15 @@ /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER -#define MAX_ORDER 10 +#define MAX_PAGE_ORDER 10 #else -#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER +#define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif -#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) +#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) -#define NR_PAGE_ORDERS (MAX_ORDER + 1) +#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed @@ -938,7 +938,7 @@ struct zone { struct free_area free_area[NR_PAGE_ORDERS]; #ifdef CONFIG_UNACCEPTED_MEMORY - /* Pages to be accepted. All pages on the list are MAX_ORDER */ + /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */ struct list_head unaccepted_pages; #endif @@ -1748,8 +1748,8 @@ static inline bool movable_only_nodes(nodemask_t *nodes) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) -#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS -#error Allocator MAX_ORDER exceeds SECTION_SIZE +#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS +#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e83c4c095041..3f2409b968ec 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -41,14 +41,14 @@ extern unsigned int pageblock_order; * Huge pages are a constant size, but don't exceed the maximum allocation * granularity. */ -#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER) +#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ #else /* CONFIG_HUGETLB_PAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order MAX_ORDER +#define pageblock_order MAX_PAGE_ORDER #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/slab.h b/include/linux/slab.h index d6d6ffeeb9a2..d63823e518c0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -308,7 +308,7 @@ static inline unsigned int arch_slab_minalign(void) * (PAGE_SIZE*2). Larger requests are passed to the page allocator. 
*/ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_MAX (MAX_PAGE_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 #endif @@ -316,7 +316,7 @@ static inline unsigned int arch_slab_minalign(void) #ifdef CONFIG_SLUB #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_MAX (MAX_PAGE_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index b481c48a31a6..d10613eb0f63 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, void *addr; int ret = -ENOMEM; - /* Cannot allocate larger than MAX_ORDER */ - order = min(get_order(pool_size), MAX_ORDER); + /* Cannot allocate larger than MAX_PAGE_ORDER */ + order = min(get_order(pool_size), MAX_PAGE_ORDER); do { pool_size = 1 << (PAGE_SHIFT + order); @@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void) /* * If coherent_pool was not used on the command line, default the pool - * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER. + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_PAGE_ORDER. */ if (!atomic_pool_size) { unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 33d942615be5..176078bf2215 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -686,8 +686,8 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev, size_t pool_size; size_t tlb_size; - if (nslabs > SLABS_PER_PAGE << MAX_ORDER) { - nslabs = SLABS_PER_PAGE << MAX_ORDER; + if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) { + nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER; nareas = limit_nareas(nareas, nslabs); } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index e8d82c2f07d0..60ed43d1c29e 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -610,8 +610,8 @@ static struct page *rb_alloc_aux_page(int node, int order) { struct page *page; - if (order > MAX_ORDER) - order = MAX_ORDER; + if (order > MAX_PAGE_ORDER) + order = MAX_PAGE_ORDER; do { page = alloc_pages_node(node, PERF_AUX_GFP, order); @@ -702,9 +702,9 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, /* * kcalloc_node() is unable to allocate buffer if the size is larger - * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case. + * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case. */ - if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER) + if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER) return -ENOMEM; rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, node); @@ -821,7 +821,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER) + if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER) goto fail; node = (cpu == -1) ? cpu : cpu_to_node(cpu); diff --git a/mm/Kconfig b/mm/Kconfig index 79d563d8f9e0..cb9d470f0bf7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -381,7 +381,7 @@ config SHUFFLE_PAGE_ALLOCATOR the presence of a memory-side-cache. 
There are also incidental security benefits as it reduces the predictability of page allocations to compliment SLAB_FREELIST_RANDOM, but the - default granularity of shuffling on the MAX_ORDER i.e, 10th + default granularity of shuffling on the MAX_PAGE_ORDER i.e, 10th order of pages is selected based on cache utilization benefits on x86. @@ -713,8 +713,8 @@ config HUGETLB_PAGE_SIZE_VARIABLE HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available on a platform. - Note that the pageblock_order cannot exceed MAX_ORDER and will be - clamped down to MAX_ORDER. + Note that the pageblock_order cannot exceed MAX_PAGE_ORDER and will be + clamped down to MAX_PAGE_ORDER. config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA diff --git a/mm/compaction.c b/mm/compaction.c index 24f8eb4d6260..27ada42924d5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -999,7 +999,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * a valid page order. Consider only values in the * valid order range to prevent low_pfn overflow. */ - if (freepage_order > 0 && freepage_order <= MAX_ORDER) { + if (freepage_order > 0 && freepage_order <= MAX_PAGE_ORDER) { low_pfn += (1UL << freepage_order) - 1; nr_scanned += (1UL << freepage_order) - 1; } @@ -1017,7 +1017,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (PageCompound(page) && !cc->alloc_contig) { const unsigned int order = compound_order(page); - if (likely(order <= MAX_ORDER)) { + if (likely(order <= MAX_PAGE_ORDER)) { low_pfn += (1UL << order) - 1; nr_scanned += (1UL << order) - 1; } diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c index f9d145730fd1..6755f0c9d4a3 100644 --- a/mm/debug_page_alloc.c +++ b/mm/debug_page_alloc.c @@ -22,7 +22,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) { unsigned long res; - if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { + if (kstrtoul(buf, 10, &res) < 0 || res > MAX_PAGE_ORDER / 2) { pr_err("Bad debug_guardpage_minorder value\n"); return 0; } diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index e651500e597a..5662e29fe253 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1091,7 +1091,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) struct page *page = NULL; #ifdef CONFIG_CONTIG_ALLOC - if (order > MAX_ORDER) { + if (order > MAX_PAGE_ORDER) { page = alloc_contig_pages((1 << order), GFP_KERNEL, first_online_node, NULL); if (page) { @@ -1101,7 +1101,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) } #endif - if (order <= MAX_ORDER) + if (order <= MAX_PAGE_ORDER) page = alloc_pages(GFP_KERNEL, order); return page; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1a588e29d287..b9a7a57691d7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -682,7 +682,7 @@ static int __init hugepage_init(void) /* * hugepages can't be allocated by the buddy allocator */ - MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER); + MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER); /* * we use page->mapping and page->index in second tail page * as list_head: assuming THP order >= 2 diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 378e460a6ab4..0d262784ce60 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3410,7 +3410,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, /* * Put bootmem huge pages into the standard lists after mem_map is up. - * Note: This only applies to gigantic (order > MAX_ORDER) pages. 
+ * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. */ static void __init gather_bootmem_prealloc(void) { @@ -4790,7 +4790,7 @@ static int __init default_hugepagesz_setup(char *s) * The number of default huge pages (for this size) could have been * specified as the first hugetlb parameter: hugepages=X. If so, * then default_hstate_max_huge_pages is set. If the default huge - * page size is gigantic (> MAX_ORDER), then the pages must be + * page size is gigantic (> MAX_PAGE_ORDER), then the pages must be * allocated here from bootmem allocator. */ if (default_hstate_max_huge_pages) { diff --git a/mm/internal.h b/mm/internal.h index ac40c3d00336..f309a010d50f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -335,7 +335,7 @@ static inline bool page_is_buddy(struct page *page, struct page *buddy, * satisfies the following equation: * P = B & ~(1 << O) * - * Assumption: *_mem_map is contiguous at least up to MAX_ORDER + * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER */ static inline unsigned long __find_buddy_pfn(unsigned long page_pfn, unsigned int order) diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c index 103e2e88ea03..3ac3b8921d36 100644 --- a/mm/kmsan/init.c +++ b/mm/kmsan/init.c @@ -141,7 +141,7 @@ struct smallstack { static struct smallstack collect = { .index = 0, - .order = MAX_ORDER, + .order = MAX_PAGE_ORDER, }; static void smallstack_push(struct smallstack *stack, struct page *pages) @@ -211,8 +211,8 @@ static void kmsan_memblock_discard(void) * order=N-1, * - repeat. */ - collect.order = MAX_ORDER; - for (int i = MAX_ORDER; i >= 0; i--) { + collect.order = MAX_PAGE_ORDER; + for (int i = MAX_PAGE_ORDER; i >= 0; i--) { if (held_back[i].shadow) smallstack_push(&collect, held_back[i].shadow); if (held_back[i].origin) diff --git a/mm/memblock.c b/mm/memblock.c index 4a62f7774b65..8c194d8afeec 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2113,12 +2113,13 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) * Free the pages in the largest chunks alignment allows. * * __ffs() behaviour is undefined for 0. start == 0 is - * MAX_ORDER-aligned, set order to MAX_ORDER for the case. + * MAX_PAGE_ORDER-aligned, set order to MAX_PAGE_ORDER for + * the case. */ if (start) - order = min_t(int, MAX_ORDER, __ffs(start)); + order = min_t(int, MAX_PAGE_ORDER, __ffs(start)); else - order = MAX_ORDER; + order = MAX_PAGE_ORDER; while (start + (1UL << order) > end) order--; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 926e1cfb10e9..b3c0ff52bb72 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -645,7 +645,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) unsigned long pfn; /* - * Online the pages in MAX_ORDER aligned chunks. The callback might + * Online the pages in MAX_PAGE_ORDER aligned chunks. The callback might * decide to not expose all pages to the buddy (e.g., expose them * later). We account all pages as being online and belonging to this * zone ("present"). @@ -660,12 +660,13 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * Free to online pages in the largest chunks alignment allows. * * __ffs() behaviour is undefined for 0. start == 0 is - * MAX_ORDER-aligned, Set order to MAX_ORDER for the case. + * MAX_PAGE_ORDER-aligned, Set order to MAX_PAGE_ORDER for + * the case. 
*/ if (pfn) - order = min_t(int, MAX_ORDER, __ffs(pfn)); + order = min_t(int, MAX_PAGE_ORDER, __ffs(pfn)); else - order = MAX_ORDER; + order = MAX_PAGE_ORDER; (*online_page_callback)(pfn_to_page(pfn), order); pfn += (1UL << order); diff --git a/mm/mm_init.c b/mm/mm_init.c index 2830eef2b16c..89dc29f1e6c6 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1455,7 +1455,7 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order = MAX_ORDER; + unsigned int order = MAX_PAGE_ORDER; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) @@ -1638,7 +1638,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); offset = pgdat->node_start_pfn - start; /* - * The zone's endpoints aren't required to be MAX_ORDER + * The zone's endpoints aren't required to be MAX_PAGE_ORDER * aligned but the node_mem_map endpoints must be in order * for the buddy allocator to function correctly. */ @@ -1964,11 +1964,11 @@ static void __init deferred_free_range(unsigned long pfn, if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) { for (i = 0; i < nr_pages; i += pageblock_nr_pages) set_pageblock_migratetype(page + i, MIGRATE_MOVABLE); - __free_pages_core(page, MAX_ORDER); + __free_pages_core(page, MAX_PAGE_ORDER); return; } - /* Accept chunks smaller than MAX_ORDER upfront */ + /* Accept chunks smaller than MAX_PAGE_ORDER upfront */ accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); for (i = 0; i < nr_pages; i++, page++, pfn++) { @@ -1991,8 +1991,8 @@ static inline void __init pgdat_init_report_one_done(void) /* * Returns true if page needs to be initialized or freed to buddy allocator. * - * We check if a current MAX_ORDER block is valid by only checking the validity - * of the head pfn. + * We check if a current MAX_PAGE_ORDER block is valid by only checking the + * validity of the head pfn. */ static inline bool __init deferred_pfn_valid(unsigned long pfn) { @@ -2149,8 +2149,8 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); /* - * Initialize and free pages in MAX_ORDER sized increments so that we - * can avoid introducing any issues with the buddy allocator. + * Initialize and free pages in MAX_PAGE_ORDER sized increments so that + * we can avoid introducing any issues with the buddy allocator. */ while (spfn < end_pfn) { deferred_init_maxorder(&i, zone, &spfn, &epfn); @@ -2291,7 +2291,7 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order) } /* - * Initialize and free pages in MAX_ORDER sized increments so + * Initialize and free pages in MAX_PAGE_ORDER sized increments so * that we can avoid introducing any issues with the buddy * allocator. */ @@ -2509,7 +2509,7 @@ void *__init alloc_large_system_hash(const char *tablename, else table = memblock_alloc_raw(size, SMP_CACHE_BYTES); - } else if (get_order(size) > MAX_ORDER || hashdist) { + } else if (get_order(size) > MAX_PAGE_ORDER || hashdist) { table = vmalloc_huge(size, gfp_flags); virt = true; if (table) @@ -2756,7 +2756,7 @@ void __init mm_core_init(void) /* * page_ext requires contiguous pages, - * bigger than MAX_ORDER unless SPARSEMEM. + * bigger than MAX_PAGE_ORDER unless SPARSEMEM. 
*/ page_ext_init_flatmem(); mem_debugging_and_hardening_init(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ccecf6158ae4..a01baf0454f8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -727,7 +727,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, unsigned long higher_page_pfn; struct page *higher_page; - if (order >= MAX_ORDER - 1) + if (order >= MAX_PAGE_ORDER - 1) return false; higher_page_pfn = buddy_pfn & pfn; @@ -782,7 +782,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); - while (order < MAX_ORDER) { + while (order < MAX_PAGE_ORDER) { if (compaction_capture(capc, page, order, migratetype)) { __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -1297,7 +1297,7 @@ void __free_pages_core(struct page *page, unsigned int order) atomic_long_add(nr_pages, &page_zone(page)->managed_pages); if (page_contains_unaccepted(page, order)) { - if (order == MAX_ORDER && __free_unaccepted(page)) + if (order == MAX_PAGE_ORDER && __free_unaccepted(page)) return; accept_page(page, order); @@ -1327,7 +1327,7 @@ void __free_pages_core(struct page *page, unsigned int order) * * Note: the function may return non-NULL struct page even for a page block * which contains a memory hole (i.e. there is no physical memory for a subset - * of the pfn range). For example, if the pageblock order is MAX_ORDER, which + * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, which * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole * even though the start pfn is online and valid. This should be safe most of * the time because struct pages are still initialized via init_unavailable_range() @@ -2018,7 +2018,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, * approximates finding the pageblock with the most free pages, which * would be too costly to do exactly. */ - for (current_order = MAX_ORDER; current_order >= min_order; + for (current_order = MAX_PAGE_ORDER; current_order >= min_order; --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2056,7 +2056,7 @@ find_smallest: * This should not happen - we already found a suitable fallback * when looking for the largest page. */ - VM_BUG_ON(current_order > MAX_ORDER); + VM_BUG_ON(current_order > MAX_PAGE_ORDER); do_steal: page = get_page_from_free_area(area, fallback_mt); @@ -4533,7 +4533,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, * There are several places where we assume that the order value is sane * so bail out early if the request is out of bound. */ - if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)) + if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp)) return NULL; gfp &= gfp_allowed_mask; @@ -4815,7 +4815,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, * minimum number of pages to satisfy the request. alloc_pages() can only * allocate memory in power-of-two pages. * - * This function is also limited by MAX_ORDER. + * This function is also limited by MAX_PAGE_ORDER. * * Memory allocated by this function must be released by free_pages_exact(). 
* @@ -6373,7 +6373,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, order = 0; outer_start = start; while (!PageBuddy(pfn_to_page(outer_start))) { - if (++order > MAX_ORDER) { + if (++order > MAX_PAGE_ORDER) { outer_start = start; break; } @@ -6635,7 +6635,7 @@ bool is_free_buddy_page(struct page *page) break; } - return order <= MAX_ORDER; + return order <= MAX_PAGE_ORDER; } EXPORT_SYMBOL(is_free_buddy_page); @@ -6807,9 +6807,9 @@ static bool try_to_accept_memory_one(struct zone *zone) __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); spin_unlock_irqrestore(&zone->lock, flags); - accept_page(page, MAX_ORDER); + accept_page(page, MAX_PAGE_ORDER); - __free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL); + __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); if (last) static_branch_dec(&zones_with_unaccepted_pages); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index bcf99ba747a0..cd0ea3668253 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -226,7 +226,7 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) */ if (PageBuddy(page)) { order = buddy_order(page); - if (order >= pageblock_order && order < MAX_ORDER) { + if (order >= pageblock_order && order < MAX_PAGE_ORDER) { buddy = find_buddy_page_pfn(page, page_to_pfn(page), order, NULL); if (buddy && !is_migrate_isolate_page(buddy)) { @@ -290,11 +290,12 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * isolate_single_pageblock() * @migratetype: migrate type to set in error recovery. * - * Free and in-use pages can be as big as MAX_ORDER and contain more than one + * Free and in-use pages can be as big as MAX_PAGE_ORDER and contain more than one * pageblock. When not all pageblocks within a page are isolated at the same * time, free page accounting can go wrong. For example, in the case of - * MAX_ORDER = pageblock_order + 1, a MAX_ORDER page has two pagelbocks. - * [ MAX_ORDER ] + * MAX_PAGE_ORDER = pageblock_order + 1, a MAX_PAGE_ORDER page has two + * pagelbocks. + * [ MAX_PAGE_ORDER ] * [ pageblock0 | pageblock1 ] * When either pageblock is isolated, if it is a free page, the page is not * split into separate migratetype lists, which is supposed to; if it is an @@ -451,7 +452,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, * the free page to the right migratetype list. * * head_pfn is not used here as a hugetlb page order - * can be bigger than MAX_ORDER, but after it is + * can be bigger than MAX_PAGE_ORDER, but after it is * freed, the free page order is not. Use pfn within * the range to find the head of the free page. */ @@ -459,7 +460,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, outer_pfn = pfn; while (!PageBuddy(pfn_to_page(outer_pfn))) { /* stop if we cannot find the free page */ - if (++order > MAX_ORDER) + if (++order > MAX_PAGE_ORDER) goto failed; outer_pfn &= ~0UL << order; } @@ -660,8 +661,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, int ret; /* - * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages - * are not aligned to pageblock_nr_pages. + * Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free + * pages are not aligned to pageblock_nr_pages. * Then we just check migratetype first. 
*/ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { diff --git a/mm/page_owner.c b/mm/page_owner.c index 040dbf26a986..5634e5d890f8 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -320,7 +320,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, unsigned long freepage_order; freepage_order = buddy_order_unsafe(page); - if (freepage_order <= MAX_ORDER) + if (freepage_order <= MAX_PAGE_ORDER) pfn += (1UL << freepage_order) - 1; continue; } @@ -555,7 +555,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (PageBuddy(page)) { unsigned long freepage_order = buddy_order_unsafe(page); - if (freepage_order <= MAX_ORDER) + if (freepage_order <= MAX_PAGE_ORDER) pfn += (1UL << freepage_order) - 1; continue; } @@ -663,7 +663,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (PageBuddy(page)) { unsigned long order = buddy_order_unsafe(page); - if (order > 0 && order <= MAX_ORDER) + if (order > 0 && order <= MAX_PAGE_ORDER) pfn += (1UL << order) - 1; continue; } diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 66369cc5279b..e4c428e61d8c 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -20,7 +20,7 @@ static int page_order_update_notify(const char *val, const struct kernel_param * * If param is set beyond this limit, order is set to default * pageblock_order value */ - return param_set_uint_minmax(val, kp, 0, MAX_ORDER); + return param_set_uint_minmax(val, kp, 0, MAX_PAGE_ORDER); } static const struct kernel_param_ops page_reporting_param_ops = { @@ -370,7 +370,7 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) */ if (page_reporting_order == -1) { - if (prdev->order > 0 && prdev->order <= MAX_ORDER) + if (prdev->order > 0 && prdev->order <= MAX_PAGE_ORDER) page_reporting_order = prdev->order; else page_reporting_order = pageblock_order; diff --git a/mm/shuffle.h b/mm/shuffle.h index a6bdf54f96f1..61bbcddeeee6 100644 --- a/mm/shuffle.h +++ b/mm/shuffle.h @@ -4,7 +4,7 @@ #define _MM_SHUFFLE_H #include -#define SHUFFLE_ORDER MAX_ORDER +#define SHUFFLE_ORDER MAX_PAGE_ORDER #ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); diff --git a/mm/slab.c b/mm/slab.c index 773c79e153f3..073cae923d56 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -465,7 +465,7 @@ static int __init slab_max_order_setup(char *str) { get_option(&str, &slab_max_order); slab_max_order = slab_max_order < 0 ? 0 : - min(slab_max_order, MAX_ORDER); + min(slab_max_order, MAX_PAGE_ORDER); slab_max_order_set = true; return 1; diff --git a/mm/slub.c b/mm/slub.c index a5420be89c8c..ba162e661e2e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4194,7 +4194,7 @@ static inline int calculate_order(unsigned int size) * Doh this slab cannot be placed using slub_max_order. 
*/ order = get_order(size); - if (order <= MAX_ORDER) + if (order <= MAX_PAGE_ORDER) return order; return -ENOSYS; } @@ -4722,7 +4722,7 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { get_option(&str, (int *)&slub_max_order); - slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER); + slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER); if (slub_min_order > slub_max_order) slub_min_order = slub_max_order; diff --git a/mm/vmscan.c b/mm/vmscan.c index 600ed3cbf7cb..68f0abbb8e59 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6415,7 +6415,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. */ - BUILD_BUG_ON(MAX_ORDER >= S8_MAX); + BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX); BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); diff --git a/mm/vmstat.c b/mm/vmstat.c index 03ead31c46a0..db79935e4a54 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1092,7 +1092,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in { unsigned long requested = 1UL << order; - if (WARN_ON_ONCE(order > MAX_ORDER)) + if (WARN_ON_ONCE(order > MAX_PAGE_ORDER)) return 0; if (!info->free_blocks_total) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 89981dbe46c9..97704a9e84c7 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -844,7 +844,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) goto out; /* the calculated number of cq entries fits to mlx5 cq allocation */ cqe_size_order = cache_line_size() == 128 ? 7 : 6; - smc_order = MAX_ORDER - cqe_size_order; + smc_order = MAX_PAGE_ORDER - cqe_size_order; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 51ad29940f05..f3738b2c8bcd 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -38,7 +38,7 @@ static int param_set_bufsize(const char *val, const struct kernel_param *kp) size = memparse(val, NULL); order = get_order(size); - if (order > MAX_ORDER) + if (order > MAX_PAGE_ORDER) return -EINVAL; ima_maxorder = order; ima_bufsize = PAGE_SIZE << order; diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt index 4c90cc176f81..2109690b0d5f 100644 --- a/tools/perf/Documentation/perf-intel-pt.txt +++ b/tools/perf/Documentation/perf-intel-pt.txt @@ -683,7 +683,7 @@ Buffer handling ~~~~~~~~~~~~~~~ There may be buffer limitations (i.e. single ToPa entry) which means that actual -buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER). In order to +buffer sizes are limited to powers of 2 up to 4MiB (MAX_PAGE_ORDER). In order to provide other sizes, and in particular an arbitrarily large size, multiple buffers are logically concatenated. However an interrupt must be used to switch between buffers. 
That has two potential problems: diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h index 134f8eab0768..71546e15bdd3 100644 --- a/tools/testing/memblock/linux/mmzone.h +++ b/tools/testing/memblock/linux/mmzone.h @@ -17,10 +17,10 @@ enum zone_type { }; #define MAX_NR_ZONES __MAX_NR_ZONES -#define MAX_ORDER 10 -#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) +#define MAX_PAGE_ORDER 10 +#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) -#define pageblock_order MAX_ORDER +#define pageblock_order MAX_PAGE_ORDER #define pageblock_nr_pages BIT(pageblock_order) #define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index 16ed4dfa7359..622987f12c89 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -3,7 +3,8 @@ Before running this huge pages for each huge page size must have been reserved. - For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used. + For large pages beyond MAX_PAGE_ORDER (like 1GB on x86) boot options must + be used. Also shmmax must be increased. And you need to run as root to work around some weird permissions in shm. And nothing using huge pages should run in parallel. -- cgit From 3dc2f209208dddc73ffd1d817081711343de3242 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Tue, 9 Jan 2024 10:45:47 +0800 Subject: swiotlb: check alloc_size before the allocation of a new memory pool The allocation request for swiotlb contiguous memory greater than 128*2KB cannot be fulfilled because it exceeds the maximum contiguous memory limit. If the swiotlb memory we allocate is larger than 128*2KB, swiotlb_find_slots() will still schedule the allocation of a new memory pool, which will increase memory overhead. Fix it by adding a check with alloc_size no more than 128*2KB before scheduling the allocation of a new memory pool in swiotlb_find_slots(). Signed-off-by: ZhangPeng Reviewed-by: Petr Tesarik Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index e20b856255ef..96f832d828fd 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1136,6 +1136,9 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, int cpu, i; int index; + if (alloc_size > IO_TLB_SEGSIZE * IO_TLB_SIZE) + return -1; + cpu = raw_smp_processor_id(); for (i = 0; i < default_nareas; ++i) { index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size, -- cgit From 25742aeb135c4a44e92fb347e037adaa145b9484 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 20 Dec 2023 08:10:28 -0500 Subject: ring-buffer: Remove stale comment from ring_buffer_size() It's been 11 years since the ring_buffer_size() function was updated to use the nr_pages from the buffer->buffers[cpu] structure instead of using the buffer->nr_pages that no longer exists. The comment in the code is more of what a change log should have and is pretty much useless for development. It's saying how things worked back in 2012 that bares no purpose on today's code. Remove it. 
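For reference, what survives the deletion is just the per-CPU lookup; the
function then reads roughly as follows (reader's sketch reconstructed from
the comment being removed, not quoted from the tree):

    unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
    {
    	if (!cpumask_test_cpu(cpu, buffer->cpumask))
    		return 0;

    	return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages;
    }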
Link: https://lore.kernel.org/linux-trace-kernel/84d3b41a72bd43dbb9d44921ef535c92@AcuMS.aculab.com/ Link: https://lore.kernel.org/linux-trace-kernel/20231220081028.7cd7e8e2@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reported-by: David Laight Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) --- kernel/trace/ring_buffer.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 173d2595ce2d..7887d61d5b56 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5122,12 +5122,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_advance); */ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) { - /* - * Earlier, this method returned - * buffer->subbuf_size * buffer->nr_pages - * Since the nr_pages field is now removed, we have converted this to - * return the per cpu buffer value. - */ if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; -- cgit From 4a693ce65b186fddc1a73621bd6f941e6e3eca21 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 29 Dec 2023 16:02:13 +0800 Subject: kdump: defer the insertion of crashkernel resources In /proc/iomem, sub-regions should be inserted after their parent, otherwise the insertion of parent resource fails. But after generic crashkernel reservation applied, in both RISC-V and ARM64 (LoongArch will also use generic reservation later on), crashkernel resources are inserted before their parent, which causes the parent disappear in /proc/iomem. So we defer the insertion of crashkernel resources to an early_initcall(). 1, Without 'crashkernel' parameter: 100d0100-100d01ff : LOON0001:00 100d0100-100d01ff : LOON0001:00 LOON0001:00 100e0000-100e0bff : LOON0002:00 100e0000-100e0bff : LOON0002:00 LOON0002:00 1fe001e0-1fe001e7 : serial 90400000-fa17ffff : System RAM f6220000-f622ffff : Reserved f9ee0000-f9ee3fff : Reserved fa120000-fa17ffff : Reserved fa190000-fe0bffff : System RAM fa190000-fa1bffff : Reserved fe4e0000-47fffffff : System RAM 43c000000-441ffffff : Reserved 47ff98000-47ffa3fff : Reserved 47ffa4000-47ffa7fff : Reserved 47ffa8000-47ffabfff : Reserved 47ffac000-47ffaffff : Reserved 47ffb0000-47ffb3fff : Reserved 2, With 'crashkernel' parameter, before this patch: 100d0100-100d01ff : LOON0001:00 100d0100-100d01ff : LOON0001:00 LOON0001:00 100e0000-100e0bff : LOON0002:00 100e0000-100e0bff : LOON0002:00 LOON0002:00 1fe001e0-1fe001e7 : serial e6200000-f61fffff : Crash kernel fa190000-fe0bffff : System RAM fa190000-fa1bffff : Reserved fe4e0000-47fffffff : System RAM 43c000000-441ffffff : Reserved 47ff98000-47ffa3fff : Reserved 47ffa4000-47ffa7fff : Reserved 47ffa8000-47ffabfff : Reserved 47ffac000-47ffaffff : Reserved 47ffb0000-47ffb3fff : Reserved 3, With 'crashkernel' parameter, after this patch: 100d0100-100d01ff : LOON0001:00 100d0100-100d01ff : LOON0001:00 LOON0001:00 100e0000-100e0bff : LOON0002:00 100e0000-100e0bff : LOON0002:00 LOON0002:00 1fe001e0-1fe001e7 : serial 90400000-fa17ffff : System RAM e6200000-f61fffff : Crash kernel f6220000-f622ffff : Reserved f9ee0000-f9ee3fff : Reserved fa120000-fa17ffff : Reserved fa190000-fe0bffff : System RAM fa190000-fa1bffff : Reserved fe4e0000-47fffffff : System RAM 43c000000-441ffffff : Reserved 47ff98000-47ffa3fff : Reserved 47ffa4000-47ffa7fff : Reserved 47ffa8000-47ffabfff : Reserved 47ffac000-47ffaffff : Reserved 47ffb0000-47ffb3fff : Reserved Link: https://lkml.kernel.org/r/20231229080213.2622204-1-chenhuacai@loongson.cn Signed-off-by: Huacai Chen Fixes: 
0ab97169aa05 ("crash_core: add generic function to do reservation") Cc: Baoquan He Cc: Zhen Lei Cc: [6.6+] Signed-off-by: Andrew Morton --- kernel/crash_core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index d48315667752..4e2cac71b84f 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -376,7 +376,6 @@ static int __init reserve_crashkernel_low(unsigned long long low_size) crashk_low_res.start = low_base; crashk_low_res.end = low_base + low_size - 1; - insert_resource(&iomem_resource, &crashk_low_res); #endif return 0; } @@ -458,8 +457,19 @@ retry: crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; - insert_resource(&iomem_resource, &crashk_res); } + +static __init int insert_crashkernel_resources(void) +{ + if (crashk_res.start < crashk_res.end) + insert_resource(&iomem_resource, &crashk_res); + + if (crashk_low_res.start < crashk_low_res.end) + insert_resource(&iomem_resource, &crashk_low_res); + + return 0; +} +early_initcall(insert_crashkernel_resources); #endif int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, -- cgit From 7bb943806ff61e83ae4cceef8906b7fe52453e8a Mon Sep 17 00:00:00 2001 From: James Gowans Date: Wed, 13 Dec 2023 08:40:04 +0200 Subject: kexec: do syscore_shutdown() in kernel_kexec syscore_shutdown() runs driver and module callbacks to get the system into a state where it can be correctly shut down. In commit 6f389a8f1dd2 ("PM / reboot: call syscore_shutdown() after disable_nonboot_cpus()") syscore_shutdown() was removed from kernel_restart_prepare() and hence got (incorrectly?) removed from the kexec flow. This was innocuous until commit 6735150b6997 ("KVM: Use syscore_ops instead of reboot_notifier to hook restart/shutdown") changed the way that KVM registered its shutdown callbacks, switching from reboot notifiers to syscore_ops.shutdown. As syscore_shutdown() is missing from kexec, KVM's shutdown hook is not run and virtualisation is left enabled on the boot CPU which results in triple faults when switching to the new kernel on Intel x86 VT-x with VMXE enabled. Fix this by adding syscore_shutdown() to the kexec sequence. In terms of where to add it, it is being added after migrating the kexec task to the boot CPU, but before APs are shut down. It is not totally clear if this is the best place: in commit 6f389a8f1dd2 ("PM / reboot: call syscore_shutdown() after disable_nonboot_cpus()") it is stated that "syscore_ops operations should be carried with one CPU on-line and interrupts disabled." APs are only offlined later in machine_shutdown(), so this syscore_shutdown() is being run while APs are still online. This seems to be the correct place as it matches where syscore_shutdown() is run in the reboot and halt flows - they also run it before APs are shut down. The assumption is that the commit message in commit 6f389a8f1dd2 ("PM / reboot: call syscore_shutdown() after disable_nonboot_cpus()") is no longer valid. KVM has been discussed here as it is what broke loudly by not having syscore_shutdown() in kexec, but this change impacts more than just KVM; all drivers/modules which register a syscore_ops.shutdown callback will now be invoked in the kexec flow. Looking at some of them like x86 MCE it is probably more correct to also shut these down during kexec. Maintainers of all drivers which use syscore_ops.shutdown are added on CC for visibility. 
They are: arch/powerpc/platforms/cell/spu_base.c .shutdown = spu_shutdown, arch/x86/kernel/cpu/mce/core.c .shutdown = mce_syscore_shutdown, arch/x86/kernel/i8259.c .shutdown = i8259A_shutdown, drivers/irqchip/irq-i8259.c .shutdown = i8259A_shutdown, drivers/irqchip/irq-sun6i-r.c .shutdown = sun6i_r_intc_shutdown, drivers/leds/trigger/ledtrig-cpu.c .shutdown = ledtrig_cpu_syscore_shutdown, drivers/power/reset/sc27xx-poweroff.c .shutdown = sc27xx_poweroff_shutdown, kernel/irq/generic-chip.c .shutdown = irq_gc_shutdown, virt/kvm/kvm_main.c .shutdown = kvm_shutdown, This has been tested by doing a kexec on x86_64 and aarch64. Link: https://lkml.kernel.org/r/20231213064004.2419447-1-jgowans@amazon.com Fixes: 6735150b6997 ("KVM: Use syscore_ops instead of reboot_notifier to hook restart/shutdown") Signed-off-by: James Gowans Cc: Baoquan He Cc: Eric Biederman Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Marc Zyngier Cc: Arnd Bergmann Cc: Tony Luck Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Chen-Yu Tsai Cc: Jernej Skrabec Cc: Samuel Holland Cc: Pavel Machek Cc: Sebastian Reichel Cc: Orson Zhai Cc: Alexander Graf Cc: Jan H. Schoenherr Cc: Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index a08031b57a61..d08fc7b5db97 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1257,6 +1257,7 @@ int kernel_kexec(void) kexec_in_progress = true; kernel_restart_prepare("kexec reboot"); migrate_to_reboot_cpu(); + syscore_shutdown(); /* * migrate_to_reboot_cpu() disables CPU hotplug assuming that -- cgit From 4e87ff59cebb53d3ce1333245e64ab7d51ebf118 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 9 Jan 2024 21:35:21 -0800 Subject: kernel/crash_core.c: make __crash_hotplug_lock static sparse warnings: kernel/crash_core.c:749:1: sparse: sparse: symbol '__crash_hotplug_lock' was not declared. Should it be static? Fixes: e2a8f20dd8e9 ("Crash: add lock to serialize crash hotplug handling") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401080654.IjjU5oK7-lkp@intel.com/ Cc: Baoquan He Signed-off-by: Andrew Morton --- kernel/crash_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 4e2cac71b84f..75cd6a736d03 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -877,7 +877,7 @@ subsys_initcall(crash_notes_memory_init); * regions are online. So mutex lock __crash_hotplug_lock is used to * serialize the crash hotplug handling specifically. 
*/ -DEFINE_MUTEX(__crash_hotplug_lock); +static DEFINE_MUTEX(__crash_hotplug_lock); #define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock) #define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock) -- cgit From 7c65aa3cc072cee76f577262fbe381a111a98774 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 13 Jan 2024 20:46:42 -0800 Subject: dma-debug: fix kernel-doc warnings Update the kernel-doc comments to catch up with the code changes and fix the kernel-doc warnings: debug.c:83: warning: Excess struct member 'stacktrace' description in 'dma_debug_entry' debug.c:83: warning: Function parameter or struct member 'stack_len' not described in 'dma_debug_entry' debug.c:83: warning: Function parameter or struct member 'stack_entries' not described in 'dma_debug_entry' Fixes: 746017ed8d4d ("dma/debug: Simplify stracktrace retrieval") Signed-off-by: Randy Dunlap Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: iommu@lists.linux.dev Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 3de494375b7b..08b7f84b76f9 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -62,7 +62,8 @@ enum map_err_types { * @pfn: page frame of the start address * @offset: offset of mapping relative to pfn * @map_err_type: track whether dma_mapping_error() was checked - * @stacktrace: support backtraces when a violation is detected + * @stack_len: number of backtrace entries in @stack_entries + * @stack_entries: stack of backtrace history */ struct dma_debug_entry { struct list_head list; -- cgit From e37617c8e53a1f7fcba6d5e1041f4fd8a2425c27 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Sun, 14 Jan 2024 19:36:00 +0100 Subject: sched/fair: Fix frequency selection for non-invariant case Linus reported a ~50% performance regression on single-threaded workloads on his AMD Ryzen system, and bisected it to: 9c0b4bb7f630 ("sched/cpufreq: Rework schedutil governor performance estimation") When frequency invariance is not enabled, get_capacity_ref_freq(policy) is supposed to return the current frequency and the performance margin applied by map_util_perf(), enabling the utilization to go above the maximum compute capacity and to select a higher frequency than the current one. After the changes in 9c0b4bb7f630, the performance margin was applied earlier in the path to take into account utilization clampings and we couldn't get a utilization higher than the maximum compute capacity, and the CPU remained 'stuck' at lower frequencies. To fix this, we must use a frequency above the current frequency to get a chance to select a higher OPP when the current one becomes fully used. Apply the same margin and return a frequency 25% higher than the current one in order to switch to the next OPP before we fully use the CPU at the current one. [ mingo: Clarified the changelog. 
] Fixes: 9c0b4bb7f630 ("sched/cpufreq: Rework schedutil governor performance estimation") Reported-by: Linus Torvalds Bisected-by: Linus Torvalds Reported-by: Wyes Karny Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Wyes Karny Link: https://lore.kernel.org/r/20240114183600.135316-1-vincent.guittot@linaro.org --- kernel/sched/cpufreq_schedutil.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 95c3c097083e..eece6244f9d2 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -133,7 +133,11 @@ unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy) if (arch_scale_freq_invariant()) return policy->cpuinfo.max_freq; - return policy->cur; + /* + * Apply a 25% margin so that we select a higher frequency than + * the current one before the CPU is fully busy: + */ + return policy->cur + (policy->cur >> 2); } /** -- cgit From 22c7fa171a02d310e3a3f6ed46a698ca8a0060ed Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Mon, 15 Jan 2024 09:20:27 +0100 Subject: bpf: Reject variable offset alu on PTR_TO_FLOW_KEYS For PTR_TO_FLOW_KEYS, check_flow_keys_access() only uses fixed off for validation. However, variable offset ptr alu is not prohibited for this ptr kind. So the variable offset is not checked. The following prog is accepted: func#0 @0 0: R1=ctx() R10=fp0 0: (bf) r6 = r1 ; R1=ctx() R6_w=ctx() 1: (79) r7 = *(u64 *)(r6 +144) ; R6_w=ctx() R7_w=flow_keys() 2: (b7) r8 = 1024 ; R8_w=1024 3: (37) r8 /= 1 ; R8_w=scalar() 4: (57) r8 &= 1024 ; R8_w=scalar(smin=smin32=0, smax=umax=smax32=umax32=1024,var_off=(0x0; 0x400)) 5: (0f) r7 += r8 mark_precise: frame0: last_idx 5 first_idx 0 subseq_idx -1 mark_precise: frame0: regs=r8 stack= before 4: (57) r8 &= 1024 mark_precise: frame0: regs=r8 stack= before 3: (37) r8 /= 1 mark_precise: frame0: regs=r8 stack= before 2: (b7) r8 = 1024 6: R7_w=flow_keys(smin=smin32=0,smax=umax=smax32=umax32=1024,var_off =(0x0; 0x400)) R8_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=1024, var_off=(0x0; 0x400)) 6: (79) r0 = *(u64 *)(r7 +0) ; R0_w=scalar() 7: (95) exit This prog loads flow_keys to r7, and adds the variable offset r8 to r7, and finally causes out-of-bounds access: BUG: unable to handle page fault for address: ffffc90014c80038 [...] Call Trace: bpf_dispatcher_nop_func include/linux/bpf.h:1231 [inline] __bpf_prog_run include/linux/filter.h:651 [inline] bpf_prog_run include/linux/filter.h:658 [inline] bpf_prog_run_pin_on_cpu include/linux/filter.h:675 [inline] bpf_flow_dissect+0x15f/0x350 net/core/flow_dissector.c:991 bpf_prog_test_run_flow_dissector+0x39d/0x620 net/bpf/test_run.c:1359 bpf_prog_test_run kernel/bpf/syscall.c:4107 [inline] __sys_bpf+0xf8f/0x4560 kernel/bpf/syscall.c:5475 __do_sys_bpf kernel/bpf/syscall.c:5561 [inline] __se_sys_bpf kernel/bpf/syscall.c:5559 [inline] __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:5559 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Fix this by rejecting ptr alu with variable offset on flow_keys. Applying the patch rejects the program with "R7 pointer arithmetic on flow_keys prohibited". 
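For readers who prefer C to verifier assembly, the rejected pattern above
corresponds roughly to the following flow dissector program (an illustrative
sketch; the helper used to obtain an unknown scalar and the program layout
are this note's assumptions, not taken from the reproducer):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("flow_dissector")
    int dissect(struct __sk_buff *skb)
    {
    	struct bpf_flow_keys *keys = skb->flow_keys;
    	/* unknown scalar; verifier can only bound it to var_off=(0x0; 0x400) */
    	__u64 off = bpf_get_prandom_u32() & 1024;

    	/* variable-offset alu on a PTR_TO_FLOW_KEYS pointer: accepted
    	 * before this patch, rejected afterwards
    	 */
    	return *(__u64 *)((void *)keys + off);
    }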
Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Signed-off-by: Hao Sun Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240115082028.9992-1-sunhao.th@gmail.com --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index adbf330d364b..65f598694d55 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12826,6 +12826,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } switch (base_type(ptr_reg->type)) { + case PTR_TO_FLOW_KEYS: + if (known) + break; + fallthrough; case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) -- cgit From 4f41d30cd6dc865c3cbc1a852372321eba6d4e4c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 25 Nov 2023 13:05:04 +0100 Subject: kdb: Fix a potential buffer overflow in kdb_local() When appending "[defcmd]" to 'kdb_prompt_str', the size of the string already in the buffer should be taken into account. An option could be to switch from strncat() to strlcat() which does the correct test to avoid such an overflow. However, this actually looks as dead code, because 'defcmd_in_progress' can't be true here. See a more detailed explanation at [1]. [1]: https://lore.kernel.org/all/CAD=FV=WSh7wKN7Yp-3wWiDgX4E3isQ8uh0LCzTmd1v9Cg9j+nQ@mail.gmail.com/ Fixes: 5d5314d6795f ("kdb: core for kgdb back end (1 of 2)") Signed-off-by: Christophe JAILLET Reviewed-by: Douglas Anderson --- kernel/debug/kdb/kdb_main.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 6b213c8252d6..d05066cb40b2 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1348,8 +1348,6 @@ do_full_getstr: /* PROMPT can only be set if we have MEM_READ permission. */ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), raw_smp_processor_id()); - if (defcmd_in_progress) - strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN); /* * Fetch command from keyboard -- cgit From 66967a32d3b16ed447e76fed4d946bab52e43d86 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:40 -0800 Subject: bpf: extract bpf_ctx_convert_map logic and make it more reusable Refactor btf_get_prog_ctx_type() a bit to allow reuse of bpf_ctx_convert_map logic in more than one places. Simplify interface by returning btf_type instead of btf_member (field reference in BTF). To do the above we need to touch and start untangling btf_translate_to_vmlinux() implementation. We do the bare minimum to not regress anything for btf_translate_to_vmlinux(), but its implementation is very questionable for what it claims to be doing. Mapping kfunc argument types to kernel corresponding types conceptually is quite different from recognizing program context types. Fixing this is out of scope for this change though. 
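For orientation (editor's sketch, simplified from the macro-generated
original in btf.c): bpf_ctx_convert lays out one member pair per program
type, which is why the two lookups below index member 2*i for the
user-visible context type and 2*i+1 for the kernel-internal one:

    struct bpf_ctx_convert {
    	/* one pair per BPF_PROG_TYPE(); e.g. for SCHED_CLS: */
    	struct __sk_buff BPF_PROG_TYPE_SCHED_CLS_prog;
    	struct sk_buff   BPF_PROG_TYPE_SCHED_CLS_kern;
    	/* ... */
    };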
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 2 +- kernel/bpf/btf.c | 71 +++++++++++++++++++++++++++++++++-------------------- 2 files changed, 46 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index 59d404e22814..cf5c6ff48981 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -512,7 +512,7 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); -const struct btf_member * +const struct btf_type * btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 51e8b4bee0c8..10ac9efc662d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5615,21 +5615,46 @@ static u8 bpf_ctx_convert_map[] = { #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE -const struct btf_member * -btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type, - int arg) +static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type) { const struct btf_type *conv_struct; - const struct btf_type *ctx_struct; const struct btf_member *ctx_type; - const char *tname, *ctx_tname; conv_struct = bpf_ctx_convert.t; - if (!conv_struct) { - bpf_log(log, "btf_vmlinux is malformed\n"); + if (!conv_struct) return NULL; - } + /* prog_type is valid bpf program type. No need for bounds check. */ + ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; + /* ctx_type is a pointer to prog_ctx_type in vmlinux. + * Like 'struct __sk_buff' + */ + return btf_type_by_id(btf_vmlinux, ctx_type->type); +} + +static int find_kern_ctx_type_id(enum bpf_prog_type prog_type) +{ + const struct btf_type *conv_struct; + const struct btf_member *ctx_type; + + conv_struct = bpf_ctx_convert.t; + if (!conv_struct) + return -EFAULT; + /* prog_type is valid bpf program type. No need for bounds check. */ + ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; + /* ctx_type is a pointer to prog_ctx_type in vmlinux. + * Like 'struct sk_buff' + */ + return ctx_type->type; +} + +const struct btf_type * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) +{ + const struct btf_type *ctx_type; + const char *tname, *ctx_tname; + t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); @@ -5646,17 +5671,15 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, bpf_log(log, "arg#%d struct doesn't have a name\n", arg); return NULL; } - /* prog_type is valid bpf program type. No need for bounds check. */ - ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; - /* ctx_struct is a pointer to prog_ctx_type in vmlinux. 
- * Like 'struct __sk_buff' - */ - ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type); - if (!ctx_struct) + + ctx_type = find_canonical_prog_ctx_type(prog_type); + if (!ctx_type) { + bpf_log(log, "btf_vmlinux is malformed\n"); /* should not happen */ return NULL; + } again: - ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off); + ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); if (!ctx_tname) { /* should not happen */ bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); @@ -5677,10 +5700,10 @@ again: /* bpf_user_pt_regs_t is a typedef, so resolve it to * underlying struct and check name again */ - if (!btf_type_is_modifier(ctx_struct)) + if (!btf_type_is_modifier(ctx_type)) return NULL; - while (btf_type_is_modifier(ctx_struct)) - ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type); + while (btf_type_is_modifier(ctx_type)) + ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); goto again; } return ctx_type; @@ -5692,13 +5715,9 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, enum bpf_prog_type prog_type, int arg) { - const struct btf_member *prog_ctx_type, *kern_ctx_type; - - prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg); - if (!prog_ctx_type) + if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg)) return -ENOENT; - kern_ctx_type = prog_ctx_type + 1; - return kern_ctx_type->type; + return find_kern_ctx_type_id(prog_type); } int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) -- cgit From 0ba971511d16603599f947459e59b435cc465b0d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:41 -0800 Subject: bpf: enforce types for __arg_ctx-tagged arguments in global subprogs Add enforcement of expected types for context arguments tagged with arg:ctx (__arg_ctx) tag. First, any program type will accept generic `void *` context type when combined with __arg_ctx tag. Besides accepting "canonical" struct names and `void *`, for a bunch of program types for which program context is actually a named struct, we allows a bunch of pragmatic exceptions to match real-world and expected usage: - for both kprobes and perf_event we allow `bpf_user_pt_regs_t *` as canonical context argument type, where `bpf_user_pt_regs_t` is a *typedef*, not a struct; - for kprobes, we also always accept `struct pt_regs *`, as that's what actually is passed as a context to any kprobe program; - for perf_event, we resolve typedefs (unless it's `bpf_user_pt_regs_t`) down to actual struct type and accept `struct pt_regs *`, or `struct user_pt_regs *`, or `struct user_regs_struct *`, depending on the actual struct type kernel architecture points `bpf_user_pt_regs_t` typedef to; otherwise, canonical `struct bpf_perf_event_data *` is expected; - for raw_tp/raw_tp.w programs, `u64/long *` are accepted, as that's what's expected with BPF_PROG() usage; otherwise, canonical `struct bpf_raw_tracepoint_args *` is expected; - tp_btf supports both `struct bpf_raw_tracepoint_args *` and `u64 *` formats, both are coded as expections as tp_btf is actually a TRACING program type, which has no canonical context type; - iterator programs accept `struct bpf_iter__xxx *` structs, currently with no further iterator-type specific enforcement; - fentry/fexit/fmod_ret/lsm/struct_ops all accept `u64 *`; - classic tracepoint programs, as well as syscall and freplace programs allow any user-provided type. In all other cases kernel will enforce exact match of struct name to expected canonical type. 
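As a concrete illustration (editor's example, not part of the patch), a
global subprogram for a kprobe program would tag its context argument like
this; the __arg_ctx definition shown mirrors the "arg:ctx" btf_decl_tag the
verifier looks for:

    #define __arg_ctx __attribute__((btf_decl_tag("arg:ctx")))

    /* accepted for kprobes: struct pt_regs * (or bpf_user_pt_regs_t *) */
    __attribute__((noinline))
    int handle_regs(struct pt_regs *regs __arg_ctx)
    {
    	return !!regs;
    }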
And if user-provided type doesn't match that expectation, verifier will emit helpful message with expected type name. Note a bit unnatural way the check is done after processing all the arguments. This is done to avoid conflict between bpf and bpf-next trees. Once trees converge, a small follow up patch will place a simple btf_validate_prog_ctx_type() check into a proper ARG_PTR_TO_CTX branch (which bpf-next tree patch refactored already), removing duplicated arg:ctx detection logic. Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 10ac9efc662d..596471189176 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5709,6 +5709,149 @@ again: return ctx_type; } +/* forward declarations for arch-specific underlying types of + * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef + * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still + * works correctly with __builtin_types_compatible_p() on respective + * architectures + */ +struct user_regs_struct; +struct user_pt_regs; + +static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int arg, + enum bpf_prog_type prog_type, + enum bpf_attach_type attach_type) +{ + const struct btf_type *ctx_type; + const char *tname, *ctx_tname; + + if (!btf_is_ptr(t)) { + bpf_log(log, "arg#%d type isn't a pointer\n", arg); + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); + + /* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */ + if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) { + while (btf_type_is_modifier(t) && !btf_type_is_typedef(t)) + t = btf_type_by_id(btf, t->type); + + if (btf_type_is_typedef(t)) { + tname = btf_name_by_offset(btf, t->name_off); + if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0) + return 0; + } + } + + /* all other program types don't use typedefs for context type */ + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + + /* `void *ctx __arg_ctx` is always valid */ + if (btf_type_is_void(t)) + return 0; + + tname = btf_name_by_offset(btf, t->name_off); + if (str_is_empty(tname)) { + bpf_log(log, "arg#%d type doesn't have a name\n", arg); + return -EINVAL; + } + + /* special cases */ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return 0; + break; + case BPF_PROG_TYPE_PERF_EVENT: + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) && + __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return 0; + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) && + __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0) + return 0; + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) && + __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0) + return 0; + break; + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + case BPF_PROG_TYPE_TRACING: + switch (attach_type) { + case BPF_TRACE_RAW_TP: + /* tp_btf program is TRACING, so need special case here */ + if (__btf_type_is_struct(t) && 
+ strcmp(tname, "bpf_raw_tracepoint_args") == 0) + return 0; + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + case BPF_TRACE_ITER: + /* allow struct bpf_iter__xxx types only */ + if (__btf_type_is_struct(t) && + strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0) + return 0; + break; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + default: + break; + } + break; + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_SYSCALL: + case BPF_PROG_TYPE_EXT: + return 0; /* anything goes */ + default: + break; + } + + ctx_type = find_canonical_prog_ctx_type(prog_type); + if (!ctx_type) { + /* should not happen */ + bpf_log(log, "btf_vmlinux is malformed\n"); + return -EINVAL; + } + + /* resolve typedefs and check that underlying structs are matching as well */ + while (btf_type_is_modifier(ctx_type)) + ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); + + /* if program type doesn't have distinctly named struct type for + * context, then __arg_ctx argument can only be `void *`, which we + * already checked above + */ + if (!__btf_type_is_struct(ctx_type)) { + bpf_log(log, "arg#%d should be void pointer\n", arg); + return -EINVAL; + } + + ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); + if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) { + bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname); + return -EINVAL; + } + + return 0; +} + static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, @@ -6953,6 +7096,23 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return -EINVAL; } + for (i = 0; i < nargs; i++) { + const char *tag; + + if (sub->args[i].arg_type != ARG_PTR_TO_CTX) + continue; + + /* check if arg has "arg:ctx" tag */ + t = btf_type_by_id(btf, args[i].type); + tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:"); + if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0) + continue; + + if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type, + prog->expected_attach_type)) + return -EINVAL; + } + sub->arg_cnt = nargs; sub->args_cached = true; -- cgit From 71fee48fb772ac4f6cfa63dbebc5629de8b4cc09 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 15 Jan 2024 17:35:55 +0100 Subject: tick-sched: Fix idle and iowait sleeptime accounting vs CPU hotplug When offlining and onlining CPUs the overall reported idle and iowait times as reported by /proc/stat jump backward and forward: cpu 132 0 176 225249 47 6 6 21 0 0 cpu0 80 0 115 112575 33 3 4 18 0 0 cpu1 52 0 60 112673 13 3 1 2 0 0 cpu 133 0 177 226681 47 6 6 21 0 0 cpu0 80 0 116 113387 33 3 4 18 0 0 cpu 133 0 178 114431 33 6 6 21 0 0 <---- jump backward cpu0 80 0 116 114247 33 3 4 18 0 0 cpu1 52 0 61 183 0 3 1 2 0 0 <---- idle + iowait start with 0 cpu 133 0 178 228956 47 6 6 21 0 0 <---- jump forward cpu0 81 0 117 114929 33 3 4 18 0 0 Reason for this is that get_idle_time() in fs/proc/stat.c has different sources for both values depending on if a CPU is online or offline: - if a CPU is online the values may be taken from its per cpu tick_cpu_sched structure - if a CPU is offline the values are taken from its per cpu cpustat structure The problem is that the per cpu tick_cpu_sched structure is set to zero on CPU offline. 
See tick_cancel_sched_timer() in kernel/time/tick-sched.c. Therefore when a CPU is brought offline and online afterwards both its idle and iowait sleeptime will be zero, causing a jump backward in total system idle and iowait sleeptime. In a similar way if a CPU is then brought offline again the total idle and iowait sleeptimes will jump forward. It looks like this behavior was introduced with commit 4b0c0f294f60 ("tick: Cleanup NOHZ per cpu data on cpu down"). This was only noticed now on s390, since we switched to generic idle time reporting with commit be76ea614460 ("s390/idle: remove arch_cpu_idle_time() and corresponding code"). Fix this by preserving the values of idle_sleeptime and iowait_sleeptime members of the per-cpu tick_sched structure on CPU hotplug. Fixes: 4b0c0f294f60 ("tick: Cleanup NOHZ per cpu data on cpu down") Reported-by: Gerald Schaefer Signed-off-by: Heiko Carstens Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240115163555.1004144-1-hca@linux.ibm.com --- kernel/time/tick-sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a17d26002831..d2501673028d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1576,13 +1576,18 @@ void tick_setup_sched_timer(void) void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t idle_sleeptime, iowait_sleeptime; # ifdef CONFIG_HIGH_RES_TIMERS if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); # endif + idle_sleeptime = ts->idle_sleeptime; + iowait_sleeptime = ts->iowait_sleeptime; memset(ts, 0, sizeof(*ts)); + ts->idle_sleeptime = idle_sleeptime; + ts->iowait_sleeptime = iowait_sleeptime; } #endif -- cgit
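Editor's note on the dual bookkeeping described in the last commit above:
the /proc/stat reader chooses its source per CPU roughly as follows
(paraphrased and abridged from fs/proc/stat.c, not part of the patch):

    static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
    {
    	u64 idle, idle_usecs = -1ULL;

    	if (cpu_online(cpu))
    		idle_usecs = get_cpu_idle_time_us(cpu, NULL); /* tick_cpu_sched */

    	if (idle_usecs == -1ULL)
    		/* !NO_HZ or CPU offline: fall back to cpustat */
    		idle = kcs->cpustat[CPUTIME_IDLE];
    	else
    		idle = idle_usecs * NSEC_PER_USEC;

    	return idle;
    }

Because tick_cancel_sched_timer() zeroed the whole per-CPU tick_cpu_sched
entry, the online branch restarted from zero after every offline/online
cycle; that is the backward jump the patch removes by preserving
idle_sleeptime and iowait_sleeptime across the memset().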