aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/bpf/arraymap.c18
-rw-r--r--kernel/bpf/cpumap.c31
-rw-r--r--kernel/bpf/devmap.c8
-rw-r--r--kernel/bpf/disasm.h4
-rw-r--r--kernel/bpf/hashtab.c103
-rw-r--r--kernel/bpf/lpm_trie.c7
-rw-r--r--kernel/bpf/offload.c222
-rw-r--r--kernel/bpf/sockmap.c8
-rw-r--r--kernel/bpf/stackmap.c6
-rw-r--r--kernel/bpf/syscall.c71
-rw-r--r--kernel/bpf/verifier.c14
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/fail_function.c349
-rw-r--r--kernel/kprobes.c163
-rw-r--r--kernel/module.c8
-rw-r--r--kernel/sched/completion.c5
-rw-r--r--kernel/sched/membarrier.c2
-rw-r--r--kernel/trace/Kconfig6
-rw-r--r--kernel/trace/bpf_trace.c11
-rw-r--r--kernel/trace/ring_buffer.c62
-rw-r--r--kernel/trace/trace_kprobe.c33
-rw-r--r--kernel/trace/trace_probe.h12
23 files changed, 800 insertions, 346 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 172d151d429c..f85ae5dfa474 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_KCOV) += kcov.o
obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index aaa319848e7d..ab94d304a634 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -56,7 +56,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
u32 elem_size, index_mask, max_entries;
bool unpriv = !capable(CAP_SYS_ADMIN);
struct bpf_array *array;
- u64 array_size;
+ u64 array_size, mask64;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -74,13 +74,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
elem_size = round_up(attr->value_size, 8);
max_entries = attr->max_entries;
- index_mask = roundup_pow_of_two(max_entries) - 1;
- if (unpriv)
+ /* On 32 bit archs roundup_pow_of_two() with max_entries that has
+ * upper most bit set in u32 space is undefined behavior due to
+ * resulting 1U << 32, so do it manually here in u64 space.
+ */
+ mask64 = fls_long(max_entries - 1);
+ mask64 = 1ULL << mask64;
+ mask64 -= 1;
+
+ index_mask = mask64;
+ if (unpriv) {
/* round up array size to nearest power of 2,
* since cpu will speculate within index_mask limits
*/
max_entries = index_mask + 1;
+ /* Check for overflows. */
+ if (max_entries < attr->max_entries)
+ return ERR_PTR(-E2BIG);
+ }
array_size = sizeof(*array);
if (percpu)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ce5b669003b2..fbfdada6caee 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
if (!cmap)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- cmap->map.map_type = attr->map_type;
- cmap->map.key_size = attr->key_size;
- cmap->map.value_size = attr->value_size;
- cmap->map.max_entries = attr->max_entries;
- cmap->map.map_flags = attr->map_flags;
- cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+ bpf_map_init_from_attr(&cmap->map, attr);
/* Pre-limit array size based on NR_CPUS, not final CPU check */
if (cmap->map.max_entries > NR_CPUS) {
@@ -143,7 +137,7 @@ free_cmap:
return ERR_PTR(err);
}
-void __cpu_map_queue_destructor(void *ptr)
+static void __cpu_map_queue_destructor(void *ptr)
{
/* The tear-down procedure should have made sure that queue is
* empty. See __cpu_map_entry_replace() and work-queue
@@ -222,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
return xdp_pkt;
}
-struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
- struct xdp_pkt *xdp_pkt)
+static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_pkt *xdp_pkt)
{
unsigned int frame_size;
void *pkt_data_start;
@@ -337,7 +331,8 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
-struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
+ int map_id)
{
gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
struct bpf_cpu_map_entry *rcpu;
@@ -395,7 +390,7 @@ free_rcu:
return NULL;
}
-void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct rcu_head *rcu)
{
struct bpf_cpu_map_entry *rcpu;
int cpu;
@@ -438,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu)
* cpu_map_kthread_stop, which waits for an RCU graze period before
* stopping kthread, emptying the queue.
*/
-void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
- u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+ u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
{
struct bpf_cpu_map_entry *old_rcpu;
@@ -451,7 +446,7 @@ void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
}
}
-int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static int cpu_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
u32 key_cpu = *(u32 *)key;
@@ -464,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
-int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
struct bpf_cpu_map_entry *rcpu;
@@ -502,7 +497,7 @@ int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
return 0;
}
-void cpu_map_free(struct bpf_map *map)
+static void cpu_map_free(struct bpf_map *map)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
int cpu;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ebdef54bf7df..565f9ece9115 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (!dtab)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- dtab->map.map_type = attr->map_type;
- dtab->map.key_size = attr->key_size;
- dtab->map.value_size = attr->value_size;
- dtab->map.max_entries = attr->max_entries;
- dtab->map.map_flags = attr->map_flags;
- dtab->map.numa_node = bpf_map_attr_numa_node(attr);
+ bpf_map_init_from_attr(&dtab->map, attr);
/* make sure page count doesn't overflow */
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e0857d016f89..266fe8ee542b 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -29,8 +29,8 @@ extern const char *const bpf_class_string[8];
const char *func_id_name(int id);
-typedef void (*bpf_insn_print_t)(struct bpf_verifier_env *env,
- const char *, ...);
+typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env,
+ const char *, ...);
typedef const char *(*bpf_insn_revmap_call_t)(void *private_data,
const struct bpf_insn *insn);
typedef const char *(*bpf_insn_print_imm_t)(void *private_data,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3905d4bc5b80..b76828f23b49 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -227,7 +227,7 @@ static int alloc_extra_elems(struct bpf_htab *htab)
}
/* Called from syscall */
-static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+static int htab_map_alloc_check(union bpf_attr *attr)
{
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
@@ -241,9 +241,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
int numa_node = bpf_map_attr_numa_node(attr);
- struct bpf_htab *htab;
- int err, i;
- u64 cost;
BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
offsetof(struct htab_elem, hash_node.pprev));
@@ -254,40 +251,68 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
/* LRU implementation is much complicated than other
* maps. Hence, limit to CAP_SYS_ADMIN for now.
*/
- return ERR_PTR(-EPERM);
+ return -EPERM;
if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
/* reserved bits should not be used */
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (!lru && percpu_lru)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (lru && !prealloc)
- return ERR_PTR(-ENOTSUPP);
+ return -ENOTSUPP;
if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
+
+ /* check sanity of attributes.
+ * value_size == 0 may be allowed in the future to use map as a set
+ */
+ if (attr->max_entries == 0 || attr->key_size == 0 ||
+ attr->value_size == 0)
+ return -EINVAL;
+
+ if (attr->key_size > MAX_BPF_STACK)
+ /* eBPF programs initialize keys on stack, so they cannot be
+ * larger than max stack size
+ */
+ return -E2BIG;
+
+ if (attr->value_size >= KMALLOC_MAX_SIZE -
+ MAX_BPF_STACK - sizeof(struct htab_elem))
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements via bpf syscall. This check also makes
+ * sure that the elem_size doesn't overflow and it's
+ * kmalloc-able later in htab_map_update_elem()
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+ bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ /* percpu_lru means each cpu has its own LRU list.
+ * it is different from BPF_MAP_TYPE_PERCPU_HASH where
+ * the map's value itself is percpu. percpu_lru has
+ * nothing to do with the map's value.
+ */
+ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
+ bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+ struct bpf_htab *htab;
+ int err, i;
+ u64 cost;
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- htab->map.map_type = attr->map_type;
- htab->map.key_size = attr->key_size;
- htab->map.value_size = attr->value_size;
- htab->map.max_entries = attr->max_entries;
- htab->map.map_flags = attr->map_flags;
- htab->map.numa_node = numa_node;
-
- /* check sanity of attributes.
- * value_size == 0 may be allowed in the future to use map as a set
- */
- err = -EINVAL;
- if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
- htab->map.value_size == 0)
- goto free_htab;
+ bpf_map_init_from_attr(&htab->map, attr);
if (percpu_lru) {
/* ensure each CPU's lru list has >=1 elements.
@@ -304,22 +329,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
/* hash table size must be power of 2 */
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
- err = -E2BIG;
- if (htab->map.key_size > MAX_BPF_STACK)
- /* eBPF programs initialize keys on stack, so they cannot be
- * larger than max stack size
- */
- goto free_htab;
-
- if (htab->map.value_size >= KMALLOC_MAX_SIZE -
- MAX_BPF_STACK - sizeof(struct htab_elem))
- /* if value_size is bigger, the user space won't be able to
- * access the elements via bpf syscall. This check also makes
- * sure that the elem_size doesn't overflow and it's
- * kmalloc-able later in htab_map_update_elem()
- */
- goto free_htab;
-
htab->elem_size = sizeof(struct htab_elem) +
round_up(htab->map.key_size, 8);
if (percpu)
@@ -327,6 +336,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
else
htab->elem_size += round_up(htab->map.value_size, 8);
+ err = -E2BIG;
/* prevent zero size kmalloc and check for u32 overflow */
if (htab->n_buckets == 0 ||
htab->n_buckets > U32_MAX / sizeof(struct bucket))
@@ -1143,6 +1153,7 @@ static void htab_map_free(struct bpf_map *map)
}
const struct bpf_map_ops htab_map_ops = {
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1153,6 +1164,7 @@ const struct bpf_map_ops htab_map_ops = {
};
const struct bpf_map_ops htab_lru_map_ops = {
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1236,6 +1248,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
}
const struct bpf_map_ops htab_percpu_map_ops = {
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1245,6 +1258,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
};
const struct bpf_map_ops htab_lru_percpu_map_ops = {
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1253,11 +1267,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
};
-static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
+static int fd_htab_map_alloc_check(union bpf_attr *attr)
{
if (attr->value_size != sizeof(u32))
- return ERR_PTR(-EINVAL);
- return htab_map_alloc(attr);
+ return -EINVAL;
+ return htab_map_alloc_check(attr);
}
static void fd_htab_map_free(struct bpf_map *map)
@@ -1328,7 +1342,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
if (IS_ERR(inner_map_meta))
return inner_map_meta;
- map = fd_htab_map_alloc(attr);
+ map = htab_map_alloc(attr);
if (IS_ERR(map)) {
bpf_map_meta_free(inner_map_meta);
return map;
@@ -1372,6 +1386,7 @@ static void htab_of_map_free(struct bpf_map *map)
}
const struct bpf_map_ops htab_of_maps_map_ops = {
+ .map_alloc_check = fd_htab_map_alloc_check,
.map_alloc = htab_of_map_alloc,
.map_free = htab_of_map_free,
.map_get_next_key = htab_map_get_next_key,
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 885e45479680..584e02227671 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
/* copy mandatory map attributes */
- trie->map.map_type = attr->map_type;
- trie->map.key_size = attr->key_size;
- trie->map.value_size = attr->value_size;
- trie->map.max_entries = attr->max_entries;
- trie->map.map_flags = attr->map_flags;
- trie->map.numa_node = bpf_map_attr_numa_node(attr);
+ bpf_map_init_from_attr(&trie->map, attr);
trie->data_size = attr->key_size -
offsetof(struct bpf_lpm_trie_key, data);
trie->max_prefixlen = trie->data_size * 8;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 040d4e0edf3f..a88cebf368bf 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -24,15 +24,27 @@
#include <linux/rtnetlink.h>
#include <linux/rwsem.h>
-/* Protects bpf_prog_offload_devs and offload members of all progs.
+/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
+ * of all progs.
* RTNL lock cannot be taken when holding this lock.
*/
static DECLARE_RWSEM(bpf_devs_lock);
static LIST_HEAD(bpf_prog_offload_devs);
+static LIST_HEAD(bpf_map_offload_devs);
+
+static int bpf_dev_offload_check(struct net_device *netdev)
+{
+ if (!netdev)
+ return -EINVAL;
+ if (!netdev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
+ return 0;
+}
int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
{
- struct bpf_dev_offload *offload;
+ struct bpf_prog_offload *offload;
+ int err;
if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
attr->prog_type != BPF_PROG_TYPE_XDP)
@@ -49,12 +61,15 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
attr->prog_ifindex);
- if (!offload->netdev)
- goto err_free;
+ err = bpf_dev_offload_check(offload->netdev);
+ if (err)
+ goto err_maybe_put;
down_write(&bpf_devs_lock);
- if (offload->netdev->reg_state != NETREG_REGISTERED)
+ if (offload->netdev->reg_state != NETREG_REGISTERED) {
+ err = -EINVAL;
goto err_unlock;
+ }
prog->aux->offload = offload;
list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
dev_put(offload->netdev);
@@ -63,16 +78,17 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
return 0;
err_unlock:
up_write(&bpf_devs_lock);
- dev_put(offload->netdev);
-err_free:
+err_maybe_put:
+ if (offload->netdev)
+ dev_put(offload->netdev);
kfree(offload);
- return -EINVAL;
+ return err;
}
static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
struct netdev_bpf *data)
{
- struct bpf_dev_offload *offload = prog->aux->offload;
+ struct bpf_prog_offload *offload = prog->aux->offload;
struct net_device *netdev;
ASSERT_RTNL();
@@ -80,8 +96,6 @@ static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
if (!offload)
return -ENODEV;
netdev = offload->netdev;
- if (!netdev->netdev_ops->ndo_bpf)
- return -EOPNOTSUPP;
data->command = cmd;
@@ -110,7 +124,7 @@ exit_unlock:
int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx)
{
- struct bpf_dev_offload *offload;
+ struct bpf_prog_offload *offload;
int ret = -ENODEV;
down_read(&bpf_devs_lock);
@@ -124,7 +138,7 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
{
- struct bpf_dev_offload *offload = prog->aux->offload;
+ struct bpf_prog_offload *offload = prog->aux->offload;
struct netdev_bpf data = {};
data.offload.prog = prog;
@@ -238,11 +252,184 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
const struct bpf_prog_ops bpf_offload_prog_ops = {
};
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+ enum bpf_netdev_command cmd)
+{
+ struct netdev_bpf data = {};
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ data.command = cmd;
+ data.offmap = offmap;
+ /* Caller must make sure netdev is valid */
+ netdev = offmap->netdev;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_offloaded_map *offmap;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ if (attr->map_type != BPF_MAP_TYPE_HASH)
+ return ERR_PTR(-EINVAL);
+
+ offmap = kzalloc(sizeof(*offmap), GFP_USER);
+ if (!offmap)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&offmap->map, attr);
+
+ rtnl_lock();
+ down_write(&bpf_devs_lock);
+ offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
+ err = bpf_dev_offload_check(offmap->netdev);
+ if (err)
+ goto err_unlock;
+
+ err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
+ if (err)
+ goto err_unlock;
+
+ list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+
+ return &offmap->map;
+
+err_unlock:
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+ kfree(offmap);
+ return ERR_PTR(err);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+ WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+ /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+ bpf_map_free_id(&offmap->map, true);
+ list_del_init(&offmap->offloads);
+ offmap->netdev = NULL;
+}
+
+void bpf_map_offload_map_free(struct bpf_map *map)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+
+ rtnl_lock();
+ down_write(&bpf_devs_lock);
+ if (offmap->netdev)
+ __bpf_map_offload_destroy(offmap);
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+
+ kfree(offmap);
+}
+
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_lookup_elem(offmap, key, value);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_update_elem(offmap, key, value,
+ flags);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_delete_elem(offmap, key);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
+{
+ struct bpf_offloaded_map *offmap;
+ struct bpf_prog_offload *offload;
+ bool ret;
+
+ if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map))
+ return false;
+
+ down_read(&bpf_devs_lock);
+ offload = prog->aux->offload;
+ offmap = map_to_offmap(map);
+
+ ret = offload && offload->netdev == offmap->netdev;
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+static void bpf_offload_orphan_all_progs(struct net_device *netdev)
+{
+ struct bpf_prog_offload *offload, *tmp;
+
+ list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
+ if (offload->netdev == netdev)
+ __bpf_prog_offload_destroy(offload->prog);
+}
+
+static void bpf_offload_orphan_all_maps(struct net_device *netdev)
+{
+ struct bpf_offloaded_map *offmap, *tmp;
+
+ list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
+ if (offmap->netdev == netdev)
+ __bpf_map_offload_destroy(offmap);
+}
+
static int bpf_offload_notification(struct notifier_block *notifier,
ulong event, void *ptr)
{
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
- struct bpf_dev_offload *offload, *tmp;
ASSERT_RTNL();
@@ -253,11 +440,8 @@ static int bpf_offload_notification(struct notifier_block *notifier,
break;
down_write(&bpf_devs_lock);
- list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
- offloads) {
- if (offload->netdev == netdev)
- __bpf_prog_offload_destroy(offload->prog);
- }
+ bpf_offload_orphan_all_progs(netdev);
+ bpf_offload_orphan_all_maps(netdev);
up_write(&bpf_devs_lock);
break;
default:
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 079968680bc3..0314d1783d77 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -513,13 +513,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
if (!stab)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- stab->map.map_type = attr->map_type;
- stab->map.key_size = attr->key_size;
- stab->map.value_size = attr->value_size;
- stab->map.max_entries = attr->max_entries;
- stab->map.map_flags = attr->map_flags;
- stab->map.numa_node = bpf_map_attr_numa_node(attr);
+ bpf_map_init_from_attr(&stab->map, attr);
/* make sure page count doesn't overflow */
cost = (u64) stab->map.max_entries * sizeof(struct sock *);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 6c63c2222ea8..b0ecf43f5894 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -88,14 +88,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (cost >= U32_MAX - PAGE_SIZE)
goto free_smap;
- smap->map.map_type = attr->map_type;
- smap->map.key_size = attr->key_size;
+ bpf_map_init_from_attr(&smap->map, attr);
smap->map.value_size = value_size;
- smap->map.max_entries = attr->max_entries;
- smap->map.map_flags = attr->map_flags;
smap->n_buckets = n_buckets;
smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- smap->map.numa_node = bpf_map_attr_numa_node(attr);
err = bpf_map_precharge_memlock(smap->map.pages);
if (err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2bac0dc8baba..c691b9e972e3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -94,18 +94,34 @@ static int check_uarg_tail_zero(void __user *uaddr,
return 0;
}
+const struct bpf_map_ops bpf_map_offload_ops = {
+ .map_alloc = bpf_map_offload_map_alloc,
+ .map_free = bpf_map_offload_map_free,
+};
+
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
+ const struct bpf_map_ops *ops;
struct bpf_map *map;
+ int err;
- if (attr->map_type >= ARRAY_SIZE(bpf_map_types) ||
- !bpf_map_types[attr->map_type])
+ if (attr->map_type >= ARRAY_SIZE(bpf_map_types))
+ return ERR_PTR(-EINVAL);
+ ops = bpf_map_types[attr->map_type];
+ if (!ops)
return ERR_PTR(-EINVAL);
- map = bpf_map_types[attr->map_type]->map_alloc(attr);
+ if (ops->map_alloc_check) {
+ err = ops->map_alloc_check(attr);
+ if (err)
+ return ERR_PTR(err);
+ }
+ if (attr->map_ifindex)
+ ops = &bpf_map_offload_ops;
+ map = ops->map_alloc(attr);
if (IS_ERR(map))
return map;
- map->ops = bpf_map_types[attr->map_type];
+ map->ops = ops;
map->map_type = attr->map_type;
return map;
}
@@ -134,6 +150,16 @@ void bpf_map_area_free(void *area)
kvfree(area);
}
+void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
+{
+ map->map_type = attr->map_type;
+ map->key_size = attr->key_size;
+ map->value_size = attr->value_size;
+ map->max_entries = attr->max_entries;
+ map->map_flags = attr->map_flags;
+ map->numa_node = bpf_map_attr_numa_node(attr);
+}
+
int bpf_map_precharge_memlock(u32 pages)
{
struct user_struct *user = get_current_user();
@@ -189,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map)
return id > 0 ? 0 : id;
}
-static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
{
unsigned long flags;
+ /* Offloaded maps are removed from the IDR store when their device
+ * disappears - even if someone holds an fd to them they are unusable,
+ * the memory is gone, all ops will fail; they are simply waiting for
+ * refcnt to drop to be freed.
+ */
+ if (!map->id)
+ return;
+
if (do_idr_lock)
spin_lock_irqsave(&map_idr_lock, flags);
else
__acquire(&map_idr_lock);
idr_remove(&map_idr, map->id);
+ map->id = 0;
if (do_idr_lock)
spin_unlock_irqrestore(&map_idr_lock, flags);
@@ -378,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
return 0;
}
-#define BPF_MAP_CREATE_LAST_FIELD map_name
+#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -566,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!value)
goto free_key;
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
@@ -654,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr)
goto free_value;
/* Need to create a kthread, thus must support schedule */
- if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_update_elem(map, key, value, attr->flags);
+ goto out;
+ } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
err = map->ops->map_update_elem(map, key, value, attr->flags);
goto out;
}
@@ -731,6 +771,11 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_delete_elem(map, key);
+ goto out;
+ }
+
preempt_disable();
__this_cpu_inc(bpf_prog_active);
rcu_read_lock();
@@ -738,7 +783,7 @@ static int map_delete_elem(union bpf_attr *attr)
rcu_read_unlock();
__this_cpu_dec(bpf_prog_active);
preempt_enable();
-
+out:
if (!err)
trace_bpf_map_delete_elem(map, ufd, key);
kfree(key);
@@ -788,9 +833,15 @@ static int map_get_next_key(union bpf_attr *attr)
if (!next_key)
goto free_key;
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_get_next_key(map, key, next_key);
+ goto out;
+ }
+
rcu_read_lock();
err = map->ops->map_get_next_key(map, key, next_key);
rcu_read_unlock();
+out:
if (err)
goto free_next_key;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 48b61caa94cb..2e7a43edf264 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3091,6 +3091,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
+ verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
+ return -EINVAL;
+ }
+
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -4816,6 +4821,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
}
+
+ if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+ !bpf_offload_dev_match(prog, map)) {
+ verbose(env, "offload device mismatch between prog and map\n");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -5354,7 +5366,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
map_ptr = env->insn_aux_data[i + delta].map_ptr;
if (map_ptr == BPF_MAP_PTR_POISON) {
- verbose(env, "tail_call obusing map_ptr\n");
+ verbose(env, "tail_call abusing map_ptr\n");
return -EINVAL;
}
if (!map_ptr->unpriv_array)
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b3663896278e..4f63597c824d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(contig_page_data);
#endif
#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_SYMBOL_ARRAY(mem_section);
VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
VMCOREINFO_STRUCT_SIZE(mem_section);
VMCOREINFO_OFFSET(mem_section, section_mem_map);
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
new file mode 100644
index 000000000000..21b0122cb39c
--- /dev/null
+++ b/kernel/fail_function.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fail_function.c: Function-based error injection
+ */
+#include <linux/error-injection.h>
+#include <linux/debugfs.h>
+#include <linux/fault-inject.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs);
+
+struct fei_attr {
+ struct list_head list;
+ struct kprobe kp;
+ unsigned long retval;
+};
+static DEFINE_MUTEX(fei_lock);
+static LIST_HEAD(fei_attr_list);
+static DECLARE_FAULT_ATTR(fei_fault_attr);
+static struct dentry *fei_debugfs_dir;
+
+static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv)
+{
+ switch (get_injectable_error_type(addr)) {
+ case EI_ETYPE_NULL:
+ if (retv != 0)
+ return 0;
+ break;
+ case EI_ETYPE_ERRNO:
+ if (retv < (unsigned long)-MAX_ERRNO)
+ return (unsigned long)-EINVAL;
+ break;
+ case EI_ETYPE_ERRNO_NULL:
+ if (retv != 0 && retv < (unsigned long)-MAX_ERRNO)
+ return (unsigned long)-EINVAL;
+ break;
+ }
+
+ return retv;
+}
+
+static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr)
+{
+ struct fei_attr *attr;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (attr) {
+ attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL);
+ if (!attr->kp.symbol_name) {
+ kfree(attr);
+ return NULL;
+ }
+ attr->kp.pre_handler = fei_kprobe_handler;
+ attr->retval = adjust_error_retval(addr, 0);
+ INIT_LIST_HEAD(&attr->list);
+ }
+ return attr;
+}
+
+static void fei_attr_free(struct fei_attr *attr)
+{
+ if (attr) {
+ kfree(attr->kp.symbol_name);
+ kfree(attr);
+ }
+}
+
+static struct fei_attr *fei_attr_lookup(const char *sym)
+{
+ struct fei_attr *attr;
+
+ list_for_each_entry(attr, &fei_attr_list, list) {
+ if (!strcmp(attr->kp.symbol_name, sym))
+ return attr;
+ }
+
+ return NULL;
+}
+
+static bool fei_attr_is_valid(struct fei_attr *_attr)
+{
+ struct fei_attr *attr;
+
+ list_for_each_entry(attr, &fei_attr_list, list) {
+ if (attr == _attr)
+ return true;
+ }
+
+ return false;
+}
+
+static int fei_retval_set(void *data, u64 val)
+{
+ struct fei_attr *attr = data;
+ unsigned long retv = (unsigned long)val;
+ int err = 0;
+
+ mutex_lock(&fei_lock);
+ /*
+ * Since this operation can be done after retval file is removed,
+ * It is safer to check the attr is still valid before accessing
+ * its member.
+ */
+ if (!fei_attr_is_valid(attr)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (attr->kp.addr) {
+ if (adjust_error_retval((unsigned long)attr->kp.addr,
+ val) != retv)
+ err = -EINVAL;
+ }
+ if (!err)
+ attr->retval = val;
+out:
+ mutex_unlock(&fei_lock);
+
+ return err;
+}
+
+static int fei_retval_get(void *data, u64 *val)
+{
+ struct fei_attr *attr = data;
+ int err = 0;
+
+ mutex_lock(&fei_lock);
+ /* Here we also validate @attr to ensure it still exists. */
+ if (!fei_attr_is_valid(attr))
+ err = -ENOENT;
+ else
+ *val = attr->retval;
+ mutex_unlock(&fei_lock);
+
+ return err;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set,
+ "%llx\n");
+
+static int fei_debugfs_add_attr(struct fei_attr *attr)
+{
+ struct dentry *dir;
+
+ dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir);
+ if (!dir)
+ return -ENOMEM;
+
+ if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) {
+ debugfs_remove_recursive(dir);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void fei_debugfs_remove_attr(struct fei_attr *attr)
+{
+ struct dentry *dir;
+
+ dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir);
+ if (dir)
+ debugfs_remove_recursive(dir);
+}
+
+static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
+{
+ struct fei_attr *attr = container_of(kp, struct fei_attr, kp);
+
+ if (should_fail(&fei_fault_attr, 1)) {
+ regs_set_return_value(regs, attr->retval);
+ override_function_with_return(regs);
+ /* Kprobe specific fixup */
+ reset_current_kprobe();
+ preempt_enable_no_resched();
+ return 1;
+ }
+
+ return 0;
+}
+NOKPROBE_SYMBOL(fei_kprobe_handler)
+
+static void *fei_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&fei_lock);
+ return seq_list_start(&fei_attr_list, *pos);
+}
+
+static void fei_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&fei_lock);
+}
+
+static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &fei_attr_list, pos);
+}
+
+static int fei_seq_show(struct seq_file *m, void *v)
+{
+ struct fei_attr *attr = list_entry(v, struct fei_attr, list);
+
+ seq_printf(m, "%pf\n", attr->kp.addr);
+ return 0;
+}
+
+static const struct seq_operations fei_seq_ops = {
+ .start = fei_seq_start,
+ .next = fei_seq_next,
+ .stop = fei_seq_stop,
+ .show = fei_seq_show,
+};
+
+static int fei_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &fei_seq_ops);
+}
+
+static void fei_attr_remove(struct fei_attr *attr)
+{
+ fei_debugfs_remove_attr(attr);
+ unregister_kprobe(&attr->kp);
+ list_del(&attr->list);
+ fei_attr_free(attr);
+}
+
+static void fei_attr_remove_all(void)
+{
+ struct fei_attr *attr, *n;
+
+ list_for_each_entry_safe(attr, n, &fei_attr_list, list) {
+ fei_attr_remove(attr);
+ }
+}
+
+static ssize_t fei_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct fei_attr *attr;
+ unsigned long addr;
+ char *buf, *sym;
+ int ret;
+
+ /* cut off if it is too long */
+ if (count > KSYM_NAME_LEN)
+ count = KSYM_NAME_LEN;
+ buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, buffer, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ buf[count] = '\0';
+ sym = strstrip(buf);
+
+ mutex_lock(&fei_lock);
+
+ /* Writing just spaces will remove all injection points */
+ if (sym[0] == '\0') {
+ fei_attr_remove_all();
+ ret = count;
+ goto out;
+ }
+ /* Writing !function will remove one injection point */
+ if (sym[0] == '!') {
+ attr = fei_attr_lookup(sym + 1);
+ if (!attr) {
+ ret = -ENOENT;
+ goto out;
+ }
+ fei_attr_remove(attr);
+ ret = count;
+ goto out;
+ }
+
+ addr = kallsyms_lookup_name(sym);
+ if (!addr) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!within_error_injection_list(addr)) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (fei_attr_lookup(sym)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ attr = fei_attr_new(sym, addr);
+ if (!attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = register_kprobe(&attr->kp);
+ if (!ret)
+ ret = fei_debugfs_add_attr(attr);
+ if (ret < 0)
+ fei_attr_remove(attr);
+ else {
+ list_add_tail(&attr->list, &fei_attr_list);
+ ret = count;
+ }
+out:
+ kfree(buf);
+ mutex_unlock(&fei_lock);
+ return ret;
+}
+
+static const struct file_operations fei_ops = {
+ .open = fei_open,
+ .read = seq_read,
+ .write = fei_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init fei_debugfs_init(void)
+{
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_function", NULL,
+ &fei_fault_attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ /* injectable attribute is just a symlink of error_inject/list */
+ if (!debugfs_create_symlink("injectable", dir,
+ "../error_injection/list"))
+ goto error;
+
+ if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops))
+ goto error;
+
+ fei_debugfs_dir = dir;
+
+ return 0;
+error:
+ debugfs_remove_recursive(dir);
+ return -ENOMEM;
+}
+
+late_initcall(fei_debugfs_init);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b4aab48ad258..da2ccf142358 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -83,16 +83,6 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
return &(kretprobe_table_locks[hash].lock);
}
-/* List of symbols that can be overriden for error injection. */
-static LIST_HEAD(kprobe_error_injection_list);
-static DEFINE_MUTEX(kprobe_ei_mutex);
-struct kprobe_ei_entry {
- struct list_head list;
- unsigned long start_addr;
- unsigned long end_addr;
- void *priv;
-};
-
/* Blacklist -- list of struct kprobe_blacklist_entry */
static LIST_HEAD(kprobe_blacklist);
@@ -1404,17 +1394,6 @@ bool within_kprobe_blacklist(unsigned long addr)
return false;
}
-bool within_kprobe_error_injection_list(unsigned long addr)
-{
- struct kprobe_ei_entry *ent;
-
- list_for_each_entry(ent, &kprobe_error_injection_list, list) {
- if (addr >= ent->start_addr && addr < ent->end_addr)
- return true;
- }
- return false;
-}
-
/*
* If we have a symbol_name argument, look it up and add the offset field
* to it. This way, we can specify a relative address to a symbol.
@@ -2189,86 +2168,6 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
return 0;
}
-#ifdef CONFIG_BPF_KPROBE_OVERRIDE
-/* Markers of the _kprobe_error_inject_list section */
-extern unsigned long __start_kprobe_error_inject_list[];
-extern unsigned long __stop_kprobe_error_inject_list[];
-
-/*
- * Lookup and populate the kprobe_error_injection_list.
- *
- * For safety reasons we only allow certain functions to be overriden with
- * bpf_error_injection, so we need to populate the list of the symbols that have
- * been marked as safe for overriding.
- */
-static void populate_kprobe_error_injection_list(unsigned long *start,
- unsigned long *end,
- void *priv)
-{
- unsigned long *iter;
- struct kprobe_ei_entry *ent;
- unsigned long entry, offset = 0, size = 0;
-
- mutex_lock(&kprobe_ei_mutex);
- for (iter = start; iter < end; iter++) {
- entry = arch_deref_entry_point((void *)*iter);
-
- if (!kernel_text_address(entry) ||
- !kallsyms_lookup_size_offset(entry, &size, &offset)) {
- pr_err("Failed to find error inject entry at %p\n",
- (void *)entry);
- continue;
- }
-
- ent = kmalloc(sizeof(*ent), GFP_KERNEL);
- if (!ent)
- break;
- ent->start_addr = entry;
- ent->end_addr = entry + size;
- ent->priv = priv;
- INIT_LIST_HEAD(&ent->list);
- list_add_tail(&ent->list, &kprobe_error_injection_list);
- }
- mutex_unlock(&kprobe_ei_mutex);
-}
-
-static void __init populate_kernel_kprobe_ei_list(void)
-{
- populate_kprobe_error_injection_list(__start_kprobe_error_inject_list,
- __stop_kprobe_error_inject_list,
- NULL);
-}
-
-static void module_load_kprobe_ei_list(struct module *mod)
-{
- if (!mod->num_kprobe_ei_funcs)
- return;
- populate_kprobe_error_injection_list(mod->kprobe_ei_funcs,
- mod->kprobe_ei_funcs +
- mod->num_kprobe_ei_funcs, mod);
-}
-
-static void module_unload_kprobe_ei_list(struct module *mod)
-{
- struct kprobe_ei_entry *ent, *n;
- if (!mod->num_kprobe_ei_funcs)
- return;
-
- mutex_lock(&kprobe_ei_mutex);
- list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) {
- if (ent->priv == mod) {
- list_del_init(&ent->list);
- kfree(ent);
- }
- }
- mutex_unlock(&kprobe_ei_mutex);
-}
-#else
-static inline void __init populate_kernel_kprobe_ei_list(void) {}
-static inline void module_load_kprobe_ei_list(struct module *m) {}
-static inline void module_unload_kprobe_ei_list(struct module *m) {}
-#endif
-
/* Module notifier call back, checking kprobes on the module */
static int kprobes_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
@@ -2279,11 +2178,6 @@ static int kprobes_module_callback(struct notifier_block *nb,
unsigned int i;
int checkcore = (val == MODULE_STATE_GOING);
- if (val == MODULE_STATE_COMING)
- module_load_kprobe_ei_list(mod);
- else if (val == MODULE_STATE_GOING)
- module_unload_kprobe_ei_list(mod);
-
if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
return NOTIFY_DONE;
@@ -2346,8 +2240,6 @@ static int __init init_kprobes(void)
pr_err("Please take care of using kprobes.\n");
}
- populate_kernel_kprobe_ei_list();
-
if (kretprobe_blacklist_size) {
/* lookup the function address from its name */
for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
@@ -2515,56 +2407,6 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = {
.release = seq_release,
};
-/*
- * kprobes/error_injection_list -- shows which functions can be overriden for
- * error injection.
- * */
-static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos)
-{
- mutex_lock(&kprobe_ei_mutex);
- return seq_list_start(&kprobe_error_injection_list, *pos);
-}
-
-static void kprobe_ei_seq_stop(struct seq_file *m, void *v)
-{
- mutex_unlock(&kprobe_ei_mutex);
-}
-
-static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos)
-{
- return seq_list_next(v, &kprobe_error_injection_list, pos);
-}
-
-static int kprobe_ei_seq_show(struct seq_file *m, void *v)
-{
- char buffer[KSYM_SYMBOL_LEN];
- struct kprobe_ei_entry *ent =
- list_entry(v, struct kprobe_ei_entry, list);
-
- sprint_symbol(buffer, ent->start_addr);
- seq_printf(m, "%s\n", buffer);
- return 0;
-}
-
-static const struct seq_operations kprobe_ei_seq_ops = {
- .start = kprobe_ei_seq_start,
- .next = kprobe_ei_seq_next,
- .stop = kprobe_ei_seq_stop,
- .show = kprobe_ei_seq_show,
-};
-
-static int kprobe_ei_open(struct inode *inode, struct file *filp)
-{
- return seq_open(filp, &kprobe_ei_seq_ops);
-}
-
-static const struct file_operations debugfs_kprobe_ei_ops = {
- .open = kprobe_ei_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static void arm_all_kprobes(void)
{
struct hlist_head *head;
@@ -2706,11 +2548,6 @@ static int __init debugfs_kprobe_init(void)
if (!file)
goto error;
- file = debugfs_create_file("error_injection_list", 0444, dir, NULL,
- &debugfs_kprobe_ei_ops);
- if (!file)
- goto error;
-
return 0;
error:
diff --git a/kernel/module.c b/kernel/module.c
index bd695bfdc5c4..601494d4b7ea 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3118,10 +3118,10 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->ftrace_callsites),
&mod->num_ftrace_callsites);
#endif
-#ifdef CONFIG_BPF_KPROBE_OVERRIDE
- mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list",
- sizeof(*mod->kprobe_ei_funcs),
- &mod->num_kprobe_ei_funcs);
+#ifdef CONFIG_FUNCTION_ERROR_INJECTION
+ mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
+ sizeof(*mod->ei_funcs),
+ &mod->num_ei_funcs);
#endif
mod->extable = section_objs(info, "__ex_table",
sizeof(*mod->extable), &mod->num_exentries);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 2ddaec40956f..0926aef10dad 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -34,11 +34,6 @@ void complete(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
- /*
- * Perform commit of crossrelease here.
- */
- complete_release_commit(x);
-
if (x->done != UINT_MAX)
x->done++;
__wake_up_locked(&x->wait, TASK_NORMAL, 1);
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index dd7908743dab..9bcbacba82a8 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -89,7 +89,9 @@ static int membarrier_private_expedited(void)
rcu_read_unlock();
}
if (!fallback) {
+ preempt_disable();
smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
free_cpumask_var(tmpmask);
}
cpus_read_unlock();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ae3a2d519e50..0b249e2f0c3c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -355,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES
on if you need to profile the system's use of these macros.
config PROFILE_ALL_BRANCHES
- bool "Profile all if conditionals"
+ bool "Profile all if conditionals" if !FORTIFY_SOURCE
select TRACE_BRANCH_PROFILING
help
This tracer profiles all branch conditions. Every if ()
@@ -533,9 +533,7 @@ config FUNCTION_PROFILER
config BPF_KPROBE_OVERRIDE
bool "Enable BPF programs to override a kprobed function"
depends on BPF_EVENTS
- depends on KPROBES_ON_FTRACE
- depends on HAVE_KPROBE_OVERRIDE
- depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on FUNCTION_ERROR_INJECTION
default n
help
Allows BPF to override the execution of a probed function and
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f6d2327ecb59..f274468cbc45 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,7 +14,7 @@
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
-#include <asm/kprobes.h>
+#include <linux/error-injection.h>
#include "trace_probe.h"
#include "trace.h"
@@ -83,9 +83,8 @@ EXPORT_SYMBOL_GPL(trace_call_bpf);
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
- __this_cpu_write(bpf_kprobe_override, 1);
regs_set_return_value(regs, rc);
- arch_ftrace_kprobe_override_function(regs);
+ override_function_with_return(regs);
return 0;
}
@@ -800,11 +799,11 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
int ret = -EEXIST;
/*
- * Kprobe override only works for ftrace based kprobes, and only if they
- * are on the opt-in list.
+ * Kprobe override only works if they are on the function entry,
+ * and only if they are on the opt-in list.
*/
if (prog->kprobe_override &&
- (!trace_kprobe_ftrace(event->tp_event) ||
+ (!trace_kprobe_on_func_entry(event->tp_event) ||
!trace_kprobe_error_injectable(event->tp_event)))
return -EINVAL;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9ab18995ff1e..0cddf60186da 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2534,29 +2534,59 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
* by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. There are four
- * different contexts that we need to consider.
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing,
+ * we use a bitmask trick.
*
- * Normal context.
- * SoftIRQ context
- * IRQ context
- * NMI context
+ * bit 0 = NMI context
+ * bit 1 = IRQ context
+ * bit 2 = SoftIRQ context
+ * bit 3 = normal context.
*
- * If for some reason the ring buffer starts to recurse, we
- * only allow that to happen at most 4 times (one for each
- * context). If it happens 5 times, then we consider this a
- * recusive loop and do not let it go further.
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
+ *
+ * (binary)
+ * 101 - 1 = 100
+ * 101 & 100 = 100 (clearing bit zero)
+ *
+ * 1010 - 1 = 1001
+ * 1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
*/
static __always_inline int
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
- if (cpu_buffer->current_context >= 4)
+ unsigned int val = cpu_buffer->current_context;
+ unsigned long pc = preempt_count();
+ int bit;
+
+ if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+ bit = RB_CTX_NORMAL;
+ else
+ bit = pc & NMI_MASK ? RB_CTX_NMI :
+ pc & HARDIRQ_MASK ? RB_CTX_IRQ :
+ pc & SOFTIRQ_OFFSET ? 2 : RB_CTX_SOFTIRQ;
+
+ if (unlikely(val & (1 << bit)))
return 1;
- cpu_buffer->current_context++;
- /* Interrupts must see this update */
- barrier();
+ val |= (1 << bit);
+ cpu_buffer->current_context = val;
return 0;
}
@@ -2564,9 +2594,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
static __always_inline void
trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
- /* Don't let the dec leak out */
- barrier();
- cpu_buffer->current_context--;
+ cpu_buffer->current_context &= cpu_buffer->current_context - 1;
}
/**
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 91f4b57dab82..1fad24acd444 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/rculist.h>
+#include <linux/error-injection.h>
#include "trace_probe.h"
@@ -42,8 +43,6 @@ struct trace_kprobe {
(offsetof(struct trace_kprobe, tp.args) + \
(sizeof(struct probe_arg) * (n)))
-DEFINE_PER_CPU(int, bpf_kprobe_override);
-
static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
{
return tk->rp.handler != NULL;
@@ -88,13 +87,16 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
return nhit;
}
-int trace_kprobe_ftrace(struct trace_event_call *call)
+bool trace_kprobe_on_func_entry(struct trace_event_call *call)
{
struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
- return kprobe_ftrace(&tk->rp.kp);
+
+ return kprobe_on_func_entry(tk->rp.kp.addr,
+ tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
+ tk->rp.kp.addr ? 0 : tk->rp.kp.offset);
}
-int trace_kprobe_error_injectable(struct trace_event_call *call)
+bool trace_kprobe_error_injectable(struct trace_event_call *call)
{
struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
unsigned long addr;
@@ -106,7 +108,7 @@ int trace_kprobe_error_injectable(struct trace_event_call *call)
} else {
addr = (unsigned long)tk->rp.kp.addr;
}
- return within_kprobe_error_injection_list(addr);
+ return within_error_injection_list(addr);
}
static int register_kprobe_event(struct trace_kprobe *tk);
@@ -1202,6 +1204,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
int rctx;
if (bpf_prog_array_valid(call)) {
+ unsigned long orig_ip = instruction_pointer(regs);
int ret;
ret = trace_call_bpf(call, regs);
@@ -1209,12 +1212,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
/*
* We need to check and see if we modified the pc of the
* pt_regs, and if so clear the kprobe and return 1 so that we
- * don't do the instruction skipping. Also reset our state so
- * we are clean the next pass through.
+ * don't do the single stepping.
+ * The ftrace kprobe handler leaves it up to us to re-enable
+ * preemption here before returning if we've modified the ip.
*/
- if (__this_cpu_read(bpf_kprobe_override)) {
- __this_cpu_write(bpf_kprobe_override, 0);
+ if (orig_ip != instruction_pointer(regs)) {
reset_current_kprobe();
+ preempt_enable_no_resched();
return 1;
}
if (!ret)
@@ -1322,15 +1326,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
if (tk->tp.flags & TP_FLAG_TRACE)
kprobe_trace_func(tk, regs);
#ifdef CONFIG_PERF_EVENTS
- if (tk->tp.flags & TP_FLAG_PROFILE) {
+ if (tk->tp.flags & TP_FLAG_PROFILE)
ret = kprobe_perf_func(tk, regs);
- /*
- * The ftrace kprobe handler leaves it up to us to re-enable
- * preemption here before returning if we've modified the ip.
- */
- if (ret)
- preempt_enable_no_resched();
- }
#endif
return ret;
}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 5e54d748c84c..e101c5bb9eda 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -252,8 +252,8 @@ struct symbol_cache;
unsigned long update_symbol_cache(struct symbol_cache *sc);
void free_symbol_cache(struct symbol_cache *sc);
struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
-int trace_kprobe_ftrace(struct trace_event_call *call);
-int trace_kprobe_error_injectable(struct trace_event_call *call);
+bool trace_kprobe_on_func_entry(struct trace_event_call *call);
+bool trace_kprobe_error_injectable(struct trace_event_call *call);
#else
/* uprobes do not support symbol fetch methods */
#define fetch_symbol_u8 NULL
@@ -280,14 +280,14 @@ alloc_symbol_cache(const char *sym, long offset)
return NULL;
}
-static inline int trace_kprobe_ftrace(struct trace_event_call *call)
+static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call)
{
- return 0;
+ return false;
}
-static inline int trace_kprobe_error_injectable(struct trace_event_call *call)
+static inline bool trace_kprobe_error_injectable(struct trace_event_call *call)
{
- return 0;
+ return false;
}
#endif /* CONFIG_KPROBE_EVENTS */