aboutsummaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arraymap.c137
-rw-r--r--kernel/bpf/bpf_lru_list.c2
-rw-r--r--kernel/bpf/cgroup.c5
-rw-r--r--kernel/bpf/core.c24
-rw-r--r--kernel/bpf/hashtab.c325
-rw-r--r--kernel/bpf/lpm_trie.c14
-rw-r--r--kernel/bpf/map_in_map.c97
-rw-r--r--kernel/bpf/map_in_map.h23
-rw-r--r--kernel/bpf/stackmap.c14
-rw-r--r--kernel/bpf/syscall.c178
-rw-r--r--kernel/bpf/verifier.c262
12 files changed, 738 insertions, 345 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e1ce4f4fd7fd..e1e5e658f2db 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 6b6f41f0b211..5e00b2333c26 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1,4 +1,5 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016,2017 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -16,6 +17,8 @@
#include <linux/filter.h>
#include <linux/perf_event.h>
+#include "map_in_map.h"
+
static void bpf_array_free_percpu(struct bpf_array *array)
{
int i;
@@ -113,6 +116,30 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
return array->value + array->elem_size * index;
}
+/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
+static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ u32 elem_size = round_up(map->value_size, 8);
+ const int ret = BPF_REG_0;
+ const int map_ptr = BPF_REG_1;
+ const int index = BPF_REG_2;
+
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+
+ if (is_power_of_2(elem_size)) {
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
+ } else {
+ *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
+ }
+ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(ret, 0);
+ return insn - insn_buf;
+}
+
/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -155,7 +182,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- u32 index = *(u32 *)key;
+ u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = (u32 *)next_key;
if (index >= array->map.max_entries) {
@@ -260,21 +287,17 @@ static void array_map_free(struct bpf_map *map)
bpf_map_area_free(array);
}
-static const struct bpf_map_ops array_ops = {
+const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
+ .map_gen_lookup = array_map_gen_lookup,
};
-static struct bpf_map_type_list array_type __ro_after_init = {
- .ops = &array_ops,
- .type = BPF_MAP_TYPE_ARRAY,
-};
-
-static const struct bpf_map_ops percpu_array_ops = {
+const struct bpf_map_ops percpu_array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
@@ -283,19 +306,6 @@ static const struct bpf_map_ops percpu_array_ops = {
.map_delete_elem = array_map_delete_elem,
};
-static struct bpf_map_type_list percpu_array_type __ro_after_init = {
- .ops = &percpu_array_ops,
- .type = BPF_MAP_TYPE_PERCPU_ARRAY,
-};
-
-static int __init register_array_map(void)
-{
- bpf_register_map_type(&array_type);
- bpf_register_map_type(&percpu_array_type);
- return 0;
-}
-late_initcall(register_array_map);
-
static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
/* only file descriptors can be stored in this type of map */
@@ -399,7 +409,7 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
fd_array_map_delete_elem(map, &i);
}
-static const struct bpf_map_ops prog_array_ops = {
+const struct bpf_map_ops prog_array_map_ops = {
.map_alloc = fd_array_map_alloc,
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
@@ -409,18 +419,6 @@ static const struct bpf_map_ops prog_array_ops = {
.map_fd_put_ptr = prog_fd_array_put_ptr,
};
-static struct bpf_map_type_list prog_array_type __ro_after_init = {
- .ops = &prog_array_ops,
- .type = BPF_MAP_TYPE_PROG_ARRAY,
-};
-
-static int __init register_prog_array_map(void)
-{
- bpf_register_map_type(&prog_array_type);
- return 0;
-}
-late_initcall(register_prog_array_map);
-
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
struct file *map_file)
{
@@ -511,7 +509,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
rcu_read_unlock();
}
-static const struct bpf_map_ops perf_event_array_ops = {
+const struct bpf_map_ops perf_event_array_map_ops = {
.map_alloc = fd_array_map_alloc,
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
@@ -522,18 +520,6 @@ static const struct bpf_map_ops perf_event_array_ops = {
.map_release = perf_event_fd_array_release,
};
-static struct bpf_map_type_list perf_event_array_type __ro_after_init = {
- .ops = &perf_event_array_ops,
- .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
-};
-
-static int __init register_perf_event_array_map(void)
-{
- bpf_register_map_type(&perf_event_array_type);
- return 0;
-}
-late_initcall(register_perf_event_array_map);
-
#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file /* not used */,
@@ -554,7 +540,7 @@ static void cgroup_fd_array_free(struct bpf_map *map)
fd_array_map_free(map);
}
-static const struct bpf_map_ops cgroup_array_ops = {
+const struct bpf_map_ops cgroup_array_map_ops = {
.map_alloc = fd_array_map_alloc,
.map_free = cgroup_fd_array_free,
.map_get_next_key = array_map_get_next_key,
@@ -563,16 +549,53 @@ static const struct bpf_map_ops cgroup_array_ops = {
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
};
+#endif
-static struct bpf_map_type_list cgroup_array_type __ro_after_init = {
- .ops = &cgroup_array_ops,
- .type = BPF_MAP_TYPE_CGROUP_ARRAY,
-};
+static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_map *map, *inner_map_meta;
+
+ inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
+ if (IS_ERR(inner_map_meta))
+ return inner_map_meta;
-static int __init register_cgroup_array_map(void)
+ map = fd_array_map_alloc(attr);
+ if (IS_ERR(map)) {
+ bpf_map_meta_free(inner_map_meta);
+ return map;
+ }
+
+ map->inner_map_meta = inner_map_meta;
+
+ return map;
+}
+
+static void array_of_map_free(struct bpf_map *map)
{
- bpf_register_map_type(&cgroup_array_type);
- return 0;
+ /* map->inner_map_meta is only accessed by syscall which
+ * is protected by fdget/fdput.
+ */
+ bpf_map_meta_free(map->inner_map_meta);
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
}
-late_initcall(register_cgroup_array_map);
-#endif
+
+static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_map **inner_map = array_map_lookup_elem(map, key);
+
+ if (!inner_map)
+ return NULL;
+
+ return READ_ONCE(*inner_map);
+}
+
+const struct bpf_map_ops array_of_maps_map_ops = {
+ .map_alloc = array_of_map_alloc,
+ .map_free = array_of_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = array_of_map_lookup_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = bpf_map_fd_get_ptr,
+ .map_fd_put_ptr = bpf_map_fd_put_ptr,
+};
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index f62d1d56f41d..e6ef4401a138 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -13,7 +13,7 @@
#define LOCAL_FREE_TARGET (128)
#define LOCAL_NR_SCANS LOCAL_FREE_TARGET
-#define PERCPU_FREE_TARGET (16)
+#define PERCPU_FREE_TARGET (4)
#define PERCPU_NR_SCANS PERCPU_FREE_TARGET
/* Helpers to get the local list index */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index da0f53690295..ea6033cba947 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -154,7 +154,7 @@ int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
/**
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
- * @sk: The socken sending or receiving traffic
+ * @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
* @type: The type of program to be exectuted
*
@@ -189,10 +189,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
prog = rcu_dereference(cgrp->bpf.effective[type]);
if (prog) {
unsigned int offset = skb->data - skb_network_header(skb);
+ struct sock *save_sk = skb->sk;
+ skb->sk = sk;
__skb_push(skb, offset);
ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
__skb_pull(skb, offset);
+ skb->sk = save_sk;
}
rcu_read_unlock();
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f45827e205d3..6f81e0f5a0fa 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -394,27 +394,23 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
- unsigned long flags;
-
if (!bpf_prog_kallsyms_candidate(fp) ||
!capable(CAP_SYS_ADMIN))
return;
- spin_lock_irqsave(&bpf_lock, flags);
+ spin_lock_bh(&bpf_lock);
bpf_prog_ksym_node_add(fp->aux);
- spin_unlock_irqrestore(&bpf_lock, flags);
+ spin_unlock_bh(&bpf_lock);
}
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
- unsigned long flags;
-
if (!bpf_prog_kallsyms_candidate(fp))
return;
- spin_lock_irqsave(&bpf_lock, flags);
+ spin_lock_bh(&bpf_lock);
bpf_prog_ksym_node_del(fp->aux);
- spin_unlock_irqrestore(&bpf_lock, flags);
+ spin_unlock_bh(&bpf_lock);
}
static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
@@ -1162,12 +1158,12 @@ out:
LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
off = IMM;
load_word:
- /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
- * only appearing in the programs where ctx ==
- * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
- * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
- * internal BPF verifier will check that BPF_R6 ==
- * ctx.
+ /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only
+ * appearing in the programs where ctx == skb
+ * (see may_access_skb() in the verifier). All programs
+ * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6,
+ * bpf_convert_filter() saves it in BPF_R6, internal BPF
+ * verifier will check that BPF_R6 == ctx.
*
* BPF_ABS and BPF_IND are wrappers of function calls,
* so they scratch BPF_R1-BPF_R5 registers, preserve
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index afe5bab376c9..004334ea13ba 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -16,6 +16,7 @@
#include <linux/rculist_nulls.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
+#include "map_in_map.h"
struct bucket {
struct hlist_nulls_head head;
@@ -30,18 +31,12 @@ struct bpf_htab {
struct pcpu_freelist freelist;
struct bpf_lru lru;
};
- void __percpu *extra_elems;
+ struct htab_elem *__percpu *extra_elems;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
-enum extra_elem_state {
- HTAB_NOT_AN_EXTRA_ELEM = 0,
- HTAB_EXTRA_ELEM_FREE,
- HTAB_EXTRA_ELEM_USED
-};
-
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
union {
@@ -56,7 +51,6 @@ struct htab_elem {
};
union {
struct rcu_head rcu;
- enum extra_elem_state state;
struct bpf_lru_node lru_node;
};
u32 hash;
@@ -77,6 +71,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
+static bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+ return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
@@ -88,6 +87,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size
return *(void __percpu **)(l->key + key_size);
}
+static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
+{
+ return *(void **)(l->key + roundup(map->key_size, 8));
+}
+
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
return (struct htab_elem *) (htab->elems + i * htab->elem_size);
@@ -128,17 +132,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
static int prealloc_init(struct bpf_htab *htab)
{
+ u32 num_entries = htab->map.max_entries;
int err = -ENOMEM, i;
- htab->elems = bpf_map_area_alloc(htab->elem_size *
- htab->map.max_entries);
+ if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+ num_entries += num_possible_cpus();
+
+ htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
if (!htab->elems)
return -ENOMEM;
if (!htab_is_percpu(htab))
goto skip_percpu_elems;
- for (i = 0; i < htab->map.max_entries; i++) {
+ for (i = 0; i < num_entries; i++) {
u32 size = round_up(htab->map.value_size, 8);
void __percpu *pptr;
@@ -166,11 +173,11 @@ skip_percpu_elems:
if (htab_is_lru(htab))
bpf_lru_populate(&htab->lru, htab->elems,
offsetof(struct htab_elem, lru_node),
- htab->elem_size, htab->map.max_entries);
+ htab->elem_size, num_entries);
else
pcpu_freelist_populate(&htab->freelist,
htab->elems + offsetof(struct htab_elem, fnode),
- htab->elem_size, htab->map.max_entries);
+ htab->elem_size, num_entries);
return 0;
@@ -191,16 +198,22 @@ static void prealloc_destroy(struct bpf_htab *htab)
static int alloc_extra_elems(struct bpf_htab *htab)
{
- void __percpu *pptr;
+ struct htab_elem *__percpu *pptr, *l_new;
+ struct pcpu_freelist_node *l;
int cpu;
- pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+ pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
+ GFP_USER | __GFP_NOWARN);
if (!pptr)
return -ENOMEM;
for_each_possible_cpu(cpu) {
- ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
- HTAB_EXTRA_ELEM_FREE;
+ l = pcpu_freelist_pop(&htab->freelist);
+ /* pop will succeed, since prealloc_init()
+ * preallocated extra num_possible_cpus elements
+ */
+ l_new = container_of(l, struct htab_elem, fnode);
+ *per_cpu_ptr(pptr, cpu) = l_new;
}
htab->extra_elems = pptr;
return 0;
@@ -342,25 +355,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&htab->buckets[i].lock);
}
- if (!percpu && !lru) {
- /* lru itself can remove the least used element, so
- * there is no need for an extra elem during map_update.
- */
- err = alloc_extra_elems(htab);
- if (err)
- goto free_buckets;
- }
-
if (prealloc) {
err = prealloc_init(htab);
if (err)
- goto free_extra_elems;
+ goto free_buckets;
+
+ if (!percpu && !lru) {
+ /* lru itself can remove the least used element, so
+ * there is no need for an extra elem during map_update.
+ */
+ err = alloc_extra_elems(htab);
+ if (err)
+ goto free_prealloc;
+ }
}
return &htab->map;
-free_extra_elems:
- free_percpu(htab->extra_elems);
+free_prealloc:
+ prealloc_destroy(htab);
free_buckets:
bpf_map_area_free(htab->buckets);
free_htab:
@@ -419,7 +432,11 @@ again:
return NULL;
}
-/* Called from syscall or from eBPF program */
+/* Called from syscall or from eBPF program directly, so
+ * arguments have to match bpf_map_lookup_elem() exactly.
+ * The return value is adjusted by BPF instructions
+ * in htab_map_gen_lookup().
+ */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
@@ -451,6 +468,30 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
}
+/* inline bpf_map_lookup_elem() call.
+ * Instead of:
+ * bpf_prog
+ * bpf_map_lookup_elem
+ * map->ops->map_lookup_elem
+ * htab_map_lookup_elem
+ * __htab_map_lookup_elem
+ * do:
+ * bpf_prog
+ * __htab_map_lookup_elem
+ */
+static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+
+ *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+ offsetof(struct htab_elem, key) +
+ round_up(map->key_size, 8));
+ return insn - insn_buf;
+}
+
static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
struct htab_elem *l = __htab_map_lookup_elem(map, key);
@@ -499,12 +540,15 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
struct hlist_nulls_head *head;
struct htab_elem *l, *next_l;
u32 hash, key_size;
- int i;
+ int i = 0;
WARN_ON_ONCE(!rcu_read_lock_held());
key_size = map->key_size;
+ if (!key)
+ goto find_first_elem;
+
hash = htab_map_hash(key, key_size);
head = select_bucket(htab, hash);
@@ -512,10 +556,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
/* lookup the key */
l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
- if (!l) {
- i = 0;
+ if (!l)
goto find_first_elem;
- }
/* key was found, get next key in the same bucket */
next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
@@ -575,12 +617,15 @@ static void htab_elem_free_rcu(struct rcu_head *head)
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
- if (l->state == HTAB_EXTRA_ELEM_USED) {
- l->state = HTAB_EXTRA_ELEM_FREE;
- return;
+ struct bpf_map *map = &htab->map;
+
+ if (map->ops->map_fd_put_ptr) {
+ void *ptr = fd_htab_map_get_ptr(map, l);
+
+ map->ops->map_fd_put_ptr(ptr);
}
- if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+ if (htab_is_prealloc(htab)) {
pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
@@ -610,47 +655,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
- bool old_elem_exists)
+ struct htab_elem *old_elem)
{
u32 size = htab->map.value_size;
- bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
- struct htab_elem *l_new;
+ bool prealloc = htab_is_prealloc(htab);
+ struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
- int err = 0;
if (prealloc) {
- struct pcpu_freelist_node *l;
+ if (old_elem) {
+ /* if we're updating the existing element,
+ * use per-cpu extra elems to avoid freelist_pop/push
+ */
+ pl_new = this_cpu_ptr(htab->extra_elems);
+ l_new = *pl_new;
+ *pl_new = old_elem;
+ } else {
+ struct pcpu_freelist_node *l;
- l = pcpu_freelist_pop(&htab->freelist);
- if (!l)
- err = -E2BIG;
- else
+ l = pcpu_freelist_pop(&htab->freelist);
+ if (!l)
+ return ERR_PTR(-E2BIG);
l_new = container_of(l, struct htab_elem, fnode);
- } else {
- if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
- atomic_dec(&htab->count);
- err = -E2BIG;
- } else {
- l_new = kmalloc(htab->elem_size,
- GFP_ATOMIC | __GFP_NOWARN);
- if (!l_new)
- return ERR_PTR(-ENOMEM);
}
- }
-
- if (err) {
- if (!old_elem_exists)
- return ERR_PTR(err);
-
- /* if we're updating the existing element and the hash table
- * is full, use per-cpu extra elems
- */
- l_new = this_cpu_ptr(htab->extra_elems);
- if (l_new->state != HTAB_EXTRA_ELEM_FREE)
- return ERR_PTR(-E2BIG);
- l_new->state = HTAB_EXTRA_ELEM_USED;
} else {
- l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
+ if (atomic_inc_return(&htab->count) > htab->map.max_entries)
+ if (!old_elem) {
+ /* when map is full and update() is replacing
+ * old element, it's ok to allocate, since
+ * old element will be freed immediately.
+ * Otherwise return an error
+ */
+ atomic_dec(&htab->count);
+ return ERR_PTR(-E2BIG);
+ }
+ l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+ if (!l_new)
+ return ERR_PTR(-ENOMEM);
}
memcpy(l_new->key, key, key_size);
@@ -731,7 +772,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
goto err;
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
- !!l_old);
+ l_old);
if (IS_ERR(l_new)) {
/* all pre-allocated elements are in use or memory exhausted */
ret = PTR_ERR(l_new);
@@ -744,7 +785,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
hlist_nulls_del_rcu(&l_old->hash_node);
- free_htab_elem(htab, l_old);
+ if (!htab_is_prealloc(htab))
+ free_htab_elem(htab, l_old);
}
ret = 0;
err:
@@ -856,7 +898,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
value, onallcpus);
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
- hash, true, onallcpus, false);
+ hash, true, onallcpus, NULL);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
@@ -1024,11 +1066,11 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
hlist_nulls_del_rcu(&l->hash_node);
- if (l->state != HTAB_EXTRA_ELEM_USED)
- htab_elem_free(htab, l);
+ htab_elem_free(htab, l);
}
}
}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
@@ -1045,7 +1087,7 @@ static void htab_map_free(struct bpf_map *map)
* not have executed. Wait for them.
*/
rcu_barrier();
- if (htab->map.map_flags & BPF_F_NO_PREALLOC)
+ if (!htab_is_prealloc(htab))
delete_all_elements(htab);
else
prealloc_destroy(htab);
@@ -1055,21 +1097,17 @@ static void htab_map_free(struct bpf_map *map)
kfree(htab);
}
-static const struct bpf_map_ops htab_ops = {
+const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_map_lookup_elem,
.map_update_elem = htab_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
+ .map_gen_lookup = htab_map_gen_lookup,
};
-static struct bpf_map_type_list htab_type __ro_after_init = {
- .ops = &htab_ops,
- .type = BPF_MAP_TYPE_HASH,
-};
-
-static const struct bpf_map_ops htab_lru_ops = {
+const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1078,11 +1116,6 @@ static const struct bpf_map_ops htab_lru_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
};
-static struct bpf_map_type_list htab_lru_type __ro_after_init = {
- .ops = &htab_lru_ops,
- .type = BPF_MAP_TYPE_LRU_HASH,
-};
-
/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -1156,7 +1189,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
return ret;
}
-static const struct bpf_map_ops htab_percpu_ops = {
+const struct bpf_map_ops htab_percpu_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1165,12 +1198,7 @@ static const struct bpf_map_ops htab_percpu_ops = {
.map_delete_elem = htab_map_delete_elem,
};
-static struct bpf_map_type_list htab_percpu_type __ro_after_init = {
- .ops = &htab_percpu_ops,
- .type = BPF_MAP_TYPE_PERCPU_HASH,
-};
-
-static const struct bpf_map_ops htab_lru_percpu_ops = {
+const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1179,17 +1207,102 @@ static const struct bpf_map_ops htab_lru_percpu_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
};
-static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = {
- .ops = &htab_lru_percpu_ops,
- .type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
-};
+static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+
+ if (attr->value_size != sizeof(u32))
+ return ERR_PTR(-EINVAL);
+
+ /* pointer is stored internally */
+ attr->value_size = sizeof(void *);
+ map = htab_map_alloc(attr);
+ attr->value_size = sizeof(u32);
+
+ return map;
+}
-static int __init register_htab_map(void)
+static void fd_htab_map_free(struct bpf_map *map)
{
- bpf_register_map_type(&htab_type);
- bpf_register_map_type(&htab_percpu_type);
- bpf_register_map_type(&htab_lru_type);
- bpf_register_map_type(&htab_lru_percpu_type);
- return 0;
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_node *n;
+ struct hlist_nulls_head *head;
+ struct htab_elem *l;
+ int i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ head = select_bucket(htab, i);
+
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ void *ptr = fd_htab_map_get_ptr(map, l);
+
+ map->ops->map_fd_put_ptr(ptr);
+ }
+ }
+
+ htab_map_free(map);
+}
+
+/* only called from syscall */
+int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, u64 map_flags)
+{
+ void *ptr;
+ int ret;
+ u32 ufd = *(u32 *)value;
+
+ ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ ret = htab_map_update_elem(map, key, &ptr, map_flags);
+ if (ret)
+ map->ops->map_fd_put_ptr(ptr);
+
+ return ret;
+}
+
+static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_map *map, *inner_map_meta;
+
+ inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
+ if (IS_ERR(inner_map_meta))
+ return inner_map_meta;
+
+ map = fd_htab_map_alloc(attr);
+ if (IS_ERR(map)) {
+ bpf_map_meta_free(inner_map_meta);
+ return map;
+ }
+
+ map->inner_map_meta = inner_map_meta;
+
+ return map;
}
-late_initcall(register_htab_map);
+
+static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_map **inner_map = htab_map_lookup_elem(map, key);
+
+ if (!inner_map)
+ return NULL;
+
+ return READ_ONCE(*inner_map);
+}
+
+static void htab_of_map_free(struct bpf_map *map)
+{
+ bpf_map_meta_free(map->inner_map_meta);
+ fd_htab_map_free(map);
+}
+
+const struct bpf_map_ops htab_of_maps_map_ops = {
+ .map_alloc = htab_of_map_alloc,
+ .map_free = htab_of_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_of_map_lookup_elem,
+ .map_delete_elem = htab_map_delete_elem,
+ .map_fd_get_ptr = bpf_map_fd_get_ptr,
+ .map_fd_put_ptr = bpf_map_fd_put_ptr,
+};
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b37bd9ab7f57..39cfafd895b8 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -505,7 +505,7 @@ static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
return -ENOTSUPP;
}
-static const struct bpf_map_ops trie_ops = {
+const struct bpf_map_ops trie_map_ops = {
.map_alloc = trie_alloc,
.map_free = trie_free,
.map_get_next_key = trie_get_next_key,
@@ -513,15 +513,3 @@ static const struct bpf_map_ops trie_ops = {
.map_update_elem = trie_update_elem,
.map_delete_elem = trie_delete_elem,
};
-
-static struct bpf_map_type_list trie_type __ro_after_init = {
- .ops = &trie_ops,
- .type = BPF_MAP_TYPE_LPM_TRIE,
-};
-
-static int __init register_trie_map(void)
-{
- bpf_register_map_type(&trie_type);
- return 0;
-}
-late_initcall(register_trie_map);
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
new file mode 100644
index 000000000000..59bcdf821ae4
--- /dev/null
+++ b/kernel/bpf/map_in_map.c
@@ -0,0 +1,97 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/slab.h>
+#include <linux/bpf.h>
+
+#include "map_in_map.h"
+
+struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
+{
+ struct bpf_map *inner_map, *inner_map_meta;
+ struct fd f;
+
+ f = fdget(inner_map_ufd);
+ inner_map = __bpf_map_get(f);
+ if (IS_ERR(inner_map))
+ return inner_map;
+
+ /* prog_array->owner_prog_type and owner_jited
+ * is a runtime binding. Doing static check alone
+ * in the verifier is not enough.
+ */
+ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
+ fdput(f);
+ return ERR_PTR(-ENOTSUPP);
+ }
+
+ /* Does not support >1 level map-in-map */
+ if (inner_map->inner_map_meta) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER);
+ if (!inner_map_meta) {
+ fdput(f);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ inner_map_meta->map_type = inner_map->map_type;
+ inner_map_meta->key_size = inner_map->key_size;
+ inner_map_meta->value_size = inner_map->value_size;
+ inner_map_meta->map_flags = inner_map->map_flags;
+ inner_map_meta->ops = inner_map->ops;
+ inner_map_meta->max_entries = inner_map->max_entries;
+
+ fdput(f);
+ return inner_map_meta;
+}
+
+void bpf_map_meta_free(struct bpf_map *map_meta)
+{
+ kfree(map_meta);
+}
+
+bool bpf_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1)
+{
+ /* No need to compare ops because it is covered by map_type */
+ return meta0->map_type == meta1->map_type &&
+ meta0->key_size == meta1->key_size &&
+ meta0->value_size == meta1->value_size &&
+ meta0->map_flags == meta1->map_flags &&
+ meta0->max_entries == meta1->max_entries;
+}
+
+void *bpf_map_fd_get_ptr(struct bpf_map *map,
+ struct file *map_file /* not used */,
+ int ufd)
+{
+ struct bpf_map *inner_map;
+ struct fd f;
+
+ f = fdget(ufd);
+ inner_map = __bpf_map_get(f);
+ if (IS_ERR(inner_map))
+ return inner_map;
+
+ if (bpf_map_meta_equal(map->inner_map_meta, inner_map))
+ inner_map = bpf_map_inc(inner_map, false);
+ else
+ inner_map = ERR_PTR(-EINVAL);
+
+ fdput(f);
+ return inner_map;
+}
+
+void bpf_map_fd_put_ptr(void *ptr)
+{
+ /* ptr->ops->map_free() has to go through one
+ * rcu grace period by itself.
+ */
+ bpf_map_put(ptr);
+}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
new file mode 100644
index 000000000000..177fadb689dc
--- /dev/null
+++ b/kernel/bpf/map_in_map.h
@@ -0,0 +1,23 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef __MAP_IN_MAP_H__
+#define __MAP_IN_MAP_H__
+
+#include <linux/types.h>
+
+struct file;
+struct bpf_map;
+
+struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
+void bpf_map_meta_free(struct bpf_map *map_meta);
+bool bpf_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1);
+void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
+ int ufd);
+void bpf_map_fd_put_ptr(void *ptr);
+
+#endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 22aa45cd0324..4dfd6f2ec2f9 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -264,7 +264,7 @@ static void stack_map_free(struct bpf_map *map)
put_callchain_buffers();
}
-static const struct bpf_map_ops stack_map_ops = {
+const struct bpf_map_ops stack_map_ops = {
.map_alloc = stack_map_alloc,
.map_free = stack_map_free,
.map_get_next_key = stack_map_get_next_key,
@@ -272,15 +272,3 @@ static const struct bpf_map_ops stack_map_ops = {
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
};
-
-static struct bpf_map_type_list stack_map_type __ro_after_init = {
- .ops = &stack_map_ops,
- .type = BPF_MAP_TYPE_STACK_TRACE,
-};
-
-static int __init register_stack_map(void)
-{
- bpf_register_map_type(&stack_map_type);
- return 0;
-}
-late_initcall(register_stack_map);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7af0dcc5d755..13642c73dca0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -27,30 +27,29 @@ DEFINE_PER_CPU(int, bpf_prog_active);
int sysctl_unprivileged_bpf_disabled __read_mostly;
-static LIST_HEAD(bpf_map_types);
+static const struct bpf_map_ops * const bpf_map_types[] = {
+#define BPF_PROG_TYPE(_id, _ops)
+#define BPF_MAP_TYPE(_id, _ops) \
+ [_id] = &_ops,
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
- struct bpf_map_type_list *tl;
struct bpf_map *map;
- list_for_each_entry(tl, &bpf_map_types, list_node) {
- if (tl->type == attr->map_type) {
- map = tl->ops->map_alloc(attr);
- if (IS_ERR(map))
- return map;
- map->ops = tl->ops;
- map->map_type = attr->map_type;
- return map;
- }
- }
- return ERR_PTR(-EINVAL);
-}
+ if (attr->map_type >= ARRAY_SIZE(bpf_map_types) ||
+ !bpf_map_types[attr->map_type])
+ return ERR_PTR(-EINVAL);
-/* boot time registration of different map implementations */
-void bpf_register_map_type(struct bpf_map_type_list *tl)
-{
- list_add(&tl->list_node, &bpf_map_types);
+ map = bpf_map_types[attr->map_type]->map_alloc(attr);
+ if (IS_ERR(map))
+ return map;
+ map->ops = bpf_map_types[attr->map_type];
+ map->map_type = attr->map_type;
+ return map;
}
void *bpf_map_area_alloc(size_t size)
@@ -215,7 +214,7 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD map_flags
+#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -352,6 +351,9 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_percpu_array_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
+ map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ err = -ENOTSUPP;
} else {
rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key);
@@ -438,11 +440,17 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_percpu_array_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
- map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
+ map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
rcu_read_lock();
err = bpf_fd_array_map_update_elem(map, f.file, key, value,
attr->flags);
rcu_read_unlock();
+ } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ rcu_read_lock();
+ err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
+ attr->flags);
+ rcu_read_unlock();
} else {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, attr->flags);
@@ -528,14 +536,18 @@ static int map_get_next_key(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
- goto err_put;
+ if (ukey) {
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+ } else {
+ key = NULL;
+ }
err = -ENOMEM;
next_key = kmalloc(map->key_size, GFP_USER);
@@ -564,79 +576,23 @@ err_put:
return err;
}
-static LIST_HEAD(bpf_prog_types);
+static const struct bpf_verifier_ops * const bpf_prog_types[] = {
+#define BPF_PROG_TYPE(_id, _ops) \
+ [_id] = &_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
- struct bpf_prog_type_list *tl;
-
- list_for_each_entry(tl, &bpf_prog_types, list_node) {
- if (tl->type == type) {
- prog->aux->ops = tl->ops;
- prog->type = type;
- return 0;
- }
- }
-
- return -EINVAL;
-}
-
-void bpf_register_prog_type(struct bpf_prog_type_list *tl)
-{
- list_add(&tl->list_node, &bpf_prog_types);
-}
-
-/* fixup insn->imm field of bpf_call instructions:
- * if (insn->imm == BPF_FUNC_map_lookup_elem)
- * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
- * else if (insn->imm == BPF_FUNC_map_update_elem)
- * insn->imm = bpf_map_update_elem - __bpf_call_base;
- * else ...
- *
- * this function is called after eBPF program passed verification
- */
-static void fixup_bpf_calls(struct bpf_prog *prog)
-{
- const struct bpf_func_proto *fn;
- int i;
+ if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
+ return -EINVAL;
- for (i = 0; i < prog->len; i++) {
- struct bpf_insn *insn = &prog->insnsi[i];
-
- if (insn->code == (BPF_JMP | BPF_CALL)) {
- /* we reach here when program has bpf_call instructions
- * and it passed bpf_check(), means that
- * ops->get_func_proto must have been supplied, check it
- */
- BUG_ON(!prog->aux->ops->get_func_proto);
-
- if (insn->imm == BPF_FUNC_get_route_realm)
- prog->dst_needed = 1;
- if (insn->imm == BPF_FUNC_get_prandom_u32)
- bpf_user_rnd_init_once();
- if (insn->imm == BPF_FUNC_xdp_adjust_head)
- prog->xdp_adjust_head = 1;
- if (insn->imm == BPF_FUNC_tail_call) {
- /* mark bpf_tail_call as different opcode
- * to avoid conditional branch in
- * interpeter for every normal call
- * and to prevent accidental JITing by
- * JIT compiler that doesn't support
- * bpf_tail_call yet
- */
- insn->imm = 0;
- insn->code |= BPF_X;
- continue;
- }
-
- fn = prog->aux->ops->get_func_proto(insn->imm);
- /* all functions that have prototype and verifier allowed
- * programs to call them, must be real in-kernel functions
- */
- BUG_ON(!fn->func);
- insn->imm = fn->func - __bpf_call_base;
- }
- }
+ prog->aux->ops = bpf_prog_types[type];
+ prog->type = type;
+ return 0;
}
/* drop refcnt on maps used by eBPF program and free auxilary data */
@@ -892,9 +848,6 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;
- /* fixup BPF_CALL->imm field */
- fixup_bpf_calls(prog);
-
/* eBPF program is ready to be JITed */
prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
@@ -1020,6 +973,28 @@ static int bpf_prog_detach(const union bpf_attr *attr)
}
#endif /* CONFIG_CGROUP_BPF */
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
+
+static int bpf_prog_test_run(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_prog *prog;
+ int ret = -ENOTSUPP;
+
+ if (CHECK_ATTR(BPF_PROG_TEST_RUN))
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->test.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->aux->ops->test_run)
+ ret = prog->aux->ops->test_run(prog, attr, uattr);
+
+ bpf_prog_put(prog);
+ return ret;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -1086,7 +1061,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET:
err = bpf_obj_get(&attr);
break;
-
#ifdef CONFIG_CGROUP_BPF
case BPF_PROG_ATTACH:
err = bpf_prog_attach(&attr);
@@ -1095,7 +1069,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_prog_detach(&attr);
break;
#endif
-
+ case BPF_PROG_TEST_RUN:
+ err = bpf_prog_test_run(&attr, uattr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 796b68d00119..c2ff608c1984 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -143,6 +143,8 @@ struct bpf_verifier_stack_elem {
#define BPF_COMPLEXITY_LIMIT_INSNS 65536
#define BPF_COMPLEXITY_LIMIT_STACK 1024
+#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
+
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
@@ -765,38 +767,56 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
}
}
-static int check_ptr_alignment(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg, int off, int size)
+static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+ int off, int size)
{
- if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
- if (off % size != 0) {
- verbose("misaligned access off %d size %d\n",
- off, size);
- return -EACCES;
- } else {
- return 0;
- }
- }
-
- if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
- /* misaligned access to packet is ok on x86,arm,arm64 */
- return 0;
-
if (reg->id && size != 1) {
- verbose("Unknown packet alignment. Only byte-sized access allowed\n");
+ verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n");
return -EACCES;
}
/* skb->data is NET_IP_ALIGN-ed */
- if (reg->type == PTR_TO_PACKET &&
- (NET_IP_ALIGN + reg->off + off) % size != 0) {
+ if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
verbose("misaligned packet access off %d+%d+%d size %d\n",
NET_IP_ALIGN, reg->off, off, size);
return -EACCES;
}
+
return 0;
}
+static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
+ int size)
+{
+ if (size != 1) {
+ verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_ptr_alignment(const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ switch (reg->type) {
+ case PTR_TO_PACKET:
+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
+ check_pkt_ptr_alignment(reg, off, size);
+ case PTR_TO_MAP_VALUE_ADJ:
+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
+ check_val_ptr_alignment(reg, size);
+ default:
+ if (off % size != 0) {
+ verbose("misaligned access off %d size %d\n",
+ off, size);
+ return -EACCES;
+ }
+
+ return 0;
+ }
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -818,7 +838,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
if (size < 0)
return size;
- err = check_ptr_alignment(env, reg, off, size);
+ err = check_ptr_alignment(reg, off, size);
if (err)
return err;
@@ -1197,6 +1217,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
func_id != BPF_FUNC_current_task_under_cgroup)
goto error;
break;
+ case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+ case BPF_MAP_TYPE_HASH_OF_MAPS:
+ if (func_id != BPF_FUNC_map_lookup_elem)
+ goto error;
default:
break;
}
@@ -1273,7 +1297,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
}
}
-static int check_call(struct bpf_verifier_env *env, int func_id)
+static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
struct bpf_verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
@@ -1357,6 +1381,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
+ struct bpf_insn_aux_data *insn_aux;
+
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
/* remember map_ptr, so that check_map_access()
@@ -1369,6 +1395,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].id = ++env->id_gen;
+ insn_aux = &env->insn_aux_data[insn_idx];
+ if (!insn_aux->map_ptr)
+ insn_aux->map_ptr = meta.map_ptr;
+ else if (insn_aux->map_ptr != meta.map_ptr)
+ insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
verbose("unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
@@ -1893,6 +1924,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
} else if (opcode == BPF_ADD &&
BPF_CLASS(insn->code) == BPF_ALU64 &&
+ dst_reg->type == PTR_TO_STACK &&
+ ((BPF_SRC(insn->code) == BPF_X &&
+ regs[insn->src_reg].type == CONST_IMM) ||
+ BPF_SRC(insn->code) == BPF_K)) {
+ if (BPF_SRC(insn->code) == BPF_X)
+ dst_reg->imm += regs[insn->src_reg].imm;
+ else
+ dst_reg->imm += insn->imm;
+ return 0;
+ } else if (opcode == BPF_ADD &&
+ BPF_CLASS(insn->code) == BPF_ALU64 &&
(dst_reg->type == PTR_TO_PACKET ||
(BPF_SRC(insn->code) == BPF_X &&
regs[insn->src_reg].type == PTR_TO_PACKET))) {
@@ -1925,6 +1967,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* register as unknown.
*/
if (env->allow_ptr_leaks &&
+ BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
(dst_reg->type == PTR_TO_MAP_VALUE ||
dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
@@ -1973,14 +2016,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
- regs[i].range = dst_reg->off;
+ /* keep the maximum range already checked */
+ regs[i].range = max(regs[i].range, dst_reg->off);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
- reg->range = dst_reg->off;
+ reg->range = max(reg->range, dst_reg->off);
}
}
@@ -2092,14 +2136,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
struct bpf_reg_state *reg = &regs[regno];
if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
- reg->type = type;
+ if (type == UNKNOWN_VALUE) {
+ __mark_reg_unknown_value(regs, regno);
+ } else if (reg->map_ptr->inner_map_meta) {
+ reg->type = CONST_PTR_TO_MAP;
+ reg->map_ptr = reg->map_ptr->inner_map_meta;
+ } else {
+ reg->type = type;
+ }
/* We don't need id from this point onwards anymore, thus we
* should better reset it, so that state pruning has chances
* to take effect.
*/
reg->id = 0;
- if (type == UNKNOWN_VALUE)
- __mark_reg_unknown_value(regs, regno);
}
}
@@ -2940,7 +2989,7 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- err = check_call(env, insn->imm);
+ err = check_call(env, insn->imm, insn_idx);
if (err)
return err;
@@ -3024,16 +3073,33 @@ process_bpf_exit:
return 0;
}
+static int check_map_prealloc(struct bpf_map *map)
+{
+ return (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
+ !(map->map_flags & BPF_F_NO_PREALLOC);
+}
+
static int check_map_prog_compatibility(struct bpf_map *map,
struct bpf_prog *prog)
{
- if (prog->type == BPF_PROG_TYPE_PERF_EVENT &&
- (map->map_type == BPF_MAP_TYPE_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_HASH) &&
- (map->map_flags & BPF_F_NO_PREALLOC)) {
- verbose("perf_event programs can only use preallocated hash map\n");
- return -EINVAL;
+ /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use
+ * preallocated hash maps, since doing memory allocation
+ * in overflow_handler can crash depending on where nmi got
+ * triggered.
+ */
+ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
+ if (!check_map_prealloc(map)) {
+ verbose("perf_event programs can only use preallocated hash map\n");
+ return -EINVAL;
+ }
+ if (map->inner_map_meta &&
+ !check_map_prealloc(map->inner_map_meta)) {
+ verbose("perf_event programs can only use preallocated inner hash map\n");
+ return -EINVAL;
+ }
}
return 0;
}
@@ -3162,6 +3228,41 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
insn->src_reg = 0;
}
+/* single env->prog->insni[off] instruction was replaced with the range
+ * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
+ * [0, off) and [off, end) to new locations, so the patched range stays zero
+ */
+static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
+ u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+
+ if (cnt == 1)
+ return 0;
+ new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len);
+ if (!new_data)
+ return -ENOMEM;
+ memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
+ memcpy(new_data + off + cnt - 1, old_data + off,
+ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ env->insn_aux_data = new_data;
+ vfree(old_data);
+ return 0;
+}
+
+static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ struct bpf_prog *new_prog;
+
+ new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
+ if (!new_prog)
+ return NULL;
+ if (adjust_insn_aux_data(env, new_prog->len, off, len))
+ return NULL;
+ return new_prog;
+}
+
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
@@ -3181,10 +3282,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
} else if (cnt) {
- new_prog = bpf_patch_insn_single(env->prog, 0,
- insn_buf, cnt);
+ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
+
env->prog = new_prog;
delta += cnt - 1;
}
@@ -3209,7 +3310,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
else
continue;
- if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
+ if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
@@ -3218,8 +3319,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
return -EINVAL;
}
- new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf,
- cnt);
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -3233,6 +3333,89 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
return 0;
}
+/* fixup insn->imm field of bpf_call instructions
+ * and inline eligible helpers as explicit sequence of BPF instructions
+ *
+ * this function is called after eBPF program passed verification
+ */
+static int fixup_bpf_calls(struct bpf_verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog;
+ struct bpf_insn *insn = prog->insnsi;
+ const struct bpf_func_proto *fn;
+ const int insn_cnt = prog->len;
+ struct bpf_insn insn_buf[16];
+ struct bpf_prog *new_prog;
+ struct bpf_map *map_ptr;
+ int i, cnt, delta = 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ continue;
+
+ if (insn->imm == BPF_FUNC_get_route_realm)
+ prog->dst_needed = 1;
+ if (insn->imm == BPF_FUNC_get_prandom_u32)
+ bpf_user_rnd_init_once();
+ if (insn->imm == BPF_FUNC_tail_call) {
+ /* If we tail call into other programs, we
+ * cannot make any assumptions since they can
+ * be replaced dynamically during runtime in
+ * the program array.
+ */
+ prog->cb_access = 1;
+
+ /* mark bpf_tail_call as different opcode to avoid
+ * conditional branch in the interpeter for every normal
+ * call and to prevent accidental JITing by JIT compiler
+ * that doesn't support bpf_tail_call yet
+ */
+ insn->imm = 0;
+ insn->code |= BPF_X;
+ continue;
+ }
+
+ if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) {
+ map_ptr = env->insn_aux_data[i + delta].map_ptr;
+ if (map_ptr == BPF_MAP_PTR_POISON ||
+ !map_ptr->ops->map_gen_lookup)
+ goto patch_call_imm;
+
+ cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose("bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
+ cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+
+ /* keep walking new program and skip insns we just inserted */
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+patch_call_imm:
+ fn = prog->aux->ops->get_func_proto(insn->imm);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ if (!fn->func) {
+ verbose("kernel subsystem misconfigured func %s#%d\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
+ insn->imm = fn->func - __bpf_call_base;
+ }
+
+ return 0;
+}
+
static void free_states(struct bpf_verifier_env *env)
{
struct bpf_verifier_state_list *sl, *sln;
@@ -3328,6 +3511,9 @@ skip_full_check:
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
+ if (ret == 0)
+ ret = fixup_bpf_calls(env);
+
if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);
/* verifier log exceeded user supplied buffer */