diff options
Diffstat (limited to 'kernel/bpf/arraymap.c')
-rw-r--r-- | kernel/bpf/arraymap.c | 265 |
1 files changed, 253 insertions, 12 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1c65ce0098a9..95d77770353c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -14,7 +14,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) + (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -59,6 +59,10 @@ int array_map_alloc_check(union bpf_attr *attr) (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_flags & BPF_F_MMAPABLE) + return -EINVAL; + if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. @@ -102,10 +106,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } array_size = sizeof(*array); - if (percpu) + if (percpu) { array_size += (u64) max_entries * sizeof(void *); - else - array_size += (u64) max_entries * elem_size; + } else { + /* rely on vmalloc() to return page-aligned memory and + * ensure array->value is exactly page-aligned + */ + if (attr->map_flags & BPF_F_MMAPABLE) { + array_size = PAGE_ALIGN(array_size); + array_size += PAGE_ALIGN((u64) max_entries * elem_size); + } else { + array_size += (u64) max_entries * elem_size; + } + } /* make sure there is no u32 overflow later in round_up() */ cost = array_size; @@ -117,7 +130,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ - array = bpf_map_area_alloc(array_size, numa_node); + if (attr->map_flags & BPF_F_MMAPABLE) { + void *data; + + /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ + data = bpf_map_area_mmapable_alloc(array_size, numa_node); + if (!data) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + array = data + PAGE_ALIGN(sizeof(struct bpf_array)) + - offsetof(struct bpf_array, value); + } else { + array = bpf_map_area_alloc(array_size, numa_node); + } if (!array) { bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); @@ -350,6 +376,11 @@ static int array_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } +static void *array_map_vmalloc_addr(struct bpf_array *array) +{ + return (void *)round_down((unsigned long)array, PAGE_SIZE); +} + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { @@ -365,7 +396,10 @@ static void array_map_free(struct bpf_map *map) if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); - bpf_map_area_free(array); + if (array->map.map_flags & BPF_F_MMAPABLE) + bpf_map_area_free(array_map_vmalloc_addr(array)); + else + bpf_map_area_free(array); } static void array_map_seq_show_elem(struct bpf_map *map, void *key, @@ -444,6 +478,17 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } +static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT; + + if (!(map->map_flags & BPF_F_MMAPABLE)) + return -EINVAL; + + return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff); +} + const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -455,8 +500,11 @@ const struct bpf_map_ops array_map_ops = { .map_gen_lookup = array_map_gen_lookup, .map_direct_value_addr = array_map_direct_value_addr, .map_direct_value_meta = array_map_direct_value_meta, + .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, + .map_lookup_batch = generic_map_lookup_batch, + .map_update_batch = generic_map_update_batch, }; const struct bpf_map_ops percpu_array_map_ops = { @@ -540,10 +588,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); - old_ptr = xchg(array->ptrs + index, new_ptr); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, new_ptr); + map->ops->map_poke_run(map, index, old_ptr, new_ptr); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, new_ptr); + } + if (old_ptr) map->ops->map_fd_put_ptr(old_ptr); - return 0; } @@ -556,7 +611,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) if (index >= array->map.max_entries) return -E2BIG; - old_ptr = xchg(array->ptrs + index, NULL); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, NULL); + map->ops->map_poke_run(map, index, old_ptr, NULL); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, NULL); + } + if (old_ptr) { map->ops->map_fd_put_ptr(old_ptr); return 0; @@ -625,17 +688,195 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +struct prog_poke_elem { + struct list_head list; + struct bpf_prog_aux *aux; +}; + +static int prog_array_map_poke_track(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + int ret = 0; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry(elem, &aux->poke_progs, list) { + if (elem->aux == prog_aux) + goto out; + } + + elem = kmalloc(sizeof(*elem), GFP_KERNEL); + if (!elem) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&elem->list); + /* We must track the program's aux info at this point in time + * since the program pointer itself may not be stable yet, see + * also comment in prog_array_map_poke_run(). + */ + elem->aux = prog_aux; + + list_add_tail(&elem->list, &aux->poke_progs); +out: + mutex_unlock(&aux->poke_mutex); + return ret; +} + +static void prog_array_map_poke_untrack(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem, *tmp; + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + if (elem->aux == prog_aux) { + list_del_init(&elem->list); + kfree(elem); + break; + } + } + mutex_unlock(&aux->poke_mutex); +} + +static void prog_array_map_poke_run(struct bpf_map *map, u32 key, + struct bpf_prog *old, + struct bpf_prog *new) +{ + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); + + list_for_each_entry(elem, &aux->poke_progs, list) { + struct bpf_jit_poke_descriptor *poke; + int i, ret; + + for (i = 0; i < elem->aux->size_poke_tab; i++) { + poke = &elem->aux->poke_tab[i]; + + /* Few things to be aware of: + * + * 1) We can only ever access aux in this context, but + * not aux->prog since it might not be stable yet and + * there could be danger of use after free otherwise. + * 2) Initially when we start tracking aux, the program + * is not JITed yet and also does not have a kallsyms + * entry. We skip these as poke->ip_stable is not + * active yet. The JIT will do the final fixup before + * setting it stable. The various poke->ip_stable are + * successively activated, so tail call updates can + * arrive from here while JIT is still finishing its + * final fixup for non-activated poke entries. + * 3) On program teardown, the program's kallsym entry gets + * removed out of RCU callback, but we can only untrack + * from sleepable context, therefore bpf_arch_text_poke() + * might not see that this is in BPF text section and + * bails out with -EINVAL. As these are unreachable since + * RCU grace period already passed, we simply skip them. + * 4) Also programs reaching refcount of zero while patching + * is in progress is okay since we're protected under + * poke_mutex and untrack the programs before the JIT + * buffer is freed. When we're still in the middle of + * patching and suddenly kallsyms entry of the program + * gets evicted, we just skip the rest which is fine due + * to point 3). + * 5) Any other error happening below from bpf_arch_text_poke() + * is a unexpected bug. + */ + if (!READ_ONCE(poke->ip_stable)) + continue; + if (poke->reason != BPF_POKE_REASON_TAIL_CALL) + continue; + if (poke->tail_call.map != map || + poke->tail_call.key != key) + continue; + + ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, + old ? (u8 *)old->bpf_func + + poke->adj_off : NULL, + new ? (u8 *)new->bpf_func + + poke->adj_off : NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } + } +} + +static void prog_array_map_clear_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_array_aux, + work)->map; + bpf_fd_array_map_clear(map); + bpf_map_put(map); +} + +static void prog_array_map_clear(struct bpf_map *map) +{ + struct bpf_array_aux *aux = container_of(map, struct bpf_array, + map)->aux; + bpf_map_inc(map); + schedule_work(&aux->work); +} + +static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array_aux *aux; + struct bpf_map *map; + + aux = kzalloc(sizeof(*aux), GFP_KERNEL); + if (!aux) + return ERR_PTR(-ENOMEM); + + INIT_WORK(&aux->work, prog_array_map_clear_deferred); + INIT_LIST_HEAD(&aux->poke_progs); + mutex_init(&aux->poke_mutex); + + map = array_map_alloc(attr); + if (IS_ERR(map)) { + kfree(aux); + return map; + } + + container_of(map, struct bpf_array, map)->aux = aux; + aux->map = map; + + return map; +} + +static void prog_array_map_free(struct bpf_map *map) +{ + struct prog_poke_elem *elem, *tmp; + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + list_del_init(&elem->list); + kfree(elem); + } + kfree(aux); + fd_array_map_free(map); +} + const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, - .map_alloc = array_map_alloc, - .map_free = fd_array_map_free, + .map_alloc = prog_array_map_alloc, + .map_free = prog_array_map_free, + .map_poke_track = prog_array_map_poke_track, + .map_poke_untrack = prog_array_map_poke_untrack, + .map_poke_run = prog_array_map_poke_run, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, - .map_release_uref = bpf_fd_array_map_clear, + .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, }; |