Diffstat (limited to 'kernel/bpf/arraymap.c')
-rw-r--r--	kernel/bpf/arraymap.c | 263 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 251 insertions(+), 12 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1c65ce0098a9..f0d19bbb9211 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -14,7 +14,7 @@
 #include "map_in_map.h"
 
 #define ARRAY_CREATE_FLAG_MASK \
-	(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
+	(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK)
 
 static void bpf_array_free_percpu(struct bpf_array *array)
 {
@@ -59,6 +59,10 @@ int array_map_alloc_check(union bpf_attr *attr)
 	    (percpu && numa_node != NUMA_NO_NODE))
 		return -EINVAL;
 
+	if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
+	    attr->map_flags & BPF_F_MMAPABLE)
+		return -EINVAL;
+
 	if (attr->value_size > KMALLOC_MAX_SIZE)
 		/* if value_size is bigger, the user space won't be able to
 		 * access the elements.
@@ -102,10 +106,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	}
 
 	array_size = sizeof(*array);
-	if (percpu)
+	if (percpu) {
 		array_size += (u64) max_entries * sizeof(void *);
-	else
-		array_size += (u64) max_entries * elem_size;
+	} else {
+		/* rely on vmalloc() to return page-aligned memory and
+		 * ensure array->value is exactly page-aligned
+		 */
+		if (attr->map_flags & BPF_F_MMAPABLE) {
+			array_size = PAGE_ALIGN(array_size);
+			array_size += PAGE_ALIGN((u64) max_entries * elem_size);
+		} else {
+			array_size += (u64) max_entries * elem_size;
+		}
+	}
 
 	/* make sure there is no u32 overflow later in round_up() */
 	cost = array_size;
@@ -117,7 +130,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(ret);
 
 	/* allocate all map elements and zero-initialize them */
-	array = bpf_map_area_alloc(array_size, numa_node);
+	if (attr->map_flags & BPF_F_MMAPABLE) {
+		void *data;
+
+		/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
+		data = bpf_map_area_mmapable_alloc(array_size, numa_node);
+		if (!data) {
+			bpf_map_charge_finish(&mem);
+			return ERR_PTR(-ENOMEM);
+		}
+		array = data + PAGE_ALIGN(sizeof(struct bpf_array))
+			- offsetof(struct bpf_array, value);
+	} else {
+		array = bpf_map_area_alloc(array_size, numa_node);
+	}
 	if (!array) {
 		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
@@ -350,6 +376,11 @@ static int array_map_delete_elem(struct bpf_map *map, void *key)
 	return -EINVAL;
 }
 
+static void *array_map_vmalloc_addr(struct bpf_array *array)
+{
+	return (void *)round_down((unsigned long)array, PAGE_SIZE);
+}
+
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
 static void array_map_free(struct bpf_map *map)
 {
@@ -365,7 +396,10 @@ static void array_map_free(struct bpf_map *map)
 	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
 		bpf_array_free_percpu(array);
 
-	bpf_map_area_free(array);
+	if (array->map.map_flags & BPF_F_MMAPABLE)
+		bpf_map_area_free(array_map_vmalloc_addr(array));
+	else
+		bpf_map_area_free(array);
 }
 
 static void array_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -444,6 +478,17 @@ static int array_map_check_btf(const struct bpf_map *map,
 	return 0;
 }
 
+static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;
+
+	if (!(map->map_flags & BPF_F_MMAPABLE))
+		return -EINVAL;
+
+	return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff);
+}
+
 const struct bpf_map_ops array_map_ops = {
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -455,6 +500,7 @@ const struct bpf_map_ops array_map_ops = {
 	.map_gen_lookup = array_map_gen_lookup,
 	.map_direct_value_addr = array_map_direct_value_addr,
 	.map_direct_value_meta = array_map_direct_value_meta,
+	.map_mmap = array_map_mmap,
 	.map_seq_show_elem = array_map_seq_show_elem,
 	.map_check_btf = array_map_check_btf,
 };
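Taken together, the hunks above are what make a plain BPF_MAP_TYPE_ARRAY mmap()-able: the backing store comes from vmalloc() so that array->value starts exactly on a page boundary, and array_map_mmap() remaps that region with a pgoff that hides the struct bpf_array header from userspace. A minimal userspace sketch of the resulting interface (not part of this patch; it assumes headers that define BPF_F_MMAPABLE and a kernel carrying this change, and uses the raw bpf(2) syscall):

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	union bpf_attr attr;
	long *vals;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = sizeof(__u32);
	attr.value_size  = sizeof(long);
	attr.max_entries = 512;		/* 512 * 8 bytes == one page */
	attr.map_flags   = BPF_F_MMAPABLE;

	fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	if (fd < 0)
		return 1;

	/* Offset 0 of the mapping is array->value, which the allocation
	 * path above made exactly page-aligned.
	 */
	vals = mmap(NULL, 512 * sizeof(long), PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (vals == MAP_FAILED)
		return 1;

	vals[0] = 42;	/* no BPF_MAP_UPDATE_ELEM round trip needed */
	printf("value[0] = %ld\n", vals[0]);
	return 0;
}

Stores through vals land directly in the map's value pages, so BPF-side lookups observe them without any syscall in between.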
@@ -540,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 	if (IS_ERR(new_ptr))
 		return PTR_ERR(new_ptr);
 
-	old_ptr = xchg(array->ptrs + index, new_ptr);
+	if (map->ops->map_poke_run) {
+		mutex_lock(&array->aux->poke_mutex);
+		old_ptr = xchg(array->ptrs + index, new_ptr);
+		map->ops->map_poke_run(map, index, old_ptr, new_ptr);
+		mutex_unlock(&array->aux->poke_mutex);
+	} else {
+		old_ptr = xchg(array->ptrs + index, new_ptr);
+	}
+
 	if (old_ptr)
 		map->ops->map_fd_put_ptr(old_ptr);
-
 	return 0;
 }
 
@@ -556,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 	if (index >= array->map.max_entries)
 		return -E2BIG;
 
-	old_ptr = xchg(array->ptrs + index, NULL);
+	if (map->ops->map_poke_run) {
+		mutex_lock(&array->aux->poke_mutex);
+		old_ptr = xchg(array->ptrs + index, NULL);
+		map->ops->map_poke_run(map, index, old_ptr, NULL);
+		mutex_unlock(&array->aux->poke_mutex);
+	} else {
+		old_ptr = xchg(array->ptrs + index, NULL);
+	}
+
 	if (old_ptr) {
 		map->ops->map_fd_put_ptr(old_ptr);
 		return 0;
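With these two hunks, fd-array updates and deletions route the xchg() through map_poke_run() under poke_mutex whenever the map type provides that callback, so every JITed tail call site targeting the changed slot is patched in lockstep with the map contents. Userspace is unchanged; an ordinary prog array update like the sketch below (hypothetical map_fd/prog_fd, raw bpf(2) syscall) is what ends up driving the patching:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Install prog_fd at 'key' in a BPF_MAP_TYPE_PROG_ARRAY. With this
 * patch applied, the kernel also rewrites every direct jump that was
 * emitted for tail calls into this slot, instead of leaving the
 * update visible only through the indirect-call path.
 */
static int prog_array_set(int map_fd, __u32 key, __u32 prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)&key;
	attr.value  = (__u64)(unsigned long)&prog_fd;
	attr.flags  = BPF_ANY;

	return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}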
@@ -625,17 +686,195 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
+struct prog_poke_elem {
+	struct list_head list;
+	struct bpf_prog_aux *aux;
+};
+
+static int prog_array_map_poke_track(struct bpf_map *map,
+				     struct bpf_prog_aux *prog_aux)
+{
+	struct prog_poke_elem *elem;
+	struct bpf_array_aux *aux;
+	int ret = 0;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	mutex_lock(&aux->poke_mutex);
+	list_for_each_entry(elem, &aux->poke_progs, list) {
+		if (elem->aux == prog_aux)
+			goto out;
+	}
+
+	elem = kmalloc(sizeof(*elem), GFP_KERNEL);
+	if (!elem) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&elem->list);
+	/* We must track the program's aux info at this point in time
+	 * since the program pointer itself may not be stable yet, see
+	 * also comment in prog_array_map_poke_run().
+	 */
+	elem->aux = prog_aux;
+
+	list_add_tail(&elem->list, &aux->poke_progs);
+out:
+	mutex_unlock(&aux->poke_mutex);
+	return ret;
+}
+
+static void prog_array_map_poke_untrack(struct bpf_map *map,
+					struct bpf_prog_aux *prog_aux)
+{
+	struct prog_poke_elem *elem, *tmp;
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	mutex_lock(&aux->poke_mutex);
+	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+		if (elem->aux == prog_aux) {
+			list_del_init(&elem->list);
+			kfree(elem);
+			break;
+		}
+	}
+	mutex_unlock(&aux->poke_mutex);
+}
+
+static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
+				    struct bpf_prog *old,
+				    struct bpf_prog *new)
+{
+	struct prog_poke_elem *elem;
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
+
+	list_for_each_entry(elem, &aux->poke_progs, list) {
+		struct bpf_jit_poke_descriptor *poke;
+		int i, ret;
+
+		for (i = 0; i < elem->aux->size_poke_tab; i++) {
+			poke = &elem->aux->poke_tab[i];
+
+			/* Few things to be aware of:
+			 *
+			 * 1) We can only ever access aux in this context, but
+			 *    not aux->prog since it might not be stable yet and
+			 *    there could be danger of use after free otherwise.
+			 * 2) Initially when we start tracking aux, the program
+			 *    is not JITed yet and also does not have a kallsyms
+			 *    entry. We skip these as poke->ip_stable is not
+			 *    active yet. The JIT will do the final fixup before
+			 *    setting it stable. The various poke->ip_stable are
+			 *    successively activated, so tail call updates can
+			 *    arrive from here while JIT is still finishing its
+			 *    final fixup for non-activated poke entries.
+			 * 3) On program teardown, the program's kallsym entry gets
+			 *    removed out of RCU callback, but we can only untrack
+			 *    from sleepable context, therefore bpf_arch_text_poke()
+			 *    might not see that this is in BPF text section and
+			 *    bails out with -EINVAL. As these are unreachable since
+			 *    RCU grace period already passed, we simply skip them.
+			 * 4) Also programs reaching refcount of zero while patching
+			 *    is in progress is okay since we're protected under
+			 *    poke_mutex and untrack the programs before the JIT
+			 *    buffer is freed. When we're still in the middle of
+			 *    patching and suddenly kallsyms entry of the program
+			 *    gets evicted, we just skip the rest which is fine due
+			 *    to point 3).
+			 * 5) Any other error happening below from bpf_arch_text_poke()
+			 *    is an unexpected bug.
+			 */
+			if (!READ_ONCE(poke->ip_stable))
+				continue;
+			if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
+				continue;
+			if (poke->tail_call.map != map ||
+			    poke->tail_call.key != key)
+				continue;
+
+			ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP,
+						 old ? (u8 *)old->bpf_func +
+						 poke->adj_off : NULL,
+						 new ? (u8 *)new->bpf_func +
+						 poke->adj_off : NULL);
+			BUG_ON(ret < 0 && ret != -EINVAL);
+		}
+	}
+}
+
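prog_array_map_poke_run() is only the consumer; the producer side lives in the verifier and the arch JIT elsewhere in this series. The fragment below is a hedged sketch of that side; bpf_jit_add_poke_descriptor() and the exact call sites are assumptions from the surrounding series, not part of this diff:

/* Sketch: the verifier registers one poke descriptor per direct tail
 * call and tracks the program on the target map (assumed API).
 */
struct bpf_jit_poke_descriptor desc = {
	.reason    = BPF_POKE_REASON_TAIL_CALL,
	.tail_call = { .map = map, .key = key },
};

ret = bpf_jit_add_poke_descriptor(prog, &desc);
if (ret < 0)
	return ret;
ret = map->ops->map_poke_track(map, prog->aux);
if (ret < 0)
	return ret;

/* Later, in the arch JIT, once the final image address is known:
 *
 *	poke->ip = image + offset_of_call_site;
 *	(emit a 5-byte nop or a direct jmp at poke->ip)
 *	poke->ip_stable = true;
 *
 * From this point on, prog_array_map_poke_run() may repatch the site
 * at any time; entries that never became stable are simply skipped,
 * matching point 2) of the comment above.
 */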
+static void prog_array_map_clear_deferred(struct work_struct *work)
+{
+	struct bpf_map *map = container_of(work, struct bpf_array_aux,
+					   work)->map;
+	bpf_fd_array_map_clear(map);
+	bpf_map_put(map);
+}
+
+static void prog_array_map_clear(struct bpf_map *map)
+{
+	struct bpf_array_aux *aux = container_of(map, struct bpf_array,
+						 map)->aux;
+	bpf_map_inc(map);
+	schedule_work(&aux->work);
+}
+
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_array_aux *aux;
+	struct bpf_map *map;
+
+	aux = kzalloc(sizeof(*aux), GFP_KERNEL);
+	if (!aux)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_WORK(&aux->work, prog_array_map_clear_deferred);
+	INIT_LIST_HEAD(&aux->poke_progs);
+	mutex_init(&aux->poke_mutex);
+
+	map = array_map_alloc(attr);
+	if (IS_ERR(map)) {
+		kfree(aux);
+		return map;
+	}
+
+	container_of(map, struct bpf_array, map)->aux = aux;
+	aux->map = map;
+
+	return map;
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+	struct prog_poke_elem *elem, *tmp;
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+		list_del_init(&elem->list);
+		kfree(elem);
+	}
+	kfree(aux);
+	fd_array_map_free(map);
+}
+
 const struct bpf_map_ops prog_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
-	.map_alloc = array_map_alloc,
-	.map_free = fd_array_map_free,
+	.map_alloc = prog_array_map_alloc,
+	.map_free = prog_array_map_free,
+	.map_poke_track = prog_array_map_poke_track,
+	.map_poke_untrack = prog_array_map_poke_untrack,
+	.map_poke_run = prog_array_map_poke_run,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
-	.map_release_uref = bpf_fd_array_map_clear,
+	.map_release_uref = prog_array_map_clear,
 	.map_seq_show_elem = prog_array_map_seq_show_elem,
 };
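Two closing notes. First, the clear is deferred because map_poke_run() takes poke_mutex, a sleeping lock: prog_array_map_clear() grabs a map reference and punts the real bpf_fd_array_map_clear() to a workqueue rather than clearing from a context that may not be allowed to sleep; the bpf_map_inc() keeps the map alive until the work item runs. Second, the code above leans on a bpf_array_aux counterpart added to include/linux/bpf.h elsewhere in the series; the reconstruction below is an assumption limited to the fields this file actually touches, not a quote of that header:

/* Assumed shape of the header-side struct (include/linux/bpf.h);
 * only fields referenced by kernel/bpf/arraymap.c above are shown.
 */
struct bpf_array_aux {
	/* Programs with direct jumps into programs part of this array. */
	struct list_head poke_progs;
	struct bpf_map *map;
	struct mutex poke_mutex;
	struct work_struct work;
};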