Diffstat (limited to 'kernel/bpf/hashtab.c')
-rw-r--r--	kernel/bpf/hashtab.c	438
1 file changed, 394 insertions, 44 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index ad1bc67aff1b..34debc1a9641 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -15,6 +15,7 @@
 #include <linux/filter.h>
 #include <linux/vmalloc.h>
 #include "percpu_freelist.h"
+#include "bpf_lru_list.h"
 
 struct bucket {
 	struct hlist_head head;
@@ -25,7 +26,10 @@ struct bpf_htab {
 	struct bpf_map map;
 	struct bucket *buckets;
 	void *elems;
-	struct pcpu_freelist freelist;
+	union {
+		struct pcpu_freelist freelist;
+		struct bpf_lru lru;
+	};
 	void __percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
@@ -48,11 +52,26 @@ struct htab_elem {
 	union {
 		struct rcu_head rcu;
 		enum extra_elem_state state;
+		struct bpf_lru_node lru_node;
 	};
 	u32 hash;
 	char key[0] __aligned(8);
 };
 
+static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
+
+static bool htab_is_lru(const struct bpf_htab *htab)
+{
+	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
+		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+}
+
+static bool htab_is_percpu(const struct bpf_htab *htab)
+{
+	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+}
+
 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
 				     void __percpu *pptr)
 {
@@ -73,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab)
 {
 	int i;
 
-	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+	if (!htab_is_percpu(htab))
 		goto free_elems;
 
 	for (i = 0; i < htab->map.max_entries; i++) {
@@ -87,7 +106,22 @@ free_elems:
 	vfree(htab->elems);
 }
 
-static int prealloc_elems_and_freelist(struct bpf_htab *htab)
+static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
+					  u32 hash)
+{
+	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
+	struct htab_elem *l;
+
+	if (node) {
+		l = container_of(node, struct htab_elem, lru_node);
+		memcpy(l->key, key, htab->map.key_size);
+		return l;
+	}
+
+	return NULL;
+}
+
+static int prealloc_init(struct bpf_htab *htab)
 {
 	int err = -ENOMEM, i;
 
@@ -95,7 +129,7 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)
 	if (!htab->elems)
 		return -ENOMEM;
 
-	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+	if (!htab_is_percpu(htab))
 		goto skip_percpu_elems;
 
 	for (i = 0; i < htab->map.max_entries; i++) {
@@ -110,12 +144,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)
 	}
 
 skip_percpu_elems:
-	err = pcpu_freelist_init(&htab->freelist);
+	if (htab_is_lru(htab))
+		err = bpf_lru_init(&htab->lru,
+				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
+				   offsetof(struct htab_elem, hash) -
+				   offsetof(struct htab_elem, lru_node),
+				   htab_lru_map_delete_node,
+				   htab);
+	else
+		err = pcpu_freelist_init(&htab->freelist);
+
 	if (err)
 		goto free_elems;
 
-	pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size,
-			       htab->map.max_entries);
+	if (htab_is_lru(htab))
+		bpf_lru_populate(&htab->lru, htab->elems,
+				 offsetof(struct htab_elem, lru_node),
+				 htab->elem_size, htab->map.max_entries);
+	else
+		pcpu_freelist_populate(&htab->freelist, htab->elems,
+				       htab->elem_size, htab->map.max_entries);
+
 	return 0;
 
 free_elems:
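Worth pausing on the hunk above: bpf_lru_init() is told how far htab_elem's hash field sits past the embedded lru_node, and prealloc_lru_pop() later recovers the whole element from the bpf_lru_node the LRU layer hands back. A minimal standalone sketch of that container_of()/offsetof() arithmetic follows (plain userspace C with simplified stand-in structs, not kernel code):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct lru_node {			/* stand-in for struct bpf_lru_node */
	struct lru_node *next;
};

struct elem {				/* stand-in for struct htab_elem */
	struct lru_node lru_node;
	unsigned int hash;
	char key[8];
};

int main(void)
{
	struct elem e;
	/* what a free-node pop hands back: a pointer to the embedded node */
	struct lru_node *node = &e.lru_node;
	struct elem *l = container_of(node, struct elem, lru_node);

	/* the delta below is what the hunk above passes to bpf_lru_init(),
	 * so the LRU code can read an element's hash given only its node */
	printf("recovered element: %s, hash offset past lru_node: %zu\n",
	       l == &e ? "yes" : "no",
	       offsetof(struct elem, hash) - offsetof(struct elem, lru_node));
	return 0;
}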
@@ -123,6 +172,16 @@ free_elems:
 	return err;
 }
 
+static void prealloc_destroy(struct bpf_htab *htab)
+{
+	htab_free_elems(htab);
+
+	if (htab_is_lru(htab))
+		bpf_lru_destroy(&htab->lru);
+	else
+		pcpu_freelist_destroy(&htab->freelist);
+}
+
 static int alloc_extra_elems(struct bpf_htab *htab)
 {
 	void __percpu *pptr;
@@ -143,15 +202,37 @@ static int alloc_extra_elems(struct bpf_htab *htab)
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
-	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
+	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
+		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+	/* percpu_lru means each cpu has its own LRU list.
+	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
+	 * the map's value itself is percpu.  percpu_lru has
+	 * nothing to do with the map's value.
+	 */
+	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
+	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 	struct bpf_htab *htab;
 	int err, i;
 	u64 cost;
 
-	if (attr->map_flags & ~BPF_F_NO_PREALLOC)
+	if (lru && !capable(CAP_SYS_ADMIN))
+		/* LRU implementation is much complicated than other
+		 * maps.  Hence, limit to CAP_SYS_ADMIN for now.
+		 */
+		return ERR_PTR(-EPERM);
+
+	if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
 		/* reserved bits should not be used */
 		return ERR_PTR(-EINVAL);
 
+	if (!lru && percpu_lru)
+		return ERR_PTR(-EINVAL);
+
+	if (lru && !prealloc)
+		return ERR_PTR(-ENOTSUPP);
+
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
@@ -171,6 +252,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	    htab->map.value_size == 0)
 		goto free_htab;
 
+	if (percpu_lru) {
+		/* ensure each CPU's lru list has >=1 elements.
+		 * since we are at it, make each lru list has the same
+		 * number of elements.
+		 */
+		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
+		if (htab->map.max_entries < attr->max_entries)
+			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
+	}
+
 	/* hash table size must be power of 2 */
 	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
 
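A quick sketch of the rounding the percpu_lru branch above performs, assuming a hypothetical machine with 16 possible CPUs. The macros mirror the kernel's roundup()/rounddown(); this is an illustration, not kernel code:

#include <stdio.h>
#include <stdint.h>

#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))
#define ROUNDDOWN(x, y)	(((x) / (y)) * (y))

int main(void)
{
	uint32_t ncpus = 16;		/* stand-in for num_possible_cpus() */
	uint32_t requested = 1000;	/* attr->max_entries from userspace */
	uint32_t max_entries = ROUNDUP(requested, ncpus);	/* 1008 */

	/* the u32 roundup can wrap for huge requests; the kernel then falls
	 * back to rounding down, as in the hunk above */
	if (max_entries < requested)
		max_entries = ROUNDDOWN(requested, ncpus);

	printf("%u requested -> %u total, %u per per-cpu LRU list\n",
	       requested, max_entries, max_entries / ncpus);
	return 0;
}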
@@ -241,14 +334,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
-	if (!percpu) {
+	if (!percpu && !lru) {
+		/* lru itself can remove the least used element, so
+		 * there is no need for an extra elem during map_update.
+		 */
 		err = alloc_extra_elems(htab);
 		if (err)
 			goto free_buckets;
 	}
 
-	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
-		err = prealloc_elems_and_freelist(htab);
+	if (prealloc) {
+		err = prealloc_init(htab);
 		if (err)
 			goto free_extra_elems;
 	}
@@ -323,6 +419,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
 	return NULL;
 }
 
+static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+	if (l) {
+		bpf_lru_node_set_ref(&l->lru_node);
+		return l->key + round_up(map->key_size, 8);
+	}
+
+	return NULL;
+}
+
+/* It is called from the bpf_lru_list when the LRU needs to delete
+ * older elements from the htab.
+ */
+static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
+{
+	struct bpf_htab *htab = (struct bpf_htab *)arg;
+	struct htab_elem *l, *tgt_l;
+	struct hlist_head *head;
+	unsigned long flags;
+	struct bucket *b;
+
+	tgt_l = container_of(node, struct htab_elem, lru_node);
+	b = __select_bucket(htab, tgt_l->hash);
+	head = &b->head;
+
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	hlist_for_each_entry_rcu(l, head, hash_node)
+		if (l == tgt_l) {
+			hlist_del_rcu(&l->hash_node);
+			break;
+		}
+
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+
+	return l == tgt_l;
+}
+
 /* Called from syscall */
 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
@@ -420,6 +556,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 	}
 }
 
+static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
+			    void *value, bool onallcpus)
+{
+	if (!onallcpus) {
+		/* copy true value_size bytes */
+		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+	} else {
+		u32 size = round_up(htab->map.value_size, 8);
+		int off = 0, cpu;
+
+		for_each_possible_cpu(cpu) {
+			bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+					value + off, size);
+			off += size;
+		}
+	}
+}
+
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
 					 bool percpu, bool onallcpus,
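pcpu_copy_value() above is the helper the update paths below share. For the onallcpus case it assumes the layout userspace uses with per-cpu maps: one slot per possible CPU, each padded to an 8-byte multiple. A small standalone illustration of that stride follows (sizes and names are made up for the demo, not kernel code):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define NCPUS_DEMO 4			/* stand-in for num_possible_cpus() */

static uint32_t round_up8(uint32_t x)
{
	return (x + 7) & ~7u;
}

int main(void)
{
	uint32_t value_size = 6;		 /* the map's real value_size */
	uint32_t stride = round_up8(value_size); /* 8: one padded slot per CPU */
	uint8_t user_buf[NCPUS_DEMO * 8];	 /* what userspace hands in */
	uint8_t percpu_area[NCPUS_DEMO][8];	 /* stand-in for the percpu alloc */
	uint32_t off = 0;
	int cpu;

	memset(user_buf, 0xab, sizeof(user_buf));

	/* mirrors the for_each_possible_cpu() loop above: copy one padded
	 * slot per CPU, advancing through the user buffer by the same stride */
	for (cpu = 0; cpu < NCPUS_DEMO; cpu++) {
		memcpy(percpu_area[cpu], user_buf + off, stride);
		off += stride;
	}

	printf("copied %u bytes per cpu (value_size %u rounded up to 8)\n",
	       stride, value_size);
	return 0;
}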
@@ -479,18 +633,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			}
 		}
 
-		if (!onallcpus) {
-			/* copy true value_size bytes */
-			memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
-		} else {
-			int off = 0, cpu;
+		pcpu_copy_value(htab, pptr, value, onallcpus);
 
-			for_each_possible_cpu(cpu) {
-				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
-						value + off, size);
-				off += size;
-			}
-		}
 		if (!prealloc)
 			htab_elem_set_ptr(l_new, key_size, pptr);
 	} else {
@@ -571,6 +715,70 @@ err:
 	return ret;
 }
 
+static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+				    u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l_new, *l_old = NULL;
+	struct hlist_head *head;
+	unsigned long flags;
+	struct bucket *b;
+	u32 key_size, hash;
+	int ret;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		/* unknown flags */
+		return -EINVAL;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	/* For LRU, we need to alloc before taking bucket's
+	 * spinlock because getting free nodes from LRU may need
+	 * to remove older elements from htab and this removal
+	 * operation will need a bucket lock.
+	 */
+	l_new = prealloc_lru_pop(htab, key, hash);
+	if (!l_new)
+		return -ENOMEM;
+	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	l_old = lookup_elem_raw(head, hash, key, key_size);
+
+	ret = check_flags(htab, l_old, map_flags);
+	if (ret)
+		goto err;
+
+	/* add new element to the head of the list, so that
+	 * concurrent search will find it before old elem
+	 */
+	hlist_add_head_rcu(&l_new->hash_node, head);
+	if (l_old) {
+		bpf_lru_node_set_ref(&l_new->lru_node);
+		hlist_del_rcu(&l_old->hash_node);
+	}
+	ret = 0;
+
+err:
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+
+	if (ret)
+		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+	else if (l_old)
+		bpf_lru_push_free(&htab->lru, &l_old->lru_node);
+
+	return ret;
+}
+
 static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 					 void *value, u64 map_flags,
 					 bool onallcpus)
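htab_lru_map_update_elem() above is what makes the LRU map keep accepting updates once it is full: prealloc_lru_pop() either reuses a free node or evicts the least recently used element. A hedged userspace sketch of that behaviour follows; it assumes headers that already define BPF_MAP_TYPE_LRU_HASH, a kernel carrying this patch, and CAP_SYS_ADMIN (which the map type requires above). Error handling is trimmed.

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int main(void)
{
	union bpf_attr attr;
	__u64 val = 1;
	__u32 key;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_LRU_HASH;	/* gated on CAP_SYS_ADMIN above */
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(val);
	attr.max_entries = 128;
	fd = sys_bpf(BPF_MAP_CREATE, &attr);
	if (fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}

	/* Insert far more keys than max_entries.  A plain BPF_MAP_TYPE_HASH
	 * would start failing once full; the LRU map keeps accepting updates
	 * by evicting its least recently used elements instead.
	 */
	for (key = 0; key < 1024; key++) {
		memset(&attr, 0, sizeof(attr));
		attr.map_fd = fd;
		attr.key = (__u64)(unsigned long)&key;
		attr.value = (__u64)(unsigned long)&val;
		attr.flags = BPF_ANY;
		if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr))
			perror("BPF_MAP_UPDATE_ELEM");
	}

	close(fd);
	return 0;
}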
@@ -606,22 +814,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 		goto err;
 
 	if (l_old) {
-		void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
-		u32 size = htab->map.value_size;
-
 		/* per-cpu hash map can update value in-place */
-		if (!onallcpus) {
-			memcpy(this_cpu_ptr(pptr), value, size);
-		} else {
-			int off = 0, cpu;
-
-			size = round_up(size, 8);
-			for_each_possible_cpu(cpu) {
-				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
-						value + off, size);
-				off += size;
-			}
-		}
+		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+				value, onallcpus);
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
 					hash, true, onallcpus, false);
@@ -637,12 +832,84 @@ err:
 	return ret;
 }
 
+static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+					     void *value, u64 map_flags,
+					     bool onallcpus)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l_new = NULL, *l_old;
+	struct hlist_head *head;
+	unsigned long flags;
+	struct bucket *b;
+	u32 key_size, hash;
+	int ret;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		/* unknown flags */
+		return -EINVAL;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	/* For LRU, we need to alloc before taking bucket's
+	 * spinlock because LRU's elem alloc may need
+	 * to remove older elem from htab and this removal
+	 * operation will need a bucket lock.
+	 */
+	if (map_flags != BPF_EXIST) {
+		l_new = prealloc_lru_pop(htab, key, hash);
+		if (!l_new)
+			return -ENOMEM;
+	}
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	l_old = lookup_elem_raw(head, hash, key, key_size);
+
+	ret = check_flags(htab, l_old, map_flags);
+	if (ret)
+		goto err;
+
+	if (l_old) {
+		bpf_lru_node_set_ref(&l_old->lru_node);
+
+		/* per-cpu hash map can update value in-place */
+		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+				value, onallcpus);
+	} else {
+		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
+				value, onallcpus);
+		hlist_add_head_rcu(&l_new->hash_node, head);
+		l_new = NULL;
+	}
+	ret = 0;
+err:
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+	if (l_new)
+		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+	return ret;
+}
+
 static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 				       void *value, u64 map_flags)
 {
 	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
 }
 
+static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+					   void *value, u64 map_flags)
+{
+	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
+						 false);
+}
+
 /* Called from syscall or from eBPF program */
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
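One detail in the per-cpu LRU update above: when the caller passes BPF_EXIST, no free node is popped at all, because the update may only overwrite an element that is already hashed (check_flags() returns -ENOENT otherwise). A hedged userspace sketch of such a call; bpf_update_pcpu_lru() is a hypothetical wrapper, and values must point at one round_up(value_size, 8) slot per possible CPU, as in the earlier layout sketch.

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int bpf_update_pcpu_lru(int map_fd, const void *key,
			       const void *values, __u64 flags)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)key;
	attr.value = (__u64)(unsigned long)values;
	attr.flags = flags;	/* e.g. BPF_EXIST: only overwrite an existing elem */

	return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

A call like bpf_update_pcpu_lru(fd, &key, values, BPF_EXIST) then fails with errno ENOENT when the key is absent, instead of pulling a node off the free list.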
@@ -676,6 +943,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	return ret;
 }
 
+static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct bucket *b;
+	struct htab_elem *l;
+	unsigned long flags;
+	u32 hash, key_size;
+	int ret = -ENOENT;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (l) {
+		hlist_del_rcu(&l->hash_node);
+		ret = 0;
+	}
+
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+	if (l)
+		bpf_lru_push_free(&htab->lru, &l->lru_node);
+	return ret;
+}
+
 static void delete_all_elements(struct bpf_htab *htab)
 {
 	int i;
@@ -708,12 +1008,11 @@ static void htab_map_free(struct bpf_map *map)
 	 * not have executed. Wait for them.
 	 */
 	rcu_barrier();
-	if (htab->map.map_flags & BPF_F_NO_PREALLOC) {
+	if (htab->map.map_flags & BPF_F_NO_PREALLOC)
 		delete_all_elements(htab);
-	} else {
-		htab_free_elems(htab);
-		pcpu_freelist_destroy(&htab->freelist);
-	}
+	else
+		prealloc_destroy(htab);
+
 	free_percpu(htab->extra_elems);
 	kvfree(htab->buckets);
 	kfree(htab);
@@ -733,6 +1032,20 @@ static struct bpf_map_type_list htab_type __read_mostly = {
 	.type = BPF_MAP_TYPE_HASH,
 };
 
+static const struct bpf_map_ops htab_lru_ops = {
+	.map_alloc = htab_map_alloc,
+	.map_free = htab_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_lru_map_lookup_elem,
+	.map_update_elem = htab_lru_map_update_elem,
+	.map_delete_elem = htab_lru_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_lru_type __read_mostly = {
+	.ops = &htab_lru_ops,
+	.type = BPF_MAP_TYPE_LRU_HASH,
+};
+
 /* Called from eBPF program */
 static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
 {
@@ -744,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
 		return NULL;
 }
 
+static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+	if (l) {
+		bpf_lru_node_set_ref(&l->lru_node);
+		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
+	}
+
+	return NULL;
+}
+
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
 {
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l;
 	void __percpu *pptr;
 	int ret = -ENOENT;
@@ -761,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
 	l = __htab_map_lookup_elem(map, key);
 	if (!l)
 		goto out;
+	if (htab_is_lru(htab))
+		bpf_lru_node_set_ref(&l->lru_node);
 	pptr = htab_elem_get_ptr(l, map->key_size);
 	for_each_possible_cpu(cpu) {
 		bpf_long_memcpy(value + off,
@@ -776,10 +1104,16 @@ out:
 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
 			   u64 map_flags)
 {
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	int ret;
 
 	rcu_read_lock();
-	ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true);
+	if (htab_is_lru(htab))
+		ret = __htab_lru_percpu_map_update_elem(map, key, value,
+							map_flags, true);
+	else
+		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
+						    true);
 	rcu_read_unlock();
 
 	return ret;
@@ -799,10 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = {
 	.type = BPF_MAP_TYPE_PERCPU_HASH,
 };
 
+static const struct bpf_map_ops htab_lru_percpu_ops = {
+	.map_alloc = htab_map_alloc,
+	.map_free = htab_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+	.map_update_elem = htab_lru_percpu_map_update_elem,
+	.map_delete_elem = htab_lru_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = {
+	.ops = &htab_lru_percpu_ops,
+	.type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
+};
+
 static int __init register_htab_map(void)
 {
 	bpf_register_map_type(&htab_type);
 	bpf_register_map_type(&htab_percpu_type);
+	bpf_register_map_type(&htab_lru_type);
+	bpf_register_map_type(&htab_lru_percpu_type);
 	return 0;
 }
 late_initcall(register_htab_map);
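With both new map types registered above, a BPF program can use them like any other hash map; program-side lookups go through htab_lru_map_lookup_elem() and therefore mark the element as recently used, so hot keys survive eviction. A hedged, sample-style sketch of the program side follows. It assumes the SEC()/struct bpf_map_def/helper conventions of samples/bpf (bpf_helpers.h), a loader with CAP_SYS_ADMIN, and illustrative names throughout.

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* assumed: samples/bpf SEC(), bpf_map_def, helper stubs */

struct bpf_map_def SEC("maps") lru_counts = {
	.type = BPF_MAP_TYPE_LRU_HASH,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u64),
	.max_entries = 1024,
};

SEC("socket")
int count_protocols(struct __sk_buff *skb)
{
	__u32 key = skb->protocol;
	__u64 one = 1, *val;

	val = bpf_map_lookup_elem(&lru_counts, &key);
	if (val)
		/* the lookup above already set the LRU reference bit */
		__sync_fetch_and_add(val, 1);
	else
		/* on a full map this evicts the least recently used entry
		 * instead of failing like a plain BPF_MAP_TYPE_HASH would
		 */
		bpf_map_update_elem(&lru_counts, &key, &one, BPF_ANY);

	return 0;
}

char _license[] SEC("license") = "GPL";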