Diffstat (limited to 'kernel/bpf')
 -rw-r--r--  kernel/bpf/Makefile          3
 -rw-r--r--  kernel/bpf/bpf_lru_list.c  695
 -rw-r--r--  kernel/bpf/bpf_lru_list.h   84
 -rw-r--r--  kernel/bpf/cgroup.c        200
 -rw-r--r--  kernel/bpf/core.c           68
 -rw-r--r--  kernel/bpf/hashtab.c       441
 -rw-r--r--  kernel/bpf/helpers.c        12
 -rw-r--r--  kernel/bpf/inode.c          99
 -rw-r--r--  kernel/bpf/syscall.c       186
 -rw-r--r--  kernel/bpf/verifier.c      210
10 files changed, 1858 insertions(+), 140 deletions(-)
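
The series below adds LRU-managed hash maps (BPF_MAP_TYPE_LRU_HASH and BPF_MAP_TYPE_LRU_PERCPU_HASH, plus the BPF_F_NO_COMMON_LRU flag) on top of the new bpf_lru_list implementation. As a rough sketch of the userspace-facing side — the map type, flag and bpf_attr field names are taken from the diff, while the key/value sizes, the wrapper function and the error handling are placeholder assumptions — creating such a map boils down to:

/* Minimal sketch: create an LRU hash map via the bpf(2) syscall.
 * Assumes a <linux/bpf.h> uapi header that already carries
 * BPF_MAP_TYPE_LRU_HASH and BPF_F_NO_COMMON_LRU from this series.
 */
#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int create_lru_map(unsigned int max_entries, int percpu_lru)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_LRU_HASH;
	attr.key_size    = sizeof(__u32);	/* example key/value sizes */
	attr.value_size  = sizeof(__u64);
	attr.max_entries = max_entries;
	/* BPF_F_NO_COMMON_LRU gives each CPU its own LRU list instead of
	 * one shared list plus per-cpu local free/pending lists.
	 */
	attr.map_flags   = percpu_lru ? BPF_F_NO_COMMON_LRU : 0;

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

Per the hashtab.c changes, LRU maps are for now limited to CAP_SYS_ADMIN, require preallocation (BPF_F_NO_PREALLOC is rejected with -ENOTSUPP), and with BPF_F_NO_COMMON_LRU max_entries is rounded to a multiple of the number of possible CPUs so every per-cpu list gets the same share of elements.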
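
The cgroup.c and syscall.c changes introduce cgroup-attached programs (BPF_PROG_TYPE_CGROUP_SKB and BPF_PROG_TYPE_CGROUP_SOCK) that are installed with the new BPF_PROG_ATTACH command and executed through __cgroup_bpf_run_filter_skb()/__cgroup_bpf_run_filter_sk(). A minimal attach sketch follows — the cgroup path is only an example, and prog_fd is assumed to come from an earlier BPF_PROG_LOAD of a BPF_PROG_TYPE_CGROUP_SKB program:

/* Minimal sketch: attach an already-loaded cgroup-skb program to a cgroup
 * for ingress filtering, using the BPF_PROG_ATTACH command added below.
 */
#include <linux/bpf.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>

static int attach_cgroup_ingress(int prog_fd, const char *cgroup_path)
{
	union bpf_attr attr;
	int cgroup_fd, err;

	cgroup_fd = open(cgroup_path, O_RDONLY | O_DIRECTORY);
	if (cgroup_fd < 0)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd     = cgroup_fd;		/* cgroup to attach to */
	attr.attach_bpf_fd = prog_fd;		/* fd from BPF_PROG_LOAD */
	attr.attach_type   = BPF_CGROUP_INET_INGRESS;

	err = syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
	close(cgroup_fd);
	return err;
}

Attaching requires CAP_NET_ADMIN, and a program whose type does not match the requested attach_type is rejected by bpf_prog_get_type(); the effective program is then propagated to descendant cgroups by __cgroup_bpf_update() unless a descendant has its own program attached.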
| diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index eed911d091da..1276474ac3cd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,8 @@  obj-y := core.o  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o  ifeq ($(CONFIG_PERF_EVENTS),y)  obj-$(CONFIG_BPF_SYSCALL) += stackmap.o  endif +obj-$(CONFIG_CGROUP_BPF) += cgroup.o diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c new file mode 100644 index 000000000000..89b7ef41c86b --- /dev/null +++ b/kernel/bpf/bpf_lru_list.c @@ -0,0 +1,695 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/cpumask.h> +#include <linux/spinlock.h> +#include <linux/percpu.h> + +#include "bpf_lru_list.h" + +#define LOCAL_FREE_TARGET		(128) +#define LOCAL_NR_SCANS			LOCAL_FREE_TARGET + +#define PERCPU_FREE_TARGET		(16) +#define PERCPU_NR_SCANS			PERCPU_FREE_TARGET + +/* Helpers to get the local list index */ +#define LOCAL_LIST_IDX(t)	((t) - BPF_LOCAL_LIST_T_OFFSET) +#define LOCAL_FREE_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) +#define LOCAL_PENDING_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) +#define IS_LOCAL_LIST_TYPE(t)	((t) >= BPF_LOCAL_LIST_T_OFFSET) + +static int get_next_cpu(int cpu) +{ +	cpu = cpumask_next(cpu, cpu_possible_mask); +	if (cpu >= nr_cpu_ids) +		cpu = cpumask_first(cpu_possible_mask); +	return cpu; +} + +/* Local list helpers */ +static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) +{ +	return &loc_l->lists[LOCAL_FREE_LIST_IDX]; +} + +static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) +{ +	return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; +} + +/* bpf_lru_node helpers */ +static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) +{ +	return node->ref; +} + +static void bpf_lru_list_count_inc(struct bpf_lru_list *l, +				   enum bpf_lru_list_type type) +{ +	if (type < NR_BPF_LRU_LIST_COUNT) +		l->counts[type]++; +} + +static void bpf_lru_list_count_dec(struct bpf_lru_list *l, +				   enum bpf_lru_list_type type) +{ +	if (type < NR_BPF_LRU_LIST_COUNT) +		l->counts[type]--; +} + +static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, +					struct bpf_lru_node *node, +					struct list_head *free_list, +					enum bpf_lru_list_type tgt_free_type) +{ +	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) +		return; + +	/* If the removing node is the next_inactive_rotation candidate, +	 * move the next_inactive_rotation pointer also. 
+	 */ +	if (&node->list == l->next_inactive_rotation) +		l->next_inactive_rotation = l->next_inactive_rotation->prev; + +	bpf_lru_list_count_dec(l, node->type); + +	node->type = tgt_free_type; +	list_move(&node->list, free_list); +} + +/* Move nodes from local list to the LRU list */ +static void __bpf_lru_node_move_in(struct bpf_lru_list *l, +				   struct bpf_lru_node *node, +				   enum bpf_lru_list_type tgt_type) +{ +	if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) || +	    WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) +		return; + +	bpf_lru_list_count_inc(l, tgt_type); +	node->type = tgt_type; +	node->ref = 0; +	list_move(&node->list, &l->lists[tgt_type]); +} + +/* Move nodes between or within active and inactive list (like + * active to inactive, inactive to active or tail of active back to + * the head of active). + */ +static void __bpf_lru_node_move(struct bpf_lru_list *l, +				struct bpf_lru_node *node, +				enum bpf_lru_list_type tgt_type) +{ +	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) || +	    WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) +		return; + +	if (node->type != tgt_type) { +		bpf_lru_list_count_dec(l, node->type); +		bpf_lru_list_count_inc(l, tgt_type); +		node->type = tgt_type; +	} +	node->ref = 0; + +	/* If the moving node is the next_inactive_rotation candidate, +	 * move the next_inactive_rotation pointer also. +	 */ +	if (&node->list == l->next_inactive_rotation) +		l->next_inactive_rotation = l->next_inactive_rotation->prev; + +	list_move(&node->list, &l->lists[tgt_type]); +} + +static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) +{ +	return l->counts[BPF_LRU_LIST_T_INACTIVE] < +		l->counts[BPF_LRU_LIST_T_ACTIVE]; +} + +/* Rotate the active list: + * 1. Start from tail + * 2. If the node has the ref bit set, it will be rotated + *    back to the head of active list with the ref bit cleared. + *    Give this node one more chance to survive in the active list. + * 3. If the ref bit is not set, move it to the head of the + *    inactive list. + * 4. It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, +					 struct bpf_lru_list *l) +{ +	struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; +	struct bpf_lru_node *node, *tmp_node, *first_node; +	unsigned int i = 0; + +	first_node = list_first_entry(active, struct bpf_lru_node, list); +	list_for_each_entry_safe_reverse(node, tmp_node, active, list) { +		if (bpf_lru_node_is_ref(node)) +			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); +		else +			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + +		if (++i == lru->nr_scans || node == first_node) +			break; +	} +} + +/* Rotate the inactive list.  It starts from the next_inactive_rotation + * 1. If the node has ref bit set, it will be moved to the head + *    of active list with the ref bit cleared. + * 2. If the node does not have ref bit set, it will leave it + *    at its current location (i.e. do nothing) so that it can + *    be considered during the next inactive_shrink. + * 3. 
It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, +					   struct bpf_lru_list *l) +{ +	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; +	struct list_head *cur, *last, *next = inactive; +	struct bpf_lru_node *node; +	unsigned int i = 0; + +	if (list_empty(inactive)) +		return; + +	last = l->next_inactive_rotation->next; +	if (last == inactive) +		last = last->next; + +	cur = l->next_inactive_rotation; +	while (i < lru->nr_scans) { +		if (cur == inactive) { +			cur = cur->prev; +			continue; +		} + +		node = list_entry(cur, struct bpf_lru_node, list); +		next = cur->prev; +		if (bpf_lru_node_is_ref(node)) +			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); +		if (cur == last) +			break; +		cur = next; +		i++; +	} + +	l->next_inactive_rotation = next; +} + +/* Shrink the inactive list.  It starts from the tail of the + * inactive list and only move the nodes without the ref bit + * set to the designated free list. + */ +static unsigned int +__bpf_lru_list_shrink_inactive(struct bpf_lru *lru, +			       struct bpf_lru_list *l, +			       unsigned int tgt_nshrink, +			       struct list_head *free_list, +			       enum bpf_lru_list_type tgt_free_type) +{ +	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; +	struct bpf_lru_node *node, *tmp_node, *first_node; +	unsigned int nshrinked = 0; +	unsigned int i = 0; + +	first_node = list_first_entry(inactive, struct bpf_lru_node, list); +	list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { +		if (bpf_lru_node_is_ref(node)) { +			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); +		} else if (lru->del_from_htab(lru->del_arg, node)) { +			__bpf_lru_node_move_to_free(l, node, free_list, +						    tgt_free_type); +			if (++nshrinked == tgt_nshrink) +				break; +		} + +		if (++i == lru->nr_scans) +			break; +	} + +	return nshrinked; +} + +/* 1. Rotate the active list (if needed) + * 2. Always rotate the inactive list + */ +static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) +{ +	if (bpf_lru_list_inactive_low(l)) +		__bpf_lru_list_rotate_active(lru, l); + +	__bpf_lru_list_rotate_inactive(lru, l); +} + +/* Calls __bpf_lru_list_shrink_inactive() to shrink some + * ref-bit-cleared nodes and move them to the designated + * free list. + * + * If it cannot get a free node after calling + * __bpf_lru_list_shrink_inactive().  It will just remove + * one node from either inactive or active list without + * honoring the ref-bit.  It prefers inactive list to active + * list in this situation. 
+ */ +static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, +					  struct bpf_lru_list *l, +					  unsigned int tgt_nshrink, +					  struct list_head *free_list, +					  enum bpf_lru_list_type tgt_free_type) + +{ +	struct bpf_lru_node *node, *tmp_node; +	struct list_head *force_shrink_list; +	unsigned int nshrinked; + +	nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, +						   free_list, tgt_free_type); +	if (nshrinked) +		return nshrinked; + +	/* Do a force shrink by ignoring the reference bit */ +	if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) +		force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; +	else +		force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + +	list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, +					 list) { +		if (lru->del_from_htab(lru->del_arg, node)) { +			__bpf_lru_node_move_to_free(l, node, free_list, +						    tgt_free_type); +			return 1; +		} +	} + +	return 0; +} + +/* Flush the nodes from the local pending list to the LRU list */ +static void __local_list_flush(struct bpf_lru_list *l, +			       struct bpf_lru_locallist *loc_l) +{ +	struct bpf_lru_node *node, *tmp_node; + +	list_for_each_entry_safe_reverse(node, tmp_node, +					 local_pending_list(loc_l), list) { +		if (bpf_lru_node_is_ref(node)) +			__bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); +		else +			__bpf_lru_node_move_in(l, node, +					       BPF_LRU_LIST_T_INACTIVE); +	} +} + +static void bpf_lru_list_push_free(struct bpf_lru_list *l, +				   struct bpf_lru_node *node) +{ +	unsigned long flags; + +	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) +		return; + +	raw_spin_lock_irqsave(&l->lock, flags); +	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); +	raw_spin_unlock_irqrestore(&l->lock, flags); +} + +static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, +					   struct bpf_lru_locallist *loc_l) +{ +	struct bpf_lru_list *l = &lru->common_lru.lru_list; +	struct bpf_lru_node *node, *tmp_node; +	unsigned int nfree = 0; + +	raw_spin_lock(&l->lock); + +	__local_list_flush(l, loc_l); + +	__bpf_lru_list_rotate(lru, l); + +	list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], +				 list) { +		__bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), +					    BPF_LRU_LOCAL_LIST_T_FREE); +		if (++nfree == LOCAL_FREE_TARGET) +			break; +	} + +	if (nfree < LOCAL_FREE_TARGET) +		__bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, +				      local_free_list(loc_l), +				      BPF_LRU_LOCAL_LIST_T_FREE); + +	raw_spin_unlock(&l->lock); +} + +static void __local_list_add_pending(struct bpf_lru *lru, +				     struct bpf_lru_locallist *loc_l, +				     int cpu, +				     struct bpf_lru_node *node, +				     u32 hash) +{ +	*(u32 *)((void *)node + lru->hash_offset) = hash; +	node->cpu = cpu; +	node->type = BPF_LRU_LOCAL_LIST_T_PENDING; +	node->ref = 0; +	list_add(&node->list, local_pending_list(loc_l)); +} + +struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +{ +	struct bpf_lru_node *node; + +	node = list_first_entry_or_null(local_free_list(loc_l), +					struct bpf_lru_node, +					list); +	if (node) +		list_del(&node->list); + +	return node; +} + +struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, +					      struct bpf_lru_locallist *loc_l) +{ +	struct bpf_lru_node *node; +	bool force = false; + +ignore_ref: +	/* Get from the tail (i.e. older element) of the pending list. 
*/ +	list_for_each_entry_reverse(node, local_pending_list(loc_l), +				    list) { +		if ((!bpf_lru_node_is_ref(node) || force) && +		    lru->del_from_htab(lru->del_arg, node)) { +			list_del(&node->list); +			return node; +		} +	} + +	if (!force) { +		force = true; +		goto ignore_ref; +	} + +	return NULL; +} + +static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, +						    u32 hash) +{ +	struct list_head *free_list; +	struct bpf_lru_node *node = NULL; +	struct bpf_lru_list *l; +	unsigned long flags; +	int cpu = raw_smp_processor_id(); + +	l = per_cpu_ptr(lru->percpu_lru, cpu); + +	raw_spin_lock_irqsave(&l->lock, flags); + +	__bpf_lru_list_rotate(lru, l); + +	free_list = &l->lists[BPF_LRU_LIST_T_FREE]; +	if (list_empty(free_list)) +		__bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, +				      BPF_LRU_LIST_T_FREE); + +	if (!list_empty(free_list)) { +		node = list_first_entry(free_list, struct bpf_lru_node, list); +		*(u32 *)((void *)node + lru->hash_offset) = hash; +		node->ref = 0; +		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); +	} + +	raw_spin_unlock_irqrestore(&l->lock, flags); + +	return node; +} + +static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, +						    u32 hash) +{ +	struct bpf_lru_locallist *loc_l, *steal_loc_l; +	struct bpf_common_lru *clru = &lru->common_lru; +	struct bpf_lru_node *node; +	int steal, first_steal; +	unsigned long flags; +	int cpu = raw_smp_processor_id(); + +	loc_l = per_cpu_ptr(clru->local_list, cpu); + +	raw_spin_lock_irqsave(&loc_l->lock, flags); + +	node = __local_list_pop_free(loc_l); +	if (!node) { +		bpf_lru_list_pop_free_to_local(lru, loc_l); +		node = __local_list_pop_free(loc_l); +	} + +	if (node) +		__local_list_add_pending(lru, loc_l, cpu, node, hash); + +	raw_spin_unlock_irqrestore(&loc_l->lock, flags); + +	if (node) +		return node; + +	/* No free nodes found from the local free list and +	 * the global LRU list. +	 * +	 * Steal from the local free/pending list of the +	 * current CPU and remote CPU in RR.  It starts +	 * with the loc_l->next_steal CPU. 
+	 */ + +	first_steal = loc_l->next_steal; +	steal = first_steal; +	do { +		steal_loc_l = per_cpu_ptr(clru->local_list, steal); + +		raw_spin_lock_irqsave(&steal_loc_l->lock, flags); + +		node = __local_list_pop_free(steal_loc_l); +		if (!node) +			node = __local_list_pop_pending(lru, steal_loc_l); + +		raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + +		steal = get_next_cpu(steal); +	} while (!node && steal != first_steal); + +	loc_l->next_steal = steal; + +	if (node) { +		raw_spin_lock_irqsave(&loc_l->lock, flags); +		__local_list_add_pending(lru, loc_l, cpu, node, hash); +		raw_spin_unlock_irqrestore(&loc_l->lock, flags); +	} + +	return node; +} + +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +{ +	if (lru->percpu) +		return bpf_percpu_lru_pop_free(lru, hash); +	else +		return bpf_common_lru_pop_free(lru, hash); +} + +static void bpf_common_lru_push_free(struct bpf_lru *lru, +				     struct bpf_lru_node *node) +{ +	unsigned long flags; + +	if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) || +	    WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE)) +		return; + +	if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) { +		struct bpf_lru_locallist *loc_l; + +		loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); + +		raw_spin_lock_irqsave(&loc_l->lock, flags); + +		if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { +			raw_spin_unlock_irqrestore(&loc_l->lock, flags); +			goto check_lru_list; +		} + +		node->type = BPF_LRU_LOCAL_LIST_T_FREE; +		node->ref = 0; +		list_move(&node->list, local_free_list(loc_l)); + +		raw_spin_unlock_irqrestore(&loc_l->lock, flags); +		return; +	} + +check_lru_list: +	bpf_lru_list_push_free(&lru->common_lru.lru_list, node); +} + +static void bpf_percpu_lru_push_free(struct bpf_lru *lru, +				     struct bpf_lru_node *node) +{ +	struct bpf_lru_list *l; +	unsigned long flags; + +	l = per_cpu_ptr(lru->percpu_lru, node->cpu); + +	raw_spin_lock_irqsave(&l->lock, flags); + +	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + +	raw_spin_unlock_irqrestore(&l->lock, flags); +} + +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) +{ +	if (lru->percpu) +		bpf_percpu_lru_push_free(lru, node); +	else +		bpf_common_lru_push_free(lru, node); +} + +void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, +			     u32 elem_size, u32 nr_elems) +{ +	struct bpf_lru_list *l = &lru->common_lru.lru_list; +	u32 i; + +	for (i = 0; i < nr_elems; i++) { +		struct bpf_lru_node *node; + +		node = (struct bpf_lru_node *)(buf + node_offset); +		node->type = BPF_LRU_LIST_T_FREE; +		node->ref = 0; +		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); +		buf += elem_size; +	} +} + +void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, +			     u32 elem_size, u32 nr_elems) +{ +	u32 i, pcpu_entries; +	int cpu; +	struct bpf_lru_list *l; + +	pcpu_entries = nr_elems / num_possible_cpus(); + +	i = 0; + +	for_each_possible_cpu(cpu) { +		struct bpf_lru_node *node; + +		l = per_cpu_ptr(lru->percpu_lru, cpu); +again: +		node = (struct bpf_lru_node *)(buf + node_offset); +		node->cpu = cpu; +		node->type = BPF_LRU_LIST_T_FREE; +		node->ref = 0; +		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); +		i++; +		buf += elem_size; +		if (i == nr_elems) +			break; +		if (i % pcpu_entries) +			goto again; +	} +} + +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, +		      u32 elem_size, u32 nr_elems) +{ +	if (lru->percpu) +		bpf_percpu_lru_populate(lru, buf, 
node_offset, elem_size, +					nr_elems); +	else +		bpf_common_lru_populate(lru, buf, node_offset, elem_size, +					nr_elems); +} + +static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) +{ +	int i; + +	for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) +		INIT_LIST_HEAD(&loc_l->lists[i]); + +	loc_l->next_steal = cpu; + +	raw_spin_lock_init(&loc_l->lock); +} + +static void bpf_lru_list_init(struct bpf_lru_list *l) +{ +	int i; + +	for (i = 0; i < NR_BPF_LRU_LIST_T; i++) +		INIT_LIST_HEAD(&l->lists[i]); + +	for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) +		l->counts[i] = 0; + +	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + +	raw_spin_lock_init(&l->lock); +} + +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, +		 del_from_htab_func del_from_htab, void *del_arg) +{ +	int cpu; + +	if (percpu) { +		lru->percpu_lru = alloc_percpu(struct bpf_lru_list); +		if (!lru->percpu_lru) +			return -ENOMEM; + +		for_each_possible_cpu(cpu) { +			struct bpf_lru_list *l; + +			l = per_cpu_ptr(lru->percpu_lru, cpu); +			bpf_lru_list_init(l); +		} +		lru->nr_scans = PERCPU_NR_SCANS; +	} else { +		struct bpf_common_lru *clru = &lru->common_lru; + +		clru->local_list = alloc_percpu(struct bpf_lru_locallist); +		if (!clru->local_list) +			return -ENOMEM; + +		for_each_possible_cpu(cpu) { +			struct bpf_lru_locallist *loc_l; + +			loc_l = per_cpu_ptr(clru->local_list, cpu); +			bpf_lru_locallist_init(loc_l, cpu); +		} + +		bpf_lru_list_init(&clru->lru_list); +		lru->nr_scans = LOCAL_NR_SCANS; +	} + +	lru->percpu = percpu; +	lru->del_from_htab = del_from_htab; +	lru->del_arg = del_arg; +	lru->hash_offset = hash_offset; + +	return 0; +} + +void bpf_lru_destroy(struct bpf_lru *lru) +{ +	if (lru->percpu) +		free_percpu(lru->percpu_lru); +	else +		free_percpu(lru->common_lru.local_list); +} diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h new file mode 100644 index 000000000000..5c35a98d02bf --- /dev/null +++ b/kernel/bpf/bpf_lru_list.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#ifndef __BPF_LRU_LIST_H_ +#define __BPF_LRU_LIST_H_ + +#include <linux/list.h> +#include <linux/spinlock_types.h> + +#define NR_BPF_LRU_LIST_T	(3) +#define NR_BPF_LRU_LIST_COUNT	(2) +#define NR_BPF_LRU_LOCAL_LIST_T (2) +#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T + +enum bpf_lru_list_type { +	BPF_LRU_LIST_T_ACTIVE, +	BPF_LRU_LIST_T_INACTIVE, +	BPF_LRU_LIST_T_FREE, +	BPF_LRU_LOCAL_LIST_T_FREE, +	BPF_LRU_LOCAL_LIST_T_PENDING, +}; + +struct bpf_lru_node { +	struct list_head list; +	u16 cpu; +	u8 type; +	u8 ref; +}; + +struct bpf_lru_list { +	struct list_head lists[NR_BPF_LRU_LIST_T]; +	unsigned int counts[NR_BPF_LRU_LIST_COUNT]; +	/* The next inacitve list rotation starts from here */ +	struct list_head *next_inactive_rotation; + +	raw_spinlock_t lock ____cacheline_aligned_in_smp; +}; + +struct bpf_lru_locallist { +	struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; +	u16 next_steal; +	raw_spinlock_t lock; +}; + +struct bpf_common_lru { +	struct bpf_lru_list lru_list; +	struct bpf_lru_locallist __percpu *local_list; +}; + +typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node); + +struct bpf_lru { +	union { +		struct bpf_common_lru common_lru; +		struct bpf_lru_list __percpu *percpu_lru; +	}; +	del_from_htab_func del_from_htab; +	void *del_arg; +	unsigned int hash_offset; +	unsigned int nr_scans; +	bool percpu; +}; + +static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) +{ +	/* ref is an approximation on access frequency.  It does not +	 * have to be very accurate.  Hence, no protection is used. +	 */ +	node->ref = 1; +} + +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, +		 del_from_htab_func del_from_htab, void *delete_arg); +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, +		      u32 elem_size, u32 nr_elems); +void bpf_lru_destroy(struct bpf_lru *lru); +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); +void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node); + +#endif diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c new file mode 100644 index 000000000000..a515f7b007c6 --- /dev/null +++ b/kernel/bpf/cgroup.c @@ -0,0 +1,200 @@ +/* + * Functions to manage eBPF programs attached to cgroups + * + * Copyright (c) 2016 Daniel Mack + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License.  See the file COPYING in the main directory of the + * Linux distribution for more details. 
+ */ + +#include <linux/kernel.h> +#include <linux/atomic.h> +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/bpf-cgroup.h> +#include <net/sock.h> + +DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +EXPORT_SYMBOL(cgroup_bpf_enabled_key); + +/** + * cgroup_bpf_put() - put references of all bpf programs + * @cgrp: the cgroup to modify + */ +void cgroup_bpf_put(struct cgroup *cgrp) +{ +	unsigned int type; + +	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { +		struct bpf_prog *prog = cgrp->bpf.prog[type]; + +		if (prog) { +			bpf_prog_put(prog); +			static_branch_dec(&cgroup_bpf_enabled_key); +		} +	} +} + +/** + * cgroup_bpf_inherit() - inherit effective programs from parent + * @cgrp: the cgroup to modify + * @parent: the parent to inherit from + */ +void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +{ +	unsigned int type; + +	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { +		struct bpf_prog *e; + +		e = rcu_dereference_protected(parent->bpf.effective[type], +					      lockdep_is_held(&cgroup_mutex)); +		rcu_assign_pointer(cgrp->bpf.effective[type], e); +	} +} + +/** + * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + *                         propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @parent: The parent of @cgrp, or %NULL if @cgrp is the root + * @prog: A new program to pin + * @type: Type of pinning operation (ingress/egress) + * + * Each cgroup has a set of two pointers for bpf programs; one for eBPF + * programs it owns, and which is effective for execution. + * + * If @prog is not %NULL, this function attaches a new program to the cgroup + * and releases the one that is currently attached, if any. @prog is then made + * the effective program of type @type in that cgroup. + * + * If @prog is %NULL, the currently attached program of type @type is released, + * and the effective program of the parent cgroup (if any) is inherited to + * @cgrp. + * + * Then, the descendants of @cgrp are walked and the effective program for + * each of them is set to the effective program of @cgrp unless the + * descendant has its own program attached, in which case the subbranch is + * skipped. This ensures that delegated subcgroups with own programs are left + * untouched. + * + * Must be called with cgroup_mutex held. + */ +void __cgroup_bpf_update(struct cgroup *cgrp, +			 struct cgroup *parent, +			 struct bpf_prog *prog, +			 enum bpf_attach_type type) +{ +	struct bpf_prog *old_prog, *effective; +	struct cgroup_subsys_state *pos; + +	old_prog = xchg(cgrp->bpf.prog + type, prog); + +	effective = (!prog && parent) ? 
+		rcu_dereference_protected(parent->bpf.effective[type], +					  lockdep_is_held(&cgroup_mutex)) : +		prog; + +	css_for_each_descendant_pre(pos, &cgrp->self) { +		struct cgroup *desc = container_of(pos, struct cgroup, self); + +		/* skip the subtree if the descendant has its own program */ +		if (desc->bpf.prog[type] && desc != cgrp) +			pos = css_rightmost_descendant(pos); +		else +			rcu_assign_pointer(desc->bpf.effective[type], +					   effective); +	} + +	if (prog) +		static_branch_inc(&cgroup_bpf_enabled_key); + +	if (old_prog) { +		bpf_prog_put(old_prog); +		static_branch_dec(&cgroup_bpf_enabled_key); +	} +} + +/** + * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering + * @sk: The socken sending or receiving traffic + * @skb: The skb that is being sent or received + * @type: The type of program to be exectuted + * + * If no socket is passed, or the socket is not of type INET or INET6, + * this function does nothing and returns 0. + * + * The program type passed in via @type must be suitable for network + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_skb(struct sock *sk, +				struct sk_buff *skb, +				enum bpf_attach_type type) +{ +	struct bpf_prog *prog; +	struct cgroup *cgrp; +	int ret = 0; + +	if (!sk || !sk_fullsock(sk)) +		return 0; + +	if (sk->sk_family != AF_INET && +	    sk->sk_family != AF_INET6) +		return 0; + +	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + +	rcu_read_lock(); + +	prog = rcu_dereference(cgrp->bpf.effective[type]); +	if (prog) { +		unsigned int offset = skb->data - skb_network_header(skb); + +		__skb_push(skb, offset); +		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; +		__skb_pull(skb, offset); +	} + +	rcu_read_unlock(); + +	return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); + +/** + * __cgroup_bpf_run_filter_sk() - Run a program on a sock + * @sk: sock structure to manipulate + * @type: The type of program to be exectuted + * + * socket is passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sk(struct sock *sk, +			       enum bpf_attach_type type) +{ +	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); +	struct bpf_prog *prog; +	int ret = 0; + + +	rcu_read_lock(); + +	prog = rcu_dereference(cgrp->bpf.effective[type]); +	if (prog) +		ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; + +	rcu_read_unlock(); + +	return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index aa6d98154106..83e0d153b0b4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,71 @@ void __bpf_prog_free(struct bpf_prog *fp)  	vfree(fp);  } +#define SHA_BPF_RAW_SIZE						\ +	round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES) + +/* Called under verifier mutex. 
*/ +void bpf_prog_calc_digest(struct bpf_prog *fp) +{ +	const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); +	static u32 ws[SHA_WORKSPACE_WORDS]; +	static u8 raw[SHA_BPF_RAW_SIZE]; +	struct bpf_insn *dst = (void *)raw; +	u32 i, bsize, psize, blocks; +	bool was_ld_map; +	u8 *todo = raw; +	__be32 *result; +	__be64 *bits; + +	sha_init(fp->digest); +	memset(ws, 0, sizeof(ws)); + +	/* We need to take out the map fd for the digest calculation +	 * since they are unstable from user space side. +	 */ +	for (i = 0, was_ld_map = false; i < fp->len; i++) { +		dst[i] = fp->insnsi[i]; +		if (!was_ld_map && +		    dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && +		    dst[i].src_reg == BPF_PSEUDO_MAP_FD) { +			was_ld_map = true; +			dst[i].imm = 0; +		} else if (was_ld_map && +			   dst[i].code == 0 && +			   dst[i].dst_reg == 0 && +			   dst[i].src_reg == 0 && +			   dst[i].off == 0) { +			was_ld_map = false; +			dst[i].imm = 0; +		} else { +			was_ld_map = false; +		} +	} + +	psize = fp->len * sizeof(struct bpf_insn); +	memset(&raw[psize], 0, sizeof(raw) - psize); +	raw[psize++] = 0x80; + +	bsize  = round_up(psize, SHA_MESSAGE_BYTES); +	blocks = bsize / SHA_MESSAGE_BYTES; +	if (bsize - psize >= sizeof(__be64)) { +		bits = (__be64 *)(todo + bsize - sizeof(__be64)); +	} else { +		bits = (__be64 *)(todo + bsize + bits_offset); +		blocks++; +	} +	*bits = cpu_to_be64((psize - 1) << 3); + +	while (blocks--) { +		sha_transform(fp->digest, todo, ws); +		todo += SHA_MESSAGE_BYTES; +	} + +	result = (__force __be32 *)fp->digest; +	for (i = 0; i < SHA_DIGEST_WORDS; i++) +		result[i] = cpu_to_be32(fp->digest[i]); +} +  static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)  {  	return BPF_CLASS(insn->code) == BPF_JMP  && @@ -1043,6 +1108,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;  const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;  const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; +const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;  const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;  const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; @@ -1077,7 +1143,7 @@ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)  	return prog;  } -bool __weak bpf_helper_changes_skb_data(void *func) +bool __weak bpf_helper_changes_pkt_data(void *func)  {  	return false;  } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 570eeca7bdfa..34debc1a9641 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -15,6 +15,7 @@  #include <linux/filter.h>  #include <linux/vmalloc.h>  #include "percpu_freelist.h" +#include "bpf_lru_list.h"  struct bucket {  	struct hlist_head head; @@ -25,7 +26,10 @@ struct bpf_htab {  	struct bpf_map map;  	struct bucket *buckets;  	void *elems; -	struct pcpu_freelist freelist; +	union { +		struct pcpu_freelist freelist; +		struct bpf_lru lru; +	};  	void __percpu *extra_elems;  	atomic_t count;	/* number of elements in this hashtable */  	u32 n_buckets;	/* number of hash buckets */ @@ -48,11 +52,26 @@ struct htab_elem {  	union {  		struct rcu_head rcu;  		enum extra_elem_state state; +		struct bpf_lru_node lru_node;  	};  	u32 hash;  	char key[0] __aligned(8);  }; +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); + +static bool htab_is_lru(const struct bpf_htab *htab) +{ +	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || +		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} + +static bool htab_is_percpu(const struct bpf_htab *htab) +{ +	
return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || +		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} +  static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,  				     void __percpu *pptr)  { @@ -73,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab)  {  	int i; -	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) +	if (!htab_is_percpu(htab))  		goto free_elems;  	for (i = 0; i < htab->map.max_entries; i++) { @@ -87,7 +106,22 @@ free_elems:  	vfree(htab->elems);  } -static int prealloc_elems_and_freelist(struct bpf_htab *htab) +static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, +					  u32 hash) +{ +	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); +	struct htab_elem *l; + +	if (node) { +		l = container_of(node, struct htab_elem, lru_node); +		memcpy(l->key, key, htab->map.key_size); +		return l; +	} + +	return NULL; +} + +static int prealloc_init(struct bpf_htab *htab)  {  	int err = -ENOMEM, i; @@ -95,7 +129,7 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)  	if (!htab->elems)  		return -ENOMEM; -	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) +	if (!htab_is_percpu(htab))  		goto skip_percpu_elems;  	for (i = 0; i < htab->map.max_entries; i++) { @@ -110,12 +144,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)  	}  skip_percpu_elems: -	err = pcpu_freelist_init(&htab->freelist); +	if (htab_is_lru(htab)) +		err = bpf_lru_init(&htab->lru, +				   htab->map.map_flags & BPF_F_NO_COMMON_LRU, +				   offsetof(struct htab_elem, hash) - +				   offsetof(struct htab_elem, lru_node), +				   htab_lru_map_delete_node, +				   htab); +	else +		err = pcpu_freelist_init(&htab->freelist); +  	if (err)  		goto free_elems; -	pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, -			       htab->map.max_entries); +	if (htab_is_lru(htab)) +		bpf_lru_populate(&htab->lru, htab->elems, +				 offsetof(struct htab_elem, lru_node), +				 htab->elem_size, htab->map.max_entries); +	else +		pcpu_freelist_populate(&htab->freelist, htab->elems, +				       htab->elem_size, htab->map.max_entries); +  	return 0;  free_elems: @@ -123,6 +172,16 @@ free_elems:  	return err;  } +static void prealloc_destroy(struct bpf_htab *htab) +{ +	htab_free_elems(htab); + +	if (htab_is_lru(htab)) +		bpf_lru_destroy(&htab->lru); +	else +		pcpu_freelist_destroy(&htab->freelist); +} +  static int alloc_extra_elems(struct bpf_htab *htab)  {  	void __percpu *pptr; @@ -143,15 +202,37 @@ static int alloc_extra_elems(struct bpf_htab *htab)  /* Called from syscall */  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)  { -	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; +	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || +		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); +	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || +		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); +	/* percpu_lru means each cpu has its own LRU list. +	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where +	 * the map's value itself is percpu.  percpu_lru has +	 * nothing to do with the map's value. +	 */ +	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); +	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);  	struct bpf_htab *htab;  	int err, i;  	u64 cost; -	if (attr->map_flags & ~BPF_F_NO_PREALLOC) +	if (lru && !capable(CAP_SYS_ADMIN)) +		/* LRU implementation is much complicated than other +		 * maps.  Hence, limit to CAP_SYS_ADMIN for now. 
+		 */ +		return ERR_PTR(-EPERM); + +	if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))  		/* reserved bits should not be used */  		return ERR_PTR(-EINVAL); +	if (!lru && percpu_lru) +		return ERR_PTR(-EINVAL); + +	if (lru && !prealloc) +		return ERR_PTR(-ENOTSUPP); +  	htab = kzalloc(sizeof(*htab), GFP_USER);  	if (!htab)  		return ERR_PTR(-ENOMEM); @@ -171,6 +252,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)  	    htab->map.value_size == 0)  		goto free_htab; +	if (percpu_lru) { +		/* ensure each CPU's lru list has >=1 elements. +		 * since we are at it, make each lru list has the same +		 * number of elements. +		 */ +		htab->map.max_entries = roundup(attr->max_entries, +						num_possible_cpus()); +		if (htab->map.max_entries < attr->max_entries) +			htab->map.max_entries = rounddown(attr->max_entries, +							  num_possible_cpus()); +	} +  	/* hash table size must be power of 2 */  	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); @@ -241,14 +334,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)  		raw_spin_lock_init(&htab->buckets[i].lock);  	} -	if (!percpu) { +	if (!percpu && !lru) { +		/* lru itself can remove the least used element, so +		 * there is no need for an extra elem during map_update. +		 */  		err = alloc_extra_elems(htab);  		if (err)  			goto free_buckets;  	} -	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { -		err = prealloc_elems_and_freelist(htab); +	if (prealloc) { +		err = prealloc_init(htab);  		if (err)  			goto free_extra_elems;  	} @@ -323,6 +419,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)  	return NULL;  } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ +	struct htab_elem *l = __htab_map_lookup_elem(map, key); + +	if (l) { +		bpf_lru_node_set_ref(&l->lru_node); +		return l->key + round_up(map->key_size, 8); +	} + +	return NULL; +} + +/* It is called from the bpf_lru_list when the LRU needs to delete + * older elements from the htab. 
+ */ +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) +{ +	struct bpf_htab *htab = (struct bpf_htab *)arg; +	struct htab_elem *l, *tgt_l; +	struct hlist_head *head; +	unsigned long flags; +	struct bucket *b; + +	tgt_l = container_of(node, struct htab_elem, lru_node); +	b = __select_bucket(htab, tgt_l->hash); +	head = &b->head; + +	raw_spin_lock_irqsave(&b->lock, flags); + +	hlist_for_each_entry_rcu(l, head, hash_node) +		if (l == tgt_l) { +			hlist_del_rcu(&l->hash_node); +			break; +		} + +	raw_spin_unlock_irqrestore(&b->lock, flags); + +	return l == tgt_l; +} +  /* Called from syscall */  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)  { @@ -420,6 +556,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)  	}  } +static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, +			    void *value, bool onallcpus) +{ +	if (!onallcpus) { +		/* copy true value_size bytes */ +		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); +	} else { +		u32 size = round_up(htab->map.value_size, 8); +		int off = 0, cpu; + +		for_each_possible_cpu(cpu) { +			bpf_long_memcpy(per_cpu_ptr(pptr, cpu), +					value + off, size); +			off += size; +		} +	} +} +  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,  					 void *value, u32 key_size, u32 hash,  					 bool percpu, bool onallcpus, @@ -479,18 +633,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,  			}  		} -		if (!onallcpus) { -			/* copy true value_size bytes */ -			memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); -		} else { -			int off = 0, cpu; +		pcpu_copy_value(htab, pptr, value, onallcpus); -			for_each_possible_cpu(cpu) { -				bpf_long_memcpy(per_cpu_ptr(pptr, cpu), -						value + off, size); -				off += size; -			} -		}  		if (!prealloc)  			htab_elem_set_ptr(l_new, key_size, pptr);  	} else { @@ -571,6 +715,70 @@ err:  	return ret;  } +static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, +				    u64 map_flags) +{ +	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); +	struct htab_elem *l_new, *l_old = NULL; +	struct hlist_head *head; +	unsigned long flags; +	struct bucket *b; +	u32 key_size, hash; +	int ret; + +	if (unlikely(map_flags > BPF_EXIST)) +		/* unknown flags */ +		return -EINVAL; + +	WARN_ON_ONCE(!rcu_read_lock_held()); + +	key_size = map->key_size; + +	hash = htab_map_hash(key, key_size); + +	b = __select_bucket(htab, hash); +	head = &b->head; + +	/* For LRU, we need to alloc before taking bucket's +	 * spinlock because getting free nodes from LRU may need +	 * to remove older elements from htab and this removal +	 * operation will need a bucket lock. 
+	 */ +	l_new = prealloc_lru_pop(htab, key, hash); +	if (!l_new) +		return -ENOMEM; +	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); + +	/* bpf_map_update_elem() can be called in_irq() */ +	raw_spin_lock_irqsave(&b->lock, flags); + +	l_old = lookup_elem_raw(head, hash, key, key_size); + +	ret = check_flags(htab, l_old, map_flags); +	if (ret) +		goto err; + +	/* add new element to the head of the list, so that +	 * concurrent search will find it before old elem +	 */ +	hlist_add_head_rcu(&l_new->hash_node, head); +	if (l_old) { +		bpf_lru_node_set_ref(&l_new->lru_node); +		hlist_del_rcu(&l_old->hash_node); +	} +	ret = 0; + +err: +	raw_spin_unlock_irqrestore(&b->lock, flags); + +	if (ret) +		bpf_lru_push_free(&htab->lru, &l_new->lru_node); +	else if (l_old) +		bpf_lru_push_free(&htab->lru, &l_old->lru_node); + +	return ret; +} +  static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,  					 void *value, u64 map_flags,  					 bool onallcpus) @@ -606,22 +814,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,  		goto err;  	if (l_old) { -		void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); -		u32 size = htab->map.value_size; -  		/* per-cpu hash map can update value in-place */ -		if (!onallcpus) { -			memcpy(this_cpu_ptr(pptr), value, size); -		} else { -			int off = 0, cpu; - -			size = round_up(size, 8); -			for_each_possible_cpu(cpu) { -				bpf_long_memcpy(per_cpu_ptr(pptr, cpu), -						value + off, size); -				off += size; -			} -		} +		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), +				value, onallcpus);  	} else {  		l_new = alloc_htab_elem(htab, key, value, key_size,  					hash, true, onallcpus, false); @@ -637,12 +832,84 @@ err:  	return ret;  } +static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, +					     void *value, u64 map_flags, +					     bool onallcpus) +{ +	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); +	struct htab_elem *l_new = NULL, *l_old; +	struct hlist_head *head; +	unsigned long flags; +	struct bucket *b; +	u32 key_size, hash; +	int ret; + +	if (unlikely(map_flags > BPF_EXIST)) +		/* unknown flags */ +		return -EINVAL; + +	WARN_ON_ONCE(!rcu_read_lock_held()); + +	key_size = map->key_size; + +	hash = htab_map_hash(key, key_size); + +	b = __select_bucket(htab, hash); +	head = &b->head; + +	/* For LRU, we need to alloc before taking bucket's +	 * spinlock because LRU's elem alloc may need +	 * to remove older elem from htab and this removal +	 * operation will need a bucket lock. 
+	 */ +	if (map_flags != BPF_EXIST) { +		l_new = prealloc_lru_pop(htab, key, hash); +		if (!l_new) +			return -ENOMEM; +	} + +	/* bpf_map_update_elem() can be called in_irq() */ +	raw_spin_lock_irqsave(&b->lock, flags); + +	l_old = lookup_elem_raw(head, hash, key, key_size); + +	ret = check_flags(htab, l_old, map_flags); +	if (ret) +		goto err; + +	if (l_old) { +		bpf_lru_node_set_ref(&l_old->lru_node); + +		/* per-cpu hash map can update value in-place */ +		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), +				value, onallcpus); +	} else { +		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), +				value, onallcpus); +		hlist_add_head_rcu(&l_new->hash_node, head); +		l_new = NULL; +	} +	ret = 0; +err: +	raw_spin_unlock_irqrestore(&b->lock, flags); +	if (l_new) +		bpf_lru_push_free(&htab->lru, &l_new->lru_node); +	return ret; +} +  static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,  				       void *value, u64 map_flags)  {  	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);  } +static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, +					   void *value, u64 map_flags) +{ +	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, +						 false); +} +  /* Called from syscall or from eBPF program */  static int htab_map_delete_elem(struct bpf_map *map, void *key)  { @@ -676,6 +943,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)  	return ret;  } +static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +{ +	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); +	struct hlist_head *head; +	struct bucket *b; +	struct htab_elem *l; +	unsigned long flags; +	u32 hash, key_size; +	int ret = -ENOENT; + +	WARN_ON_ONCE(!rcu_read_lock_held()); + +	key_size = map->key_size; + +	hash = htab_map_hash(key, key_size); +	b = __select_bucket(htab, hash); +	head = &b->head; + +	raw_spin_lock_irqsave(&b->lock, flags); + +	l = lookup_elem_raw(head, hash, key, key_size); + +	if (l) { +		hlist_del_rcu(&l->hash_node); +		ret = 0; +	} + +	raw_spin_unlock_irqrestore(&b->lock, flags); +	if (l) +		bpf_lru_push_free(&htab->lru, &l->lru_node); +	return ret; +} +  static void delete_all_elements(struct bpf_htab *htab)  {  	int i; @@ -687,7 +987,8 @@ static void delete_all_elements(struct bpf_htab *htab)  		hlist_for_each_entry_safe(l, n, head, hash_node) {  			hlist_del_rcu(&l->hash_node); -			htab_elem_free(htab, l); +			if (l->state != HTAB_EXTRA_ELEM_USED) +				htab_elem_free(htab, l);  		}  	}  } @@ -707,12 +1008,11 @@ static void htab_map_free(struct bpf_map *map)  	 * not have executed. Wait for them.  	 
*/  	rcu_barrier(); -	if (htab->map.map_flags & BPF_F_NO_PREALLOC) { +	if (htab->map.map_flags & BPF_F_NO_PREALLOC)  		delete_all_elements(htab); -	} else { -		htab_free_elems(htab); -		pcpu_freelist_destroy(&htab->freelist); -	} +	else +		prealloc_destroy(htab); +  	free_percpu(htab->extra_elems);  	kvfree(htab->buckets);  	kfree(htab); @@ -732,6 +1032,20 @@ static struct bpf_map_type_list htab_type __read_mostly = {  	.type = BPF_MAP_TYPE_HASH,  }; +static const struct bpf_map_ops htab_lru_ops = { +	.map_alloc = htab_map_alloc, +	.map_free = htab_map_free, +	.map_get_next_key = htab_map_get_next_key, +	.map_lookup_elem = htab_lru_map_lookup_elem, +	.map_update_elem = htab_lru_map_update_elem, +	.map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_type __read_mostly = { +	.ops = &htab_lru_ops, +	.type = BPF_MAP_TYPE_LRU_HASH, +}; +  /* Called from eBPF program */  static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)  { @@ -743,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)  		return NULL;  } +static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ +	struct htab_elem *l = __htab_map_lookup_elem(map, key); + +	if (l) { +		bpf_lru_node_set_ref(&l->lru_node); +		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); +	} + +	return NULL; +} +  int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)  { +	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct htab_elem *l;  	void __percpu *pptr;  	int ret = -ENOENT; @@ -760,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)  	l = __htab_map_lookup_elem(map, key);  	if (!l)  		goto out; +	if (htab_is_lru(htab)) +		bpf_lru_node_set_ref(&l->lru_node);  	pptr = htab_elem_get_ptr(l, map->key_size);  	for_each_possible_cpu(cpu) {  		bpf_long_memcpy(value + off, @@ -775,10 +1104,16 @@ out:  int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,  			   u64 map_flags)  { +	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	int ret;  	rcu_read_lock(); -	ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); +	if (htab_is_lru(htab)) +		ret = __htab_lru_percpu_map_update_elem(map, key, value, +							map_flags, true); +	else +		ret = __htab_percpu_map_update_elem(map, key, value, map_flags, +						    true);  	rcu_read_unlock();  	return ret; @@ -798,10 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = {  	.type = BPF_MAP_TYPE_PERCPU_HASH,  }; +static const struct bpf_map_ops htab_lru_percpu_ops = { +	.map_alloc = htab_map_alloc, +	.map_free = htab_map_free, +	.map_get_next_key = htab_map_get_next_key, +	.map_lookup_elem = htab_lru_percpu_map_lookup_elem, +	.map_update_elem = htab_lru_percpu_map_update_elem, +	.map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { +	.ops = &htab_lru_percpu_ops, +	.type = BPF_MAP_TYPE_LRU_PERCPU_HASH, +}; +  static int __init register_htab_map(void)  {  	bpf_register_map_type(&htab_type);  	bpf_register_map_type(&htab_percpu_type); +	bpf_register_map_type(&htab_lru_type); +	bpf_register_map_type(&htab_lru_percpu_type);  	return 0;  }  late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 39918402e6e9..045cbe673356 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -13,6 +13,7 @@  #include <linux/rcupdate.h>  #include <linux/random.h>  
#include <linux/smp.h> +#include <linux/topology.h>  #include <linux/ktime.h>  #include <linux/sched.h>  #include <linux/uidgid.h> @@ -92,6 +93,17 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {  	.ret_type	= RET_INTEGER,  }; +BPF_CALL_0(bpf_get_numa_node_id) +{ +	return numa_node_id(); +} + +const struct bpf_func_proto bpf_get_numa_node_id_proto = { +	.func		= bpf_get_numa_node_id, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +}; +  BPF_CALL_0(bpf_ktime_get_ns)  {  	/* NMI safe access to clock monotonic */ diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 1ed8473ec537..0b030c9126d3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -18,6 +18,7 @@  #include <linux/namei.h>  #include <linux/fs.h>  #include <linux/kdev_t.h> +#include <linux/parser.h>  #include <linux/filter.h>  #include <linux/bpf.h> @@ -87,6 +88,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,  	switch (mode & S_IFMT) {  	case S_IFDIR:  	case S_IFREG: +	case S_IFLNK:  		break;  	default:  		return ERR_PTR(-EINVAL); @@ -119,6 +121,16 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)  	return 0;  } +static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, +				struct inode *dir) +{ +	d_instantiate(dentry, inode); +	dget(dentry); + +	dir->i_mtime = current_time(dir); +	dir->i_ctime = dir->i_mtime; +} +  static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  {  	struct inode *inode; @@ -133,9 +145,7 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  	inc_nlink(inode);  	inc_nlink(dir); -	d_instantiate(dentry, inode); -	dget(dentry); - +	bpf_dentry_finalize(dentry, inode, dir);  	return 0;  } @@ -151,9 +161,7 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,  	inode->i_op = iops;  	inode->i_private = dentry->d_fsdata; -	d_instantiate(dentry, inode); -	dget(dentry); - +	bpf_dentry_finalize(dentry, inode, dir);  	return 0;  } @@ -181,13 +189,37 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)  {  	if (strchr(dentry->d_name.name, '.'))  		return ERR_PTR(-EPERM); +  	return simple_lookup(dir, dentry, flags);  } +static int bpf_symlink(struct inode *dir, struct dentry *dentry, +		       const char *target) +{ +	char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); +	struct inode *inode; + +	if (!link) +		return -ENOMEM; + +	inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); +	if (IS_ERR(inode)) { +		kfree(link); +		return PTR_ERR(inode); +	} + +	inode->i_op = &simple_symlink_inode_operations; +	inode->i_link = link; + +	bpf_dentry_finalize(dentry, inode, dir); +	return 0; +} +  static const struct inode_operations bpf_dir_iops = {  	.lookup		= bpf_lookup,  	.mknod		= bpf_mkobj,  	.mkdir		= bpf_mkdir, +	.symlink	= bpf_symlink,  	.rmdir		= simple_rmdir,  	.rename		= simple_rename,  	.link		= simple_link, @@ -324,6 +356,8 @@ static void bpf_evict_inode(struct inode *inode)  	truncate_inode_pages_final(&inode->i_data);  	clear_inode(inode); +	if (S_ISLNK(inode->i_mode)) +		kfree(inode->i_link);  	if (!bpf_inode_type(inode, &type))  		bpf_any_put(inode->i_private, type);  } @@ -331,15 +365,66 @@ static void bpf_evict_inode(struct inode *inode)  static const struct super_operations bpf_super_ops = {  	.statfs		= simple_statfs,  	.drop_inode	= generic_delete_inode, +	.show_options	= generic_show_options,  	.evict_inode	= bpf_evict_inode,  }; +enum { +	OPT_MODE, +	OPT_ERR, +}; + +static const match_table_t bpf_mount_tokens = { +	{ 
OPT_MODE, "mode=%o" }, +	{ OPT_ERR, NULL }, +}; + +struct bpf_mount_opts { +	umode_t mode; +}; + +static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +{ +	substring_t args[MAX_OPT_ARGS]; +	int option, token; +	char *ptr; + +	opts->mode = S_IRWXUGO; + +	while ((ptr = strsep(&data, ",")) != NULL) { +		if (!*ptr) +			continue; + +		token = match_token(ptr, bpf_mount_tokens, args); +		switch (token) { +		case OPT_MODE: +			if (match_octal(&args[0], &option)) +				return -EINVAL; +			opts->mode = option & S_IALLUGO; +			break; +		/* We might like to report bad mount options here, but +		 * traditionally we've ignored all mount options, so we'd +		 * better continue to ignore non-existing options for bpf. +		 */ +		} +	} + +	return 0; +} +  static int bpf_fill_super(struct super_block *sb, void *data, int silent)  {  	static struct tree_descr bpf_rfiles[] = { { "" } }; +	struct bpf_mount_opts opts;  	struct inode *inode;  	int ret; +	save_mount_options(sb, data); + +	ret = bpf_parse_options(data, &opts); +	if (ret) +		return ret; +  	ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);  	if (ret)  		return ret; @@ -349,7 +434,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)  	inode = sb->s_root->d_inode;  	inode->i_op = &bpf_dir_iops;  	inode->i_mode &= ~S_IALLUGO; -	inode->i_mode |= S_ISVTX | S_IRWXUGO; +	inode->i_mode |= S_ISVTX | opts.mode;  	return 0;  } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 228f962447a5..4819ec9d95f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -17,6 +17,7 @@  #include <linux/license.h>  #include <linux/filter.h>  #include <linux/version.h> +#include <linux/kernel.h>  DEFINE_PER_CPU(int, bpf_prog_active); @@ -137,18 +138,31 @@ static int bpf_map_release(struct inode *inode, struct file *filp)  static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)  {  	const struct bpf_map *map = filp->private_data; +	const struct bpf_array *array; +	u32 owner_prog_type = 0; + +	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { +		array = container_of(map, struct bpf_array, map); +		owner_prog_type = array->owner_prog_type; +	}  	seq_printf(m,  		   "map_type:\t%u\n"  		   "key_size:\t%u\n"  		   "value_size:\t%u\n"  		   "max_entries:\t%u\n" -		   "map_flags:\t%#x\n", +		   "map_flags:\t%#x\n" +		   "memlock:\t%llu\n",  		   map->map_type,  		   map->key_size,  		   map->value_size,  		   map->max_entries, -		   map->map_flags); +		   map->map_flags, +		   map->pages * 1ULL << PAGE_SHIFT); + +	if (owner_prog_type) +		seq_printf(m, "owner_prog_type:\t%u\n", +			   owner_prog_type);  }  #endif @@ -194,7 +208,7 @@ static int map_create(union bpf_attr *attr)  	err = bpf_map_charge_memlock(map);  	if (err) -		goto free_map; +		goto free_map_nouncharge;  	err = bpf_map_new_fd(map);  	if (err < 0) @@ -204,6 +218,8 @@ static int map_create(union bpf_attr *attr)  	return err;  free_map: +	bpf_map_uncharge_memlock(map); +free_map_nouncharge:  	map->ops->map_free(map);  	return err;  } @@ -252,12 +268,6 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)  	return map;  } -/* helper to convert user pointers passed inside __aligned_u64 fields */ -static void __user *u64_to_ptr(__u64 val) -{ -	return (void __user *) (unsigned long) val; -} -  int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)  {  	return -ENOTSUPP; @@ -268,8 +278,8 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)  static int map_lookup_elem(union bpf_attr *attr)  { -	void __user 
*ukey = u64_to_ptr(attr->key); -	void __user *uvalue = u64_to_ptr(attr->value); +	void __user *ukey = u64_to_user_ptr(attr->key); +	void __user *uvalue = u64_to_user_ptr(attr->value);  	int ufd = attr->map_fd;  	struct bpf_map *map;  	void *key, *value, *ptr; @@ -295,6 +305,7 @@ static int map_lookup_elem(union bpf_attr *attr)  		goto free_key;  	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || +	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||  	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)  		value_size = round_up(map->value_size, 8) * num_possible_cpus();  	else @@ -305,7 +316,8 @@ static int map_lookup_elem(union bpf_attr *attr)  	if (!value)  		goto free_key; -	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { +	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || +	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {  		err = bpf_percpu_hash_copy(map, key, value);  	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {  		err = bpf_percpu_array_copy(map, key, value); @@ -342,8 +354,8 @@ err_put:  static int map_update_elem(union bpf_attr *attr)  { -	void __user *ukey = u64_to_ptr(attr->key); -	void __user *uvalue = u64_to_ptr(attr->value); +	void __user *ukey = u64_to_user_ptr(attr->key); +	void __user *uvalue = u64_to_user_ptr(attr->value);  	int ufd = attr->map_fd;  	struct bpf_map *map;  	void *key, *value; @@ -369,6 +381,7 @@ static int map_update_elem(union bpf_attr *attr)  		goto free_key;  	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || +	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||  	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)  		value_size = round_up(map->value_size, 8) * num_possible_cpus();  	else @@ -388,7 +401,8 @@ static int map_update_elem(union bpf_attr *attr)  	 */  	preempt_disable();  	__this_cpu_inc(bpf_prog_active); -	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { +	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || +	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {  		err = bpf_percpu_hash_update(map, key, value, attr->flags);  	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {  		err = bpf_percpu_array_update(map, key, value, attr->flags); @@ -420,7 +434,7 @@ err_put:  static int map_delete_elem(union bpf_attr *attr)  { -	void __user *ukey = u64_to_ptr(attr->key); +	void __user *ukey = u64_to_user_ptr(attr->key);  	int ufd = attr->map_fd;  	struct bpf_map *map;  	struct fd f; @@ -464,8 +478,8 @@ err_put:  static int map_get_next_key(union bpf_attr *attr)  { -	void __user *ukey = u64_to_ptr(attr->key); -	void __user *unext_key = u64_to_ptr(attr->next_key); +	void __user *ukey = u64_to_user_ptr(attr->key); +	void __user *unext_key = u64_to_user_ptr(attr->next_key);  	int ufd = attr->map_fd;  	struct bpf_map *map;  	void *key, *next_key; @@ -565,6 +579,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog)  				prog->dst_needed = 1;  			if (insn->imm == BPF_FUNC_get_prandom_u32)  				bpf_user_rnd_init_once(); +			if (insn->imm == BPF_FUNC_xdp_adjust_head) +				prog->xdp_adjust_head = 1;  			if (insn->imm == BPF_FUNC_tail_call) {  				/* mark bpf_tail_call as different opcode  				 * to avoid conditional branch in @@ -648,8 +664,30 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)  	return 0;  } +#ifdef CONFIG_PROC_FS +static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) +{ +	const struct bpf_prog *prog = filp->private_data; +	char prog_digest[sizeof(prog->digest) * 2 + 1] = { }; + +	bin2hex(prog_digest, prog->digest, sizeof(prog->digest)); +	seq_printf(m, +		   "prog_type:\t%u\n" +		   
"prog_jited:\t%u\n" +		   "prog_digest:\t%s\n" +		   "memlock:\t%llu\n", +		   prog->type, +		   prog->jited, +		   prog_digest, +		   prog->pages * 1ULL << PAGE_SHIFT); +} +#endif +  static const struct file_operations bpf_prog_fops = { -        .release = bpf_prog_release, +#ifdef CONFIG_PROC_FS +	.show_fdinfo	= bpf_prog_show_fdinfo, +#endif +	.release	= bpf_prog_release,  };  int bpf_prog_new_fd(struct bpf_prog *prog) @@ -680,10 +718,22 @@ struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)  }  EXPORT_SYMBOL_GPL(bpf_prog_add); +void bpf_prog_sub(struct bpf_prog *prog, int i) +{ +	/* Only to be used for undoing previous bpf_prog_add() in some +	 * error path. We still know that another entity in our call +	 * path holds a reference to the program, thus atomic_sub() can +	 * be safely used in such cases! +	 */ +	WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0); +} +EXPORT_SYMBOL_GPL(bpf_prog_sub); +  struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)  {  	return bpf_prog_add(prog, 1);  } +EXPORT_SYMBOL_GPL(bpf_prog_inc);  static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)  { @@ -730,7 +780,7 @@ static int bpf_prog_load(union bpf_attr *attr)  		return -EINVAL;  	/* copy eBPF program license from user space */ -	if (strncpy_from_user(license, u64_to_ptr(attr->license), +	if (strncpy_from_user(license, u64_to_user_ptr(attr->license),  			      sizeof(license) - 1) < 0)  		return -EFAULT;  	license[sizeof(license) - 1] = 0; @@ -738,8 +788,8 @@ static int bpf_prog_load(union bpf_attr *attr)  	/* eBPF programs must be GPL compatible to use GPL-ed functions */  	is_gpl = license_is_gpl_compatible(license); -	if (attr->insn_cnt >= BPF_MAXINSNS) -		return -EINVAL; +	if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) +		return -E2BIG;  	if (type == BPF_PROG_TYPE_KPROBE &&  	    attr->kern_version != LINUX_VERSION_CODE) @@ -760,7 +810,7 @@ static int bpf_prog_load(union bpf_attr *attr)  	prog->len = attr->insn_cnt;  	err = -EFAULT; -	if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), +	if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),  			   prog->len * sizeof(struct bpf_insn)) != 0)  		goto free_prog; @@ -811,7 +861,7 @@ static int bpf_obj_pin(const union bpf_attr *attr)  	if (CHECK_ATTR(BPF_OBJ))  		return -EINVAL; -	return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); +	return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));  }  static int bpf_obj_get(const union bpf_attr *attr) @@ -819,9 +869,85 @@ static int bpf_obj_get(const union bpf_attr *attr)  	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)  		return -EINVAL; -	return bpf_obj_get_user(u64_to_ptr(attr->pathname)); +	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); +} + +#ifdef CONFIG_CGROUP_BPF + +#define BPF_PROG_ATTACH_LAST_FIELD attach_type + +static int bpf_prog_attach(const union bpf_attr *attr) +{ +	struct bpf_prog *prog; +	struct cgroup *cgrp; +	enum bpf_prog_type ptype; + +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; + +	if (CHECK_ATTR(BPF_PROG_ATTACH)) +		return -EINVAL; + +	switch (attr->attach_type) { +	case BPF_CGROUP_INET_INGRESS: +	case BPF_CGROUP_INET_EGRESS: +		ptype = BPF_PROG_TYPE_CGROUP_SKB; +		break; +	case BPF_CGROUP_INET_SOCK_CREATE: +		ptype = BPF_PROG_TYPE_CGROUP_SOCK; +		break; +	default: +		return -EINVAL; +	} + +	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); +	if (IS_ERR(prog)) +		return PTR_ERR(prog); + +	cgrp = cgroup_get_from_fd(attr->target_fd); +	if (IS_ERR(cgrp)) { +		bpf_prog_put(prog); +		return 
PTR_ERR(cgrp); +	} + +	cgroup_bpf_update(cgrp, prog, attr->attach_type); +	cgroup_put(cgrp); + +	return 0;  } +#define BPF_PROG_DETACH_LAST_FIELD attach_type + +static int bpf_prog_detach(const union bpf_attr *attr) +{ +	struct cgroup *cgrp; + +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; + +	if (CHECK_ATTR(BPF_PROG_DETACH)) +		return -EINVAL; + +	switch (attr->attach_type) { +	case BPF_CGROUP_INET_INGRESS: +	case BPF_CGROUP_INET_EGRESS: +	case BPF_CGROUP_INET_SOCK_CREATE: +		cgrp = cgroup_get_from_fd(attr->target_fd); +		if (IS_ERR(cgrp)) +			return PTR_ERR(cgrp); + +		cgroup_bpf_update(cgrp, NULL, attr->attach_type); +		cgroup_put(cgrp); +		break; + +	default: +		return -EINVAL; +	} + +	return 0; +} +#endif /* CONFIG_CGROUP_BPF */ +  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)  {  	union bpf_attr attr = {}; @@ -888,6 +1014,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz  	case BPF_OBJ_GET:  		err = bpf_obj_get(&attr);  		break; + +#ifdef CONFIG_CGROUP_BPF +	case BPF_PROG_ATTACH: +		err = bpf_prog_attach(&attr); +		break; +	case BPF_PROG_DETACH: +		err = bpf_prog_detach(&attr); +		break; +#endif +  	default:  		err = -EINVAL;  		break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99a7e5b388f2..d28f9a3380a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19,6 +19,7 @@  #include <net/netlink.h>  #include <linux/file.h>  #include <linux/vmalloc.h> +#include <linux/stringify.h>  /* bpf_check() is a static code analyzer that walks eBPF program   * instruction by instruction and updates register/stack state. @@ -190,6 +191,22 @@ static const char * const reg_type_str[] = {  	[PTR_TO_PACKET_END]	= "pkt_end",  }; +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { +	__BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +static const char *func_id_name(int id) +{ +	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + +	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) +		return func_id_str[id]; +	else +		return "unknown"; +} +  static void print_verifier_state(struct bpf_verifier_state *state)  {  	struct bpf_reg_state *reg; @@ -212,12 +229,13 @@ static void print_verifier_state(struct bpf_verifier_state *state)  		else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||  			 t == PTR_TO_MAP_VALUE_OR_NULL ||  			 t == PTR_TO_MAP_VALUE_ADJ) -			verbose("(ks=%d,vs=%d)", +			verbose("(ks=%d,vs=%d,id=%u)",  				reg->map_ptr->key_size, -				reg->map_ptr->value_size); +				reg->map_ptr->value_size, +				reg->id);  		if (reg->min_value != BPF_REGISTER_MIN_RANGE) -			verbose(",min_value=%llu", -				(unsigned long long)reg->min_value); +			verbose(",min_value=%lld", +				(long long)reg->min_value);  		if (reg->max_value != BPF_REGISTER_MAX_RANGE)  			verbose(",max_value=%llu",  				(unsigned long long)reg->max_value); @@ -353,7 +371,8 @@ static void print_bpf_insn(struct bpf_insn *insn)  		u8 opcode = BPF_OP(insn->code);  		if (opcode == BPF_CALL) { -			verbose("(%02x) call %d\n", insn->code, insn->imm); +			verbose("(%02x) call %s#%d\n", insn->code, +				func_id_name(insn->imm), insn->imm);  		} else if (insn->code == (BPF_JMP | BPF_JA)) {  			verbose("(%02x) goto pc%+d\n",  				insn->code, insn->off); @@ -447,6 +466,7 @@ static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)  {  	BUG_ON(regno >= MAX_BPF_REG);  	regs[regno].type = UNKNOWN_VALUE; +	regs[regno].id = 0;  	
regs[regno].imm = 0;  } @@ -613,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,  #define MAX_PACKET_OFF 0xffff  static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, -				       const struct bpf_call_arg_meta *meta) +				       const struct bpf_call_arg_meta *meta, +				       enum bpf_access_type t)  {  	switch (env->prog->type) { +	case BPF_PROG_TYPE_LWT_IN: +	case BPF_PROG_TYPE_LWT_OUT: +		/* dst_input() and dst_output() can't write for now */ +		if (t == BPF_WRITE) +			return false;  	case BPF_PROG_TYPE_SCHED_CLS:  	case BPF_PROG_TYPE_SCHED_ACT:  	case BPF_PROG_TYPE_XDP: +	case BPF_PROG_TYPE_LWT_XMIT:  		if (meta)  			return meta->pkt_access; @@ -758,7 +785,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,  			 * index'es we need to make sure that whatever we use  			 * will have a set floor within our range.  			 */ -			if ((s64)reg->min_value < 0) { +			if (reg->min_value < 0) {  				verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",  					regno);  				return -EACCES; @@ -817,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,  			err = check_stack_read(state, off, size, value_regno);  		}  	} else if (state->regs[regno].type == PTR_TO_PACKET) { -		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { +		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {  			verbose("cannot write into packet\n");  			return -EACCES;  		} @@ -950,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  		return 0;  	} -	if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { +	if (type == PTR_TO_PACKET && +	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {  		verbose("helper access to the packet is not allowed\n");  		return -EACCES;  	} @@ -1112,8 +1140,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)  	return 0;  error: -	verbose("cannot pass map_type %d into func %d\n", -		map->map_type, func_id); +	verbose("cannot pass map_type %d into func %s#%d\n", +		map->map_type, func_id_name(func_id), func_id);  	return -EINVAL;  } @@ -1170,7 +1198,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id)  	/* find function prototype */  	if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { -		verbose("invalid func %d\n", func_id); +		verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);  		return -EINVAL;  	} @@ -1178,7 +1206,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id)  		fn = env->prog->aux->ops->get_func_proto(func_id);  	if (!fn) { -		verbose("unknown func %d\n", func_id); +		verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);  		return -EINVAL;  	} @@ -1188,7 +1216,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id)  		return -EINVAL;  	} -	changes_data = bpf_helper_changes_skb_data(fn->func); +	changes_data = bpf_helper_changes_pkt_data(fn->func);  	memset(&meta, 0, sizeof(meta));  	meta.pkt_access = fn->pkt_access; @@ -1198,7 +1226,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id)  	 */  	err = check_raw_mode(fn);  	if (err) { -		verbose("kernel subsystem misconfigured func %d\n", func_id); +		verbose("kernel subsystem misconfigured func %s#%d\n", +			func_id_name(func_id), func_id);  		return err;  	} @@ -1252,9 +1281,10 @@ static int check_call(struct bpf_verifier_env *env, int func_id)  			return -EINVAL;  		}  		
regs[BPF_REG_0].map_ptr = meta.map_ptr; +		regs[BPF_REG_0].id = ++env->id_gen;  	} else { -		verbose("unknown return type %d of func %d\n", -			fn->ret_type, func_id); +		verbose("unknown return type %d of func %s#%d\n", +			fn->ret_type, func_id_name(func_id), func_id);  		return -EINVAL;  	} @@ -1451,14 +1481,19 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,  	struct bpf_reg_state *src_reg = ®s[insn->src_reg];  	u8 opcode = BPF_OP(insn->code); -	/* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. -	 * Don't care about overflow or negative values, just add them +	/* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or' +	 * insn. Don't care about overflow or negative values, just add them  	 */  	if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K)  		dst_reg->imm += insn->imm;  	else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&  		 src_reg->type == CONST_IMM)  		dst_reg->imm += src_reg->imm; +	else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) +		dst_reg->imm |= insn->imm; +	else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && +		 src_reg->type == CONST_IMM) +		dst_reg->imm |= src_reg->imm;  	else  		mark_reg_unknown_value(regs, insn->dst_reg);  	return 0; @@ -1468,7 +1503,8 @@ static void check_reg_overflow(struct bpf_reg_state *reg)  {  	if (reg->max_value > BPF_REGISTER_MAX_RANGE)  		reg->max_value = BPF_REGISTER_MAX_RANGE; -	if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE) +	if (reg->min_value < BPF_REGISTER_MIN_RANGE || +	    reg->min_value > BPF_REGISTER_MAX_RANGE)  		reg->min_value = BPF_REGISTER_MIN_RANGE;  } @@ -1476,8 +1512,8 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  				    struct bpf_insn *insn)  {  	struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; -	u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; -	bool min_set = false, max_set = false; +	s64 min_val = BPF_REGISTER_MIN_RANGE; +	u64 max_val = BPF_REGISTER_MAX_RANGE;  	u8 opcode = BPF_OP(insn->code);  	dst_reg = ®s[insn->dst_reg]; @@ -1500,7 +1536,6 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  	} else if (insn->imm < BPF_REGISTER_MAX_RANGE &&  		   (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {  		min_val = max_val = insn->imm; -		min_set = max_set = true;  	}  	/* We don't know anything about what was done to this register, mark it @@ -1512,22 +1547,43 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  		return;  	} +	/* If one of our values was at the end of our ranges then we can't just +	 * do our normal operations to the register, we need to set the values +	 * to the min/max since they are undefined. 
+	 */ +	if (min_val == BPF_REGISTER_MIN_RANGE) +		dst_reg->min_value = BPF_REGISTER_MIN_RANGE; +	if (max_val == BPF_REGISTER_MAX_RANGE) +		dst_reg->max_value = BPF_REGISTER_MAX_RANGE; +  	switch (opcode) {  	case BPF_ADD: -		dst_reg->min_value += min_val; -		dst_reg->max_value += max_val; +		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) +			dst_reg->min_value += min_val; +		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) +			dst_reg->max_value += max_val;  		break;  	case BPF_SUB: -		dst_reg->min_value -= min_val; -		dst_reg->max_value -= max_val; +		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) +			dst_reg->min_value -= min_val; +		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) +			dst_reg->max_value -= max_val;  		break;  	case BPF_MUL: -		dst_reg->min_value *= min_val; -		dst_reg->max_value *= max_val; +		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) +			dst_reg->min_value *= min_val; +		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) +			dst_reg->max_value *= max_val;  		break;  	case BPF_AND: -		/* & is special since it could end up with 0 bits set. */ -		dst_reg->min_value &= min_val; +		/* Disallow AND'ing of negative numbers, ain't nobody got time +		 * for that.  Otherwise the minimum is 0 and the max is the max +		 * value we could AND against. +		 */ +		if (min_val < 0) +			dst_reg->min_value = BPF_REGISTER_MIN_RANGE; +		else +			dst_reg->min_value = 0;  		dst_reg->max_value = max_val;  		break;  	case BPF_LSH: @@ -1537,24 +1593,25 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,  		 */  		if (min_val > ilog2(BPF_REGISTER_MAX_RANGE))  			dst_reg->min_value = BPF_REGISTER_MIN_RANGE; -		else +		else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)  			dst_reg->min_value <<= min_val;  		if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))  			dst_reg->max_value = BPF_REGISTER_MAX_RANGE; -		else +		else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)  			dst_reg->max_value <<= max_val;  		break;  	case BPF_RSH: -		dst_reg->min_value >>= min_val; -		dst_reg->max_value >>= max_val; -		break; -	case BPF_MOD: -		/* % is special since it is an unsigned modulus, so the floor -		 * will always be 0. +		/* RSH by a negative number is undefined, and the BPF_RSH is an +		 * unsigned shift, so make the appropriate casts.  		 
*/ -		dst_reg->min_value = 0; -		dst_reg->max_value = max_val - 1; +		if (min_val < 0 || dst_reg->min_value < 0) +			dst_reg->min_value = BPF_REGISTER_MIN_RANGE; +		else +			dst_reg->min_value = +				(u64)(dst_reg->min_value) >> min_val; +		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) +			dst_reg->max_value >>= max_val;  		break;  	default:  		reset_reg_range_values(regs, insn->dst_reg); @@ -1644,8 +1701,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  						insn->src_reg);  					return -EACCES;  				} -				regs[insn->dst_reg].type = UNKNOWN_VALUE; -				regs[insn->dst_reg].map_ptr = NULL; +				mark_reg_unknown_value(regs, insn->dst_reg);  			}  		} else {  			/* case: R = imm @@ -1907,6 +1963,38 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,  	check_reg_overflow(true_reg);  } +static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, +			 enum bpf_reg_type type) +{ +	struct bpf_reg_state *reg = ®s[regno]; + +	if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { +		reg->type = type; +		if (type == UNKNOWN_VALUE) +			mark_reg_unknown_value(regs, regno); +	} +} + +/* The logic is similar to find_good_pkt_pointers(), both could eventually + * be folded together at some point. + */ +static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, +			  enum bpf_reg_type type) +{ +	struct bpf_reg_state *regs = state->regs; +	int i; + +	for (i = 0; i < MAX_BPF_REG; i++) +		mark_map_reg(regs, i, regs[regno].id, type); + +	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { +		if (state->stack_slot_type[i] != STACK_SPILL) +			continue; +		mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, +			     regs[regno].id, type); +	} +} +  static int check_cond_jmp_op(struct bpf_verifier_env *env,  			     struct bpf_insn *insn, int *insn_idx)  { @@ -1994,18 +2082,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  	if (BPF_SRC(insn->code) == BPF_K &&  	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&  	    dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { -		if (opcode == BPF_JEQ) { -			/* next fallthrough insn can access memory via -			 * this register -			 */ -			regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; -			/* branch targer cannot access it, since reg == 0 */ -			mark_reg_unknown_value(other_branch->regs, -					       insn->dst_reg); -		} else { -			other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; -			mark_reg_unknown_value(regs, insn->dst_reg); -		} +		/* Mark all identical map registers in each branch as either +		 * safe or unknown depending R == 0 or R != 0 conditional. +		 */ +		mark_map_regs(this_branch, insn->dst_reg, +			      opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE); +		mark_map_regs(other_branch, insn->dst_reg, +			      opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE);  	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&  		   dst_reg->type == PTR_TO_PACKET &&  		   regs[insn->src_reg].type == PTR_TO_PACKET_END) { @@ -2430,6 +2513,7 @@ static bool states_equal(struct bpf_verifier_env *env,  			 struct bpf_verifier_state *old,  			 struct bpf_verifier_state *cur)  { +	bool varlen_map_access = env->varlen_map_value_access;  	struct bpf_reg_state *rold, *rcur;  	int i; @@ -2443,12 +2527,17 @@ static bool states_equal(struct bpf_verifier_env *env,  		/* If the ranges were not the same, but everything else was and  		 * we didn't do a variable access into a map then we are a-ok.  		 
*/ -		if (!env->varlen_map_value_access && -		    rold->type == rcur->type && rold->imm == rcur->imm) +		if (!varlen_map_access && +		    memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0)  			continue; +		/* If we didn't map access then again we don't care about the +		 * mismatched range values and it's ok if our old type was +		 * UNKNOWN and we didn't go to a NOT_INIT'ed reg. +		 */  		if (rold->type == NOT_INIT || -		    (!varlen_map_access && rold->type == UNKNOWN_VALUE && +		     rcur->type != NOT_INIT))  			continue;  		if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && @@ -3044,9 +3133,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)  	struct bpf_verifier_env *env;  	int ret = -EINVAL; -	if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) -		return -E2BIG; -  	/* 'struct bpf_verifier_env' can be global, but since it's not small,  	 * allocate/free it every time bpf_check() is called  	 */ @@ -3087,6 +3173,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)  		log_level = 0;  	} +	bpf_prog_calc_digest(env->prog); +  	ret = replace_map_fd_with_map_ptr(env);  	if (ret < 0)  		goto skip_full_check;
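
The inode.c hunk above lets the bpf filesystem honour a "mode=" mount option instead of always forcing S_IRWXUGO on the root directory. A minimal userspace sketch of requesting that; the mount point and the 0700 value are only examples, not anything mandated by the patch:

/* sketch: mount bpffs with a restricted root directory mode */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* "mode=0700" reaches bpf_parse_options() via bpf_fill_super();
	 * only the permission bits (S_IALLUGO) are kept, and the sticky
	 * bit is still OR'ed in by the kernel.
	 */
	if (mount("bpffs", "/sys/fs/bpf", "bpf", 0, "mode=0700")) {
		perror("mount");
		return 1;
	}
	return 0;
}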
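
The map fdinfo in syscall.c now reports a "memlock" line (map->pages scaled by PAGE_SHIFT) and, for prog arrays, an "owner_prog_type" line. A rough sketch of how a tool might read the memlock value back; the parsing is illustrative only and relies on the "memlock:\t%llu\n" format added above:

/* sketch: pull the "memlock" line out of /proc/self/fdinfo/<map fd> */
#include <stdio.h>

static long long map_memlock_bytes(int map_fd)
{
	char path[64], line[256];
	long long memlock = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", map_fd);
	f = fopen(path, "r");
	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		/* the space in the format skips the tab after the colon */
		if (sscanf(line, "memlock: %lld", &memlock) == 1)
			break;
	}
	fclose(f);
	return memlock;
}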
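
With BPF_MAP_TYPE_LRU_PERCPU_HASH wired into map_lookup_elem()/map_update_elem(), the value buffer exchanged with the kernel for any per-cpu map is round_up(value_size, 8) * num_possible_cpus() bytes. A rough userspace sketch of sizing that buffer; sysconf() is only an approximation of the possible-CPU count (the authoritative source is /sys/devices/system/cpu/possible), and the raw syscall use assumes __NR_bpf and a uapi linux/bpf.h that match this kernel:

/* sketch: size and fetch a per-cpu value with BPF_MAP_LOOKUP_ELEM */
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static __u64 ptr_to_u64(const void *p)
{
	return (__u64)(unsigned long)p;
}

/* value_size is the per-CPU value size passed at map creation time */
static void *percpu_lookup(int map_fd, const void *key, __u32 value_size)
{
	long ncpus = sysconf(_SC_NPROCESSORS_CONF);	/* approximation */
	size_t per_cpu = (value_size + 7) & ~7UL;	/* round_up(value_size, 8) */
	union bpf_attr attr;
	void *values;

	if (ncpus < 1)
		return NULL;
	values = calloc(ncpus, per_cpu);
	if (!values)
		return NULL;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = ptr_to_u64(key);
	attr.value = ptr_to_u64(values);

	if (syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) < 0) {
		free(values);
		return NULL;
	}
	return values;	/* ncpus slots, each per_cpu bytes apart */
}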
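
The new BPF_PROG_ATTACH command takes the cgroup directory fd in target_fd and the program fd in attach_bpf_fd; BPF_PROG_DETACH passes the same attach_type with no program. A minimal sketch of the attach path under those assumptions (the cgroup2 path is an example, error handling is trimmed, and the uapi headers must already carry these constants):

/* sketch: attach a BPF_PROG_TYPE_CGROUP_SKB program to a cgroup's
 * egress hook via BPF_PROG_ATTACH
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int attach_egress(int prog_fd, const char *cgrp_path)
{
	union bpf_attr attr;
	int cgrp_fd, err;

	cgrp_fd = open(cgrp_path, O_RDONLY | O_DIRECTORY);
	if (cgrp_fd < 0)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = cgrp_fd;
	attr.attach_bpf_fd = prog_fd;
	attr.attach_type = BPF_CGROUP_INET_EGRESS;

	err = syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
	close(cgrp_fd);
	return err;
}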
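
verifier.c now resolves helper ids to names through __BPF_FUNC_MAPPER and __stringify, so the log prints e.g. "call bpf_map_lookup_elem#1" rather than a bare number. The same X-macro trick in a free-standing form; the entries and the my_ prefix are made up for illustration, and the kernel's table is additionally indexed by the BPF_FUNC_* values:

/* sketch: deriving a name table from an X-macro, as func_id_str[] does */
#include <stdio.h>

#define __stringify(x)		#x

#define MY_FUNC_MAPPER(FN)	\
	FN(map_lookup_elem),	\
	FN(map_update_elem),	\
	FN(tail_call),

#define MY_FUNC_STR_FN(x)	__stringify(my_##x)
static const char * const my_func_str[] = {
	MY_FUNC_MAPPER(MY_FUNC_STR_FN)
};
#undef MY_FUNC_STR_FN

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(my_func_str) / sizeof(my_func_str[0]); i++)
		printf("%u -> %s\n", i, my_func_str[i]);
	return 0;
}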
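
The reworked adjust_reg_min_max_vals() treats BPF_REGISTER_MIN_RANGE and BPF_REGISTER_MAX_RANGE as saturating sentinels: an unknown operand bound poisons the corresponding result bound, and known bounds are only adjusted by known values. A small standalone illustration of that policy for the BPF_ADD case; the sentinel values and names below are stand-ins, not the kernel's:

/* sketch: saturating min/max tracking in the spirit of the BPF_ADD
 * handling above; REG_MIN/REG_MAX stand in for the kernel's
 * BPF_REGISTER_MIN_RANGE/BPF_REGISTER_MAX_RANGE sentinels
 */
#include <stdint.h>
#include <stdio.h>

#define REG_MIN	INT64_MIN	/* "minimum is unknown" */
#define REG_MAX	INT64_MAX	/* "maximum is unknown" */

struct bounds {
	int64_t min;
	int64_t max;
};

static void bounds_add(struct bounds *dst, int64_t vmin, int64_t vmax)
{
	/* an unknown operand bound poisons the matching result bound */
	if (vmin == REG_MIN)
		dst->min = REG_MIN;
	if (vmax == REG_MAX)
		dst->max = REG_MAX;

	/* only bounds that are still known get adjusted */
	if (dst->min != REG_MIN)
		dst->min += vmin;
	if (dst->max != REG_MAX)
		dst->max += vmax;
}

int main(void)
{
	struct bounds b = { .min = 0, .max = 64 };

	bounds_add(&b, 4, 4);		/* known constant: now 4..68 */
	bounds_add(&b, REG_MIN, 16);	/* unknown minimum: min is poisoned */
	printf("min=%lld max=%lld\n", (long long)b.min, (long long)b.max);
	return 0;
}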
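
The reg->id assigned at the map_lookup_elem() call site lets mark_map_regs() downgrade every copy of the same PTR_TO_MAP_VALUE_OR_NULL once any one copy is NULL-tested. In restricted C the newly accepted shape looks roughly like the snippet below; it only illustrates the pattern (one test covering an alias), it is not a loadable program on its own, whether the compiler really keeps two separate registers is up to it, and the helper stub follows the usual samples/bpf convention rather than anything introduced by this patch:

/* sketch: a single NULL check proving an aliased copy of the same
 * lookup result is safe to dereference
 */
#include <linux/bpf.h>

static void *(*bpf_map_lookup_elem)(void *map, void *key) =
	(void *) BPF_FUNC_map_lookup_elem;

int aliased_check(void *map, int *key)
{
	long *val = bpf_map_lookup_elem(map, key);
	long *copy = val;	/* same verifier id as val */

	if (!val)		/* testing one copy ...             */
		return 0;
	return *copy != 0;	/* ... marks the alias safe as well */
}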