Diffstat (limited to 'net/core')
32 files changed, 4983 insertions, 2108 deletions
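Context for the bpf_sk_storage.c changes below: the socket-storage map is reworked on top of the generic bpf_local_storage infrastructure, and new bpf_sk_storage_(get|delete)_tracing protos make the helpers usable from fentry/fexit/iter/raw_tp programs (with the in_irq()/in_nmi() and "not tracing bpf_sk_storage*" restrictions shown in the diff). A minimal BPF-side sketch of how such a map would be consumed is given here for orientation only; it assumes a libbpf + vmlinux.h build, and the map name (sk_stg_map), value layout, program name, and the inet_sock_set_state attach point are illustrative, not part of this patch.

/* Illustrative sketch only: count state changes per socket in sk local
 * storage from a tracing program, the use case enabled by the
 * *_tracing protos added in this diff. Names are hypothetical.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);	/* required by map_alloc_check */
	__type(key, int);			/* key must be a 32-bit int (sk fd) */
	__type(value, __u64);
} sk_stg_map SEC(".maps");

SEC("fentry/inet_sock_set_state")
int BPF_PROG(trace_state, struct sock *sk, int oldstate, int newstate)
{
	__u64 *cnt;

	/* Creates the per-socket value on first use; the tracing variant
	 * returns NULL when called from irq/nmi context.
	 */
	cnt = bpf_sk_storage_get(&sk_stg_map, sk, 0,
				 BPF_SK_STORAGE_GET_F_CREATE);
	if (cnt)
		__sync_fetch_and_add(cnt, 1);
	return 0;
}

char _license[] SEC("license") = "GPL";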
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index b988f48153a4..4edd033e899c 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -6,550 +6,48 @@ #include <linux/types.h> #include <linux/spinlock.h> #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/btf_ids.h> +#include <linux/bpf_local_storage.h> #include <net/bpf_sk_storage.h> #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> -#define SK_STORAGE_CREATE_FLAG_MASK \ - (BPF_F_NO_PREALLOC | BPF_F_CLONE) +DEFINE_BPF_STORAGE_CACHE(sk_cache); -struct bucket { - struct hlist_head list; - raw_spinlock_t lock; -}; - -/* Thp map is not the primary owner of a bpf_sk_storage_elem. - * Instead, the sk->sk_bpf_storage is. - * - * The map (bpf_sk_storage_map) is for two purposes - * 1. Define the size of the "sk local storage". It is - * the map's value_size. - * - * 2. Maintain a list to keep track of all elems such - * that they can be cleaned up during the map destruction. - * - * When a bpf local storage is being looked up for a - * particular sk, the "bpf_map" pointer is actually used - * as the "key" to search in the list of elem in - * sk->sk_bpf_storage. - * - * Hence, consider sk->sk_bpf_storage is the mini-map - * with the "bpf_map" pointer as the searching key. - */ -struct bpf_sk_storage_map { - struct bpf_map map; - /* Lookup elem does not require accessing the map. - * - * Updating/Deleting requires a bucket lock to - * link/unlink the elem from the map. Having - * multiple buckets to improve contention. - */ - struct bucket *buckets; - u32 bucket_log; - u16 elem_size; - u16 cache_idx; -}; - -struct bpf_sk_storage_data { - /* smap is used as the searching key when looking up - * from sk->sk_bpf_storage. - * - * Put it in the same cacheline as the data to minimize - * the number of cachelines access during the cache hit case. - */ - struct bpf_sk_storage_map __rcu *smap; - u8 data[] __aligned(8); -}; - -/* Linked to bpf_sk_storage and bpf_sk_storage_map */ -struct bpf_sk_storage_elem { - struct hlist_node map_node; /* Linked to bpf_sk_storage_map */ - struct hlist_node snode; /* Linked to bpf_sk_storage */ - struct bpf_sk_storage __rcu *sk_storage; - struct rcu_head rcu; - /* 8 bytes hole */ - /* The data is stored in aother cacheline to minimize - * the number of cachelines access during a cache hit. - */ - struct bpf_sk_storage_data sdata ____cacheline_aligned; -}; - -#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata) -#define SDATA(_SELEM) (&(_SELEM)->sdata) -#define BPF_SK_STORAGE_CACHE_SIZE 16 - -static DEFINE_SPINLOCK(cache_idx_lock); -static u64 cache_idx_usage_counts[BPF_SK_STORAGE_CACHE_SIZE]; - -struct bpf_sk_storage { - struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE]; - struct hlist_head list; /* List of bpf_sk_storage_elem */ - struct sock *sk; /* The sk that owns the the above "list" of - * bpf_sk_storage_elem. 
- */ - struct rcu_head rcu; - raw_spinlock_t lock; /* Protect adding/removing from the "list" */ -}; - -static struct bucket *select_bucket(struct bpf_sk_storage_map *smap, - struct bpf_sk_storage_elem *selem) +static struct bpf_local_storage_data * +bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) { - return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; -} - -static int omem_charge(struct sock *sk, unsigned int size) -{ - /* same check as in sock_kmalloc() */ - if (size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { - atomic_add(size, &sk->sk_omem_alloc); - return 0; - } - - return -ENOMEM; -} - -static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem) -{ - return !hlist_unhashed(&selem->snode); -} - -static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem) -{ - return !hlist_unhashed(&selem->map_node); -} - -static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap, - struct sock *sk, void *value, - bool charge_omem) -{ - struct bpf_sk_storage_elem *selem; - - if (charge_omem && omem_charge(sk, smap->elem_size)) - return NULL; - - selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (selem) { - if (value) - memcpy(SDATA(selem)->data, value, smap->map.value_size); - return selem; - } - - if (charge_omem) - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); - - return NULL; -} - -/* sk_storage->lock must be held and selem->sk_storage == sk_storage. - * The caller must ensure selem->smap is still valid to be - * dereferenced for its smap->elem_size and smap->cache_idx. - */ -static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage, - struct bpf_sk_storage_elem *selem, - bool uncharge_omem) -{ - struct bpf_sk_storage_map *smap; - bool free_sk_storage; - struct sock *sk; - - smap = rcu_dereference(SDATA(selem)->smap); - sk = sk_storage->sk; - - /* All uncharging on sk->sk_omem_alloc must be done first. - * sk may be freed once the last selem is unlinked from sk_storage. - */ - if (uncharge_omem) - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); - - free_sk_storage = hlist_is_singular_node(&selem->snode, - &sk_storage->list); - if (free_sk_storage) { - atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc); - sk_storage->sk = NULL; - /* After this RCU_INIT, sk may be freed and cannot be used */ - RCU_INIT_POINTER(sk->sk_bpf_storage, NULL); - - /* sk_storage is not freed now. sk_storage->lock is - * still held and raw_spin_unlock_bh(&sk_storage->lock) - * will be done by the caller. - * - * Although the unlock will be done under - * rcu_read_lock(), it is more intutivie to - * read if kfree_rcu(sk_storage, rcu) is done - * after the raw_spin_unlock_bh(&sk_storage->lock). - * - * Hence, a "bool free_sk_storage" is returned - * to the caller which then calls the kfree_rcu() - * after unlock. 
- */ - } - hlist_del_init_rcu(&selem->snode); - if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) == - SDATA(selem)) - RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL); - - kfree_rcu(selem, rcu); - - return free_sk_storage; -} - -static void selem_unlink_sk(struct bpf_sk_storage_elem *selem) -{ - struct bpf_sk_storage *sk_storage; - bool free_sk_storage = false; - - if (unlikely(!selem_linked_to_sk(selem))) - /* selem has already been unlinked from sk */ - return; - - sk_storage = rcu_dereference(selem->sk_storage); - raw_spin_lock_bh(&sk_storage->lock); - if (likely(selem_linked_to_sk(selem))) - free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); - raw_spin_unlock_bh(&sk_storage->lock); - - if (free_sk_storage) - kfree_rcu(sk_storage, rcu); -} - -static void __selem_link_sk(struct bpf_sk_storage *sk_storage, - struct bpf_sk_storage_elem *selem) -{ - RCU_INIT_POINTER(selem->sk_storage, sk_storage); - hlist_add_head(&selem->snode, &sk_storage->list); -} - -static void selem_unlink_map(struct bpf_sk_storage_elem *selem) -{ - struct bpf_sk_storage_map *smap; - struct bucket *b; - - if (unlikely(!selem_linked_to_map(selem))) - /* selem has already be unlinked from smap */ - return; - - smap = rcu_dereference(SDATA(selem)->smap); - b = select_bucket(smap, selem); - raw_spin_lock_bh(&b->lock); - if (likely(selem_linked_to_map(selem))) - hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_bh(&b->lock); -} - -static void selem_link_map(struct bpf_sk_storage_map *smap, - struct bpf_sk_storage_elem *selem) -{ - struct bucket *b = select_bucket(smap, selem); - - raw_spin_lock_bh(&b->lock); - RCU_INIT_POINTER(SDATA(selem)->smap, smap); - hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_bh(&b->lock); -} - -static void selem_unlink(struct bpf_sk_storage_elem *selem) -{ - /* Always unlink from map before unlinking from sk_storage - * because selem will be freed after successfully unlinked from - * the sk_storage. - */ - selem_unlink_map(selem); - selem_unlink_sk(selem); -} - -static struct bpf_sk_storage_data * -__sk_storage_lookup(struct bpf_sk_storage *sk_storage, - struct bpf_sk_storage_map *smap, - bool cacheit_lockit) -{ - struct bpf_sk_storage_data *sdata; - struct bpf_sk_storage_elem *selem; - - /* Fast path (cache hit) */ - sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]); - if (sdata && rcu_access_pointer(sdata->smap) == smap) - return sdata; - - /* Slow path (cache miss) */ - hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) - if (rcu_access_pointer(SDATA(selem)->smap) == smap) - break; - - if (!selem) - return NULL; - - sdata = SDATA(selem); - if (cacheit_lockit) { - /* spinlock is needed to avoid racing with the - * parallel delete. Otherwise, publishing an already - * deleted sdata to the cache will become a use-after-free - * problem in the next __sk_storage_lookup(). 
- */ - raw_spin_lock_bh(&sk_storage->lock); - if (selem_linked_to_sk(selem)) - rcu_assign_pointer(sk_storage->cache[smap->cache_idx], - sdata); - raw_spin_unlock_bh(&sk_storage->lock); - } - - return sdata; -} - -static struct bpf_sk_storage_data * -sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) -{ - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_map *smap; + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_map *smap; sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage) return NULL; - smap = (struct bpf_sk_storage_map *)map; - return __sk_storage_lookup(sk_storage, smap, cacheit_lockit); -} - -static int check_flags(const struct bpf_sk_storage_data *old_sdata, - u64 map_flags) -{ - if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) - /* elem already exists */ - return -EEXIST; - - if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) - /* elem doesn't exist, cannot update it */ - return -ENOENT; - - return 0; -} - -static int sk_storage_alloc(struct sock *sk, - struct bpf_sk_storage_map *smap, - struct bpf_sk_storage_elem *first_selem) -{ - struct bpf_sk_storage *prev_sk_storage, *sk_storage; - int err; - - err = omem_charge(sk, sizeof(*sk_storage)); - if (err) - return err; - - sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN); - if (!sk_storage) { - err = -ENOMEM; - goto uncharge; - } - INIT_HLIST_HEAD(&sk_storage->list); - raw_spin_lock_init(&sk_storage->lock); - sk_storage->sk = sk; - - __selem_link_sk(sk_storage, first_selem); - selem_link_map(smap, first_selem); - /* Publish sk_storage to sk. sk->sk_lock cannot be acquired. - * Hence, atomic ops is used to set sk->sk_bpf_storage - * from NULL to the newly allocated sk_storage ptr. - * - * From now on, the sk->sk_bpf_storage pointer is protected - * by the sk_storage->lock. Hence, when freeing - * the sk->sk_bpf_storage, the sk_storage->lock must - * be held before setting sk->sk_bpf_storage to NULL. - */ - prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage, - NULL, sk_storage); - if (unlikely(prev_sk_storage)) { - selem_unlink_map(first_selem); - err = -EAGAIN; - goto uncharge; - - /* Note that even first_selem was linked to smap's - * bucket->list, first_selem can be freed immediately - * (instead of kfree_rcu) because - * bpf_sk_storage_map_free() does a - * synchronize_rcu() before walking the bucket->list. - * Hence, no one is accessing selem from the - * bucket->list under rcu_read_lock(). - */ - } - - return 0; - -uncharge: - kfree(sk_storage); - atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc); - return err; -} - -/* sk cannot be going away because it is linking new elem - * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). - * Otherwise, it will become a leak (and other memory issues - * during map destruction). 
- */ -static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk, - struct bpf_map *map, - void *value, - u64 map_flags) -{ - struct bpf_sk_storage_data *old_sdata = NULL; - struct bpf_sk_storage_elem *selem; - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_map *smap; - int err; - - /* BPF_EXIST and BPF_NOEXIST cannot be both set */ - if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || - /* BPF_F_LOCK can only be used in a value with spin_lock */ - unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) - return ERR_PTR(-EINVAL); - - smap = (struct bpf_sk_storage_map *)map; - sk_storage = rcu_dereference(sk->sk_bpf_storage); - if (!sk_storage || hlist_empty(&sk_storage->list)) { - /* Very first elem for this sk */ - err = check_flags(NULL, map_flags); - if (err) - return ERR_PTR(err); - - selem = selem_alloc(smap, sk, value, true); - if (!selem) - return ERR_PTR(-ENOMEM); - - err = sk_storage_alloc(sk, smap, selem); - if (err) { - kfree(selem); - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); - return ERR_PTR(err); - } - - return SDATA(selem); - } - - if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { - /* Hoping to find an old_sdata to do inline update - * such that it can avoid taking the sk_storage->lock - * and changing the lists. - */ - old_sdata = __sk_storage_lookup(sk_storage, smap, false); - err = check_flags(old_sdata, map_flags); - if (err) - return ERR_PTR(err); - if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) { - copy_map_value_locked(map, old_sdata->data, - value, false); - return old_sdata; - } - } - - raw_spin_lock_bh(&sk_storage->lock); - - /* Recheck sk_storage->list under sk_storage->lock */ - if (unlikely(hlist_empty(&sk_storage->list))) { - /* A parallel del is happening and sk_storage is going - * away. It has just been checked before, so very - * unlikely. Return instead of retry to keep things - * simple. - */ - err = -EAGAIN; - goto unlock_err; - } - - old_sdata = __sk_storage_lookup(sk_storage, smap, false); - err = check_flags(old_sdata, map_flags); - if (err) - goto unlock_err; - - if (old_sdata && (map_flags & BPF_F_LOCK)) { - copy_map_value_locked(map, old_sdata->data, value, false); - selem = SELEM(old_sdata); - goto unlock; - } - - /* sk_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during __selem_unlink_sk(). 
- */ - selem = selem_alloc(smap, sk, value, !old_sdata); - if (!selem) { - err = -ENOMEM; - goto unlock_err; - } - - /* First, link the new selem to the map */ - selem_link_map(smap, selem); - - /* Second, link (and publish) the new selem to sk_storage */ - __selem_link_sk(sk_storage, selem); - - /* Third, remove old selem, SELEM(old_sdata) */ - if (old_sdata) { - selem_unlink_map(SELEM(old_sdata)); - __selem_unlink_sk(sk_storage, SELEM(old_sdata), false); - } - -unlock: - raw_spin_unlock_bh(&sk_storage->lock); - return SDATA(selem); - -unlock_err: - raw_spin_unlock_bh(&sk_storage->lock); - return ERR_PTR(err); + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit); } -static int sk_storage_delete(struct sock *sk, struct bpf_map *map) +static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) { - struct bpf_sk_storage_data *sdata; + struct bpf_local_storage_data *sdata; - sdata = sk_storage_lookup(sk, map, false); + sdata = bpf_sk_storage_lookup(sk, map, false); if (!sdata) return -ENOENT; - selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata)); return 0; } -static u16 cache_idx_get(void) -{ - u64 min_usage = U64_MAX; - u16 i, res = 0; - - spin_lock(&cache_idx_lock); - - for (i = 0; i < BPF_SK_STORAGE_CACHE_SIZE; i++) { - if (cache_idx_usage_counts[i] < min_usage) { - min_usage = cache_idx_usage_counts[i]; - res = i; - - /* Found a free cache_idx */ - if (!min_usage) - break; - } - } - cache_idx_usage_counts[res]++; - - spin_unlock(&cache_idx_lock); - - return res; -} - -static void cache_idx_free(u16 idx) -{ - spin_lock(&cache_idx_lock); - cache_idx_usage_counts[idx]--; - spin_unlock(&cache_idx_lock); -} - /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { - struct bpf_sk_storage_elem *selem; - struct bpf_sk_storage *sk_storage; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *sk_storage; bool free_sk_storage = false; struct hlist_node *n; @@ -565,7 +63,7 @@ void bpf_sk_storage_free(struct sock *sk) * Thus, no elem can be added-to or deleted-from the * sk_storage->list by the bpf_prog or by the bpf-map's syscall. * - * It is racing with bpf_sk_storage_map_free() alone + * It is racing with bpf_local_storage_map_free() alone * when unlinking elem from the sk_storage->list and * the map's bucket->list. */ @@ -574,8 +72,9 @@ void bpf_sk_storage_free(struct sock *sk) /* Always unlink from map before unlinking from * sk_storage. */ - selem_unlink_map(selem); - free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); + bpf_selem_unlink_map(selem); + free_sk_storage = bpf_selem_unlink_storage_nolock(sk_storage, + selem, true); } raw_spin_unlock_bh(&sk_storage->lock); rcu_read_unlock(); @@ -586,130 +85,22 @@ void bpf_sk_storage_free(struct sock *sk) static void bpf_sk_storage_map_free(struct bpf_map *map) { - struct bpf_sk_storage_elem *selem; - struct bpf_sk_storage_map *smap; - struct bucket *b; - unsigned int i; - - smap = (struct bpf_sk_storage_map *)map; - - cache_idx_free(smap->cache_idx); - - /* Note that this map might be concurrently cloned from - * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone - * RCU read section to finish before proceeding. New RCU - * read sections should be prevented via bpf_map_inc_not_zero. - */ - synchronize_rcu(); + struct bpf_local_storage_map *smap; - /* bpf prog and the userspace can no longer access this map - * now. 
No new selem (of this map) can be added - * to the sk->sk_bpf_storage or to the map bucket's list. - * - * The elem of this map can be cleaned up here - * or - * by bpf_sk_storage_free() during __sk_destruct(). - */ - for (i = 0; i < (1U << smap->bucket_log); i++) { - b = &smap->buckets[i]; - - rcu_read_lock(); - /* No one is adding to b->list now */ - while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)), - struct bpf_sk_storage_elem, - map_node))) { - selem_unlink(selem); - cond_resched_rcu(); - } - rcu_read_unlock(); - } - - /* bpf_sk_storage_free() may still need to access the map. - * e.g. bpf_sk_storage_free() has unlinked selem from the map - * which then made the above while((selem = ...)) loop - * exited immediately. - * - * However, the bpf_sk_storage_free() still needs to access - * the smap->elem_size to do the uncharging in - * __selem_unlink_sk(). - * - * Hence, wait another rcu grace period for the - * bpf_sk_storage_free() to finish. - */ - synchronize_rcu(); - - kvfree(smap->buckets); - kfree(map); -} - -/* U16_MAX is much more than enough for sk local storage - * considering a tcp_sock is ~2k. - */ -#define MAX_VALUE_SIZE \ - min_t(u32, \ - (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \ - (U16_MAX - sizeof(struct bpf_sk_storage_elem))) - -static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) -{ - if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || - !(attr->map_flags & BPF_F_NO_PREALLOC) || - attr->max_entries || - attr->key_size != sizeof(int) || !attr->value_size || - /* Enforce BTF for userspace sk dumping */ - !attr->btf_key_type_id || !attr->btf_value_type_id) - return -EINVAL; - - if (!bpf_capable()) - return -EPERM; - - if (attr->value_size > MAX_VALUE_SIZE) - return -E2BIG; - - return 0; + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) { - struct bpf_sk_storage_map *smap; - unsigned int i; - u32 nbuckets; - u64 cost; - int ret; - - smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); - if (!smap) - return ERR_PTR(-ENOMEM); - bpf_map_init_from_attr(&smap->map, attr); - - nbuckets = roundup_pow_of_two(num_possible_cpus()); - /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ - nbuckets = max_t(u32, 2, nbuckets); - smap->bucket_log = ilog2(nbuckets); - cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); - - ret = bpf_map_charge_init(&smap->map.memory, cost); - if (ret < 0) { - kfree(smap); - return ERR_PTR(ret); - } - - smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, - GFP_USER | __GFP_NOWARN); - if (!smap->buckets) { - bpf_map_charge_finish(&smap->map.memory); - kfree(smap); - return ERR_PTR(-ENOMEM); - } + struct bpf_local_storage_map *smap; - for (i = 0; i < nbuckets; i++) { - INIT_HLIST_HEAD(&smap->buckets[i].list); - raw_spin_lock_init(&smap->buckets[i].lock); - } - - smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; - smap->cache_idx = cache_idx_get(); + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache); return &smap->map; } @@ -719,33 +110,16 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static int bpf_sk_storage_map_check_btf(const struct bpf_map *map, - const struct btf *btf, - const struct btf_type *key_type, - 
const struct btf_type *value_type) -{ - u32 int_data; - - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) - return -EINVAL; - - return 0; -} - static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) { - struct bpf_sk_storage_data *sdata; + struct bpf_local_storage_data *sdata; struct socket *sock; int fd, err; fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - sdata = sk_storage_lookup(sock->sk, map, true); + sdata = bpf_sk_storage_lookup(sock->sk, map, true); sockfd_put(sock); return sdata ? sdata->data : NULL; } @@ -756,14 +130,16 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { - struct bpf_sk_storage_data *sdata; + struct bpf_local_storage_data *sdata; struct socket *sock; int fd, err; fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - sdata = sk_storage_update(sock->sk, map, value, map_flags); + sdata = bpf_local_storage_update( + sock->sk, (struct bpf_local_storage_map *)map, value, + map_flags); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } @@ -779,7 +155,7 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - err = sk_storage_delete(sock->sk, map); + err = bpf_sk_storage_del(sock->sk, map); sockfd_put(sock); return err; } @@ -787,14 +163,14 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) return err; } -static struct bpf_sk_storage_elem * +static struct bpf_local_storage_elem * bpf_sk_storage_clone_elem(struct sock *newsk, - struct bpf_sk_storage_map *smap, - struct bpf_sk_storage_elem *selem) + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) { - struct bpf_sk_storage_elem *copy_selem; + struct bpf_local_storage_elem *copy_selem; - copy_selem = selem_alloc(smap, newsk, NULL, true); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, true); if (!copy_selem) return NULL; @@ -810,9 +186,9 @@ bpf_sk_storage_clone_elem(struct sock *newsk, int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) { - struct bpf_sk_storage *new_sk_storage = NULL; - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_elem *selem; + struct bpf_local_storage *new_sk_storage = NULL; + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_elem *selem; int ret = 0; RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); @@ -824,8 +200,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) goto out; hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { - struct bpf_sk_storage_elem *copy_selem; - struct bpf_sk_storage_map *smap; + struct bpf_local_storage_elem *copy_selem; + struct bpf_local_storage_map *smap; struct bpf_map *map; smap = rcu_dereference(SDATA(selem)->smap); @@ -833,7 +209,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) continue; /* Note that for lockless listeners adding new element - * here can race with cleanup in bpf_sk_storage_map_free. + * here can race with cleanup in bpf_local_storage_map_free. * Try to grab map refcnt to make sure that it's still * alive and prevent concurrent removal. 
*/ @@ -849,10 +225,10 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } if (new_sk_storage) { - selem_link_map(smap, copy_selem); - __selem_link_sk(new_sk_storage, copy_selem); + bpf_selem_link_map(smap, copy_selem); + bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { - ret = sk_storage_alloc(newsk, smap, copy_selem); + ret = bpf_local_storage_alloc(newsk, smap, copy_selem); if (ret) { kfree(copy_selem); atomic_sub(smap->elem_size, @@ -861,7 +237,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) goto out; } - new_sk_storage = rcu_dereference(copy_selem->sk_storage); + new_sk_storage = + rcu_dereference(copy_selem->local_storage); } bpf_map_put(map); } @@ -879,12 +256,12 @@ out: BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags) { - struct bpf_sk_storage_data *sdata; + struct bpf_local_storage_data *sdata; - if (flags > BPF_SK_STORAGE_GET_F_CREATE) + if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; - sdata = sk_storage_lookup(sk, map, true); + sdata = bpf_sk_storage_lookup(sk, map, true); if (sdata) return (unsigned long)sdata->data; @@ -895,7 +272,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, * destruction). */ refcount_inc_not_zero(&sk->sk_refcnt)) { - sdata = sk_storage_update(sk, map, value, BPF_NOEXIST); + sdata = bpf_local_storage_update( + sk, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. */ @@ -909,10 +288,13 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) { + if (!sk || !sk_fullsock(sk)) + return -EINVAL; + if (refcount_inc_not_zero(&sk->sk_refcnt)) { int err; - err = sk_storage_delete(sk, map); + err = bpf_sk_storage_del(sk, map); sock_put(sk); return err; } @@ -920,18 +302,53 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) return -ENOENT; } +static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, + void *owner, u32 size) +{ + struct sock *sk = (struct sock *)owner; + + /* same check as in sock_kmalloc() */ + if (size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + atomic_add(size, &sk->sk_omem_alloc); + return 0; + } + + return -ENOMEM; +} + +static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap, + void *owner, u32 size) +{ + struct sock *sk = owner; + + atomic_sub(size, &sk->sk_omem_alloc); +} + +static struct bpf_local_storage __rcu ** +bpf_sk_storage_ptr(void *owner) +{ + struct sock *sk = owner; + + return &sk->sk_bpf_storage; +} + static int sk_storage_map_btf_id; const struct bpf_map_ops sk_storage_map_ops = { - .map_alloc_check = bpf_sk_storage_map_alloc_check, + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, .map_alloc = bpf_sk_storage_map_alloc, .map_free = bpf_sk_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, - .map_check_btf = bpf_sk_storage_map_check_btf, - .map_btf_name = "bpf_sk_storage_map", + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", .map_btf_id = &sk_storage_map_btf_id, + .map_local_storage_charge = 
bpf_sk_storage_charge, + .map_local_storage_uncharge = bpf_sk_storage_uncharge, + .map_owner_storage_ptr = bpf_sk_storage_ptr, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { @@ -939,7 +356,7 @@ const struct bpf_func_proto bpf_sk_storage_get_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_SOCKET, + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; @@ -959,7 +376,81 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_SOCKET, + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + +static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) +{ + const struct btf *btf_vmlinux; + const struct btf_type *t; + const char *tname; + u32 btf_id; + + if (prog->aux->dst_prog) + return false; + + /* Ensure the tracing program is not tracing + * any bpf_sk_storage*() function and also + * use the bpf_sk_storage_(get|delete) helper. + */ + switch (prog->expected_attach_type) { + case BPF_TRACE_ITER: + case BPF_TRACE_RAW_TP: + /* bpf_sk_storage has no trace point */ + return true; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + btf_vmlinux = bpf_get_btf_vmlinux(); + btf_id = prog->aux->attach_btf_id; + t = btf_type_by_id(btf_vmlinux, btf_id); + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + return !!strncmp(tname, "bpf_sk_storage", + strlen("bpf_sk_storage")); + default: + return false; + } + + return false; +} + +BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags) +{ + if (in_irq() || in_nmi()) + return (unsigned long)NULL; + + return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); +} + +BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, + struct sock *, sk) +{ + if (in_irq() || in_nmi()) + return -EPERM; + + return ____bpf_sk_storage_delete(map, sk); +} + +const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { + .func = bpf_sk_storage_get_tracing, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .allowed = bpf_sk_storage_tracing_allowed, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { + .func = bpf_sk_storage_delete_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .allowed = bpf_sk_storage_tracing_allowed, }; struct bpf_sk_storage_diag { @@ -1022,7 +513,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) u32 nr_maps = 0; int rem, err; - /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as + /* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. 
*/ if (!bpf_capable()) @@ -1072,13 +563,13 @@ err_free: } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); -static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb) +static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) { struct nlattr *nla_stg, *nla_value; - struct bpf_sk_storage_map *smap; + struct bpf_local_storage_map *smap; /* It cannot exceed max nlattr's payload */ - BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE); + BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE); nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE); if (!nla_stg) @@ -1114,9 +605,9 @@ static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, { /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ unsigned int diag_size = nla_total_size(0); - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_elem *selem; - struct bpf_sk_storage_map *smap; + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; struct nlattr *nla_stgs; unsigned int saved_len; int err = 0; @@ -1169,8 +660,8 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, { /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ unsigned int diag_size = nla_total_size(0); - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_data *sdata; + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_data *sdata; struct nlattr *nla_stgs; unsigned int saved_len; int err = 0; @@ -1197,8 +688,8 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, saved_len = skb->len; for (i = 0; i < diag->nr_maps; i++) { - sdata = __sk_storage_lookup(sk_storage, - (struct bpf_sk_storage_map *)diag->maps[i], + sdata = bpf_local_storage_lookup(sk_storage, + (struct bpf_local_storage_map *)diag->maps[i], false); if (!sdata) @@ -1235,19 +726,20 @@ struct bpf_iter_seq_sk_storage_map_info { unsigned skip_elems; }; -static struct bpf_sk_storage_elem * +static struct bpf_local_storage_elem * bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, - struct bpf_sk_storage_elem *prev_selem) + struct bpf_local_storage_elem *prev_selem) + __acquires(RCU) __releases(RCU) { - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_elem *selem; + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_elem *selem; u32 skip_elems = info->skip_elems; - struct bpf_sk_storage_map *smap; + struct bpf_local_storage_map *smap; u32 bucket_id = info->bucket_id; u32 i, count, n_buckets; - struct bucket *b; + struct bpf_local_storage_map_bucket *b; - smap = (struct bpf_sk_storage_map *)info->map; + smap = (struct bpf_local_storage_map *)info->map; n_buckets = 1U << smap->bucket_log; if (bucket_id >= n_buckets) return NULL; @@ -1256,16 +748,16 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, selem = prev_selem; count = 0; while (selem) { - selem = hlist_entry_safe(selem->map_node.next, - struct bpf_sk_storage_elem, map_node); + selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)), + struct bpf_local_storage_elem, map_node); if (!selem) { /* not found, unlock and go to the next bucket */ b = &smap->buckets[bucket_id++]; - raw_spin_unlock_bh(&b->lock); + rcu_read_unlock(); skip_elems = 0; break; } - sk_storage = rcu_dereference_raw(selem->sk_storage); + sk_storage = rcu_dereference(selem->local_storage); if (sk_storage) { info->skip_elems = skip_elems + count; return selem; @@ -1275,10 +767,10 @@ bpf_sk_storage_map_seq_find_next(struct 
bpf_iter_seq_sk_storage_map_info *info, for (i = bucket_id; i < (1U << smap->bucket_log); i++) { b = &smap->buckets[i]; - raw_spin_lock_bh(&b->lock); + rcu_read_lock(); count = 0; - hlist_for_each_entry(selem, &b->list, map_node) { - sk_storage = rcu_dereference_raw(selem->sk_storage); + hlist_for_each_entry_rcu(selem, &b->list, map_node) { + sk_storage = rcu_dereference(selem->local_storage); if (sk_storage && count >= skip_elems) { info->bucket_id = i; info->skip_elems = count; @@ -1286,7 +778,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, } count++; } - raw_spin_unlock_bh(&b->lock); + rcu_read_unlock(); skip_elems = 0; } @@ -1297,7 +789,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos) { - struct bpf_sk_storage_elem *selem; + struct bpf_local_storage_elem *selem; selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL); if (!selem) @@ -1330,11 +822,11 @@ DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta, void *value) static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, - struct bpf_sk_storage_elem *selem) + struct bpf_local_storage_elem *selem) { struct bpf_iter_seq_sk_storage_map_info *info = seq->private; struct bpf_iter__bpf_sk_storage_map ctx = {}; - struct bpf_sk_storage *sk_storage; + struct bpf_local_storage *sk_storage; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret = 0; @@ -1345,8 +837,8 @@ static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, ctx.meta = &meta; ctx.map = info->map; if (selem) { - sk_storage = rcu_dereference_raw(selem->sk_storage); - ctx.sk = sk_storage->sk; + sk_storage = rcu_dereference(selem->local_storage); + ctx.sk = sk_storage->owner; ctx.value = SDATA(selem)->data; } ret = bpf_iter_run_prog(prog, &ctx); @@ -1361,18 +853,12 @@ static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v) } static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { - struct bpf_iter_seq_sk_storage_map_info *info = seq->private; - struct bpf_sk_storage_map *smap; - struct bucket *b; - - if (!v) { + if (!v) (void)__bpf_sk_storage_map_seq_show(seq, v); - } else { - smap = (struct bpf_sk_storage_map *)info->map; - b = &smap->buckets[info->bucket_id]; - raw_spin_unlock_bh(&b->lock); - } + else + rcu_read_unlock(); } static int bpf_iter_init_sk_storage_map(void *priv_data, @@ -1437,6 +923,8 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { .target = "bpf_sk_storage_map", .attach_target = bpf_iter_attach_map, .detach_target = bpf_iter_detach_map, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), diff --git a/net/core/datagram.c b/net/core/datagram.c index 639745d4f3b9..15ab9ffb27fe 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -623,10 +623,11 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, while (length && iov_iter_count(from)) { struct page *pages[MAX_SKB_FRAGS]; + struct page *last_head = NULL; size_t start; ssize_t copied; unsigned long truesize; - int n = 0; + int refs, n = 0; if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; @@ -649,13 +650,37 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } - while (copied) { + for (refs = 0; copied != 0; start = 0) { int size = min_t(int, copied, 
PAGE_SIZE - start); - skb_fill_page_desc(skb, frag++, pages[n], start, size); - start = 0; + struct page *head = compound_head(pages[n]); + + start += (pages[n] - head) << PAGE_SHIFT; copied -= size; n++; + if (frag) { + skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1]; + + if (head == skb_frag_page(last) && + start == skb_frag_off(last) + skb_frag_size(last)) { + skb_frag_size_add(last, size); + /* We combined this page, we need to release + * a reference. Since compound pages refcount + * is shared among many pages, batch the refcount + * adjustments to limit false sharing. + */ + last_head = head; + refs++; + continue; + } + } + if (refs) { + page_ref_sub(last_head, refs); + refs = 0; + } + skb_fill_page_desc(skb, frag++, head, start, size); } + if (refs) + page_ref_sub(last_head, refs); } return 0; } @@ -684,7 +709,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) EXPORT_SYMBOL(zerocopy_sg_from_iter); /** - * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator + * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator * and update a checksum. * @skb: buffer to copy * @offset: offset in the buffer to start copying from @@ -696,8 +721,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, __wsum *csump) { - return __skb_datagram_iter(skb, offset, to, len, true, - csum_and_copy_to_iter, csump); + struct csum_state csdata = { .csum = *csump }; + int ret; + + ret = __skb_datagram_iter(skb, offset, to, len, true, + csum_and_copy_to_iter, &csdata); + if (ret) + return ret; + + *csump = csdata.csum; + return 0; } /** diff --git a/net/core/dev.c b/net/core/dev.c index 4906b44af850..6c5967e80132 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -91,6 +91,7 @@ #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/skbuff.h> +#include <linux/kthread.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <net/net_namespace.h> @@ -98,8 +99,10 @@ #include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> +#include <net/dsa.h> #include <net/dst.h> #include <net/dst_metadata.h> +#include <net/gro.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> @@ -144,6 +147,7 @@ #include <linux/indirect_call_wrapper.h> #include <net/devlink.h> #include <linux/pm_runtime.h> +#include <linux/prandom.h> #include "net-sysfs.h" @@ -1067,19 +1071,6 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); -struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) -{ - struct net_device *dev; - - ASSERT_RTNL(); - for_each_netdev(net, dev) - if (dev->type == type) - return dev; - - return NULL; -} -EXPORT_SYMBOL(__dev_getfirstbyhwtype); - struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; @@ -1130,7 +1121,7 @@ EXPORT_SYMBOL(__dev_get_by_flags); * @name: name string * * Network device names need to be valid file names to - * to allow sysfs to work. We also disallow any kind of + * allow sysfs to work. We also disallow any kind of * whitespace. */ bool dev_valid_name(const char *name) @@ -1468,6 +1459,25 @@ void netdev_state_change(struct net_device *dev) EXPORT_SYMBOL(netdev_state_change); /** + * __netdev_notify_peers - notify network peers about existence of @dev, + * to be called when rtnl lock is already held. 
+ * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. + */ +void __netdev_notify_peers(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); +} +EXPORT_SYMBOL(__netdev_notify_peers); + +/** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * @@ -1480,12 +1490,32 @@ EXPORT_SYMBOL(netdev_state_change); void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); - call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); + __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); +static int napi_threaded_poll(void *data); + +static int napi_kthread_create(struct napi_struct *n) +{ + int err = 0; + + /* Create and wake up the kthread once to put it in + * TASK_INTERRUPTIBLE mode to avoid the blocked task + * warning and work with loadavg. + */ + n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", + n->dev->name, n->napi_id); + if (IS_ERR(n->thread)) { + err = PTR_ERR(n->thread); + pr_err("kthread_run failed with err %d\n", err); + n->thread = NULL; + } + + return err; +} + static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; @@ -2187,28 +2217,14 @@ static inline void net_timestamp_set(struct sk_buff *skb) bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { - unsigned int len; - - if (!(dev->flags & IFF_UP)) - return false; - - len = dev->mtu + dev->hard_header_len + VLAN_HLEN; - if (skb->len <= len) - return true; - - /* if TSO is enabled, we don't care about the length as the packet - * could be forwarded without being segmented before - */ - if (skb_is_gso(skb)) - return true; - - return false; + return __is_skb_forwardable(dev, skb, true); } EXPORT_SYMBOL_GPL(is_skb_forwardable); -int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, + bool check_mtu) { - int ret = ____dev_forward_skb(dev, skb); + int ret = ____dev_forward_skb(dev, skb, check_mtu); if (likely(!ret)) { skb->protocol = eth_type_trans(skb, dev); @@ -2217,6 +2233,11 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) return ret; } + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, true); +} EXPORT_SYMBOL_GPL(__dev_forward_skb); /** @@ -2243,6 +2264,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_forward_skb); +int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); +} + static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) @@ -3204,7 +3230,7 @@ int skb_checksum_help(struct sk_buff *skb) if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; - if (unlikely(skb_shinfo(skb)->gso_size)) { + if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } @@ -3493,6 +3519,11 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (gso_segs > 
dev->gso_max_segs) return features & ~NETIF_F_GSO_MASK; + if (!skb_shinfo(skb)->gso_type) { + skb_warn_bad_offload(skb); + return features & ~NETIF_F_GSO_MASK; + } + /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now @@ -3557,6 +3588,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, dev_queue_xmit_nit(skb, dev); len = skb->len; + PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies); trace_net_dev_start_xmit(skb, dev); rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); @@ -3604,11 +3636,22 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features) { - if (unlikely(skb->csum_not_inet)) + if (unlikely(skb_csum_is_sctp(skb))) return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb); - return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb); + if (features & NETIF_F_HW_CSUM) + return 0; + + if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { + switch (skb->csum_offset) { + case offsetof(struct tcphdr, check): + case offsetof(struct udphdr, check): + return 0; + } + } + + return skb_checksum_help(skb); } EXPORT_SYMBOL(skb_csum_hwoffload_help); @@ -3864,6 +3907,8 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return skb; /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ + qdisc_skb_cb(skb)->mru = 0; + qdisc_skb_cb(skb)->post_ct = false; mini_qdisc_bstats_cpu_update(miniq, skb); switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { @@ -4069,7 +4114,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_reset_mac_header(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) - __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. 
@@ -4129,6 +4174,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (!skb) goto out; + PRANDOM_ADD_NOISE(skb, dev, txq, jiffies); HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { @@ -4176,7 +4222,7 @@ int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) } EXPORT_SYMBOL(dev_queue_xmit_accel); -int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; @@ -4194,6 +4240,7 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) skb_set_queue_mapping(skb, queue_id); txq = skb_get_tx_queue(dev, skb); + PRANDOM_ADD_NOISE(skb, dev, txq, jiffies); local_bh_disable(); @@ -4205,17 +4252,13 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) dev_xmit_recursion_dec(); local_bh_enable(); - - if (!dev_xmit_complete(ret)) - kfree_skb(skb); - return ret; drop: atomic_long_inc(&dev->tx_dropped); kfree_skb_list(skb); return NET_XMIT_DROP; } -EXPORT_SYMBOL(dev_direct_xmit); +EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines @@ -4240,6 +4283,22 @@ int gro_normal_batch __read_mostly = 8; static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { + struct task_struct *thread; + + if (test_bit(NAPI_STATE_THREADED, &napi->state)) { + /* Paired with smp_mb__before_atomic() in + * napi_enable()/dev_set_threaded(). + * Use READ_ONCE() to guarantee a complete + * read on napi->thread. Only call + * wake_up_process() when it's not NULL. + */ + thread = READ_ONCE(napi->thread); + if (thread) { + wake_up_process(thread); + return; + } + } + list_add_tail(&napi->poll_list, &sd->poll_list); __raise_softirq_irqoff(NET_RX_SOFTIRQ); } @@ -4591,14 +4650,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; - void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; + u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; bool orig_bcast; - int hlen, off; - u32 mac_len; + int off; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. @@ -4630,15 +4689,16 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, * header. 
*/ mac_len = skb->data - skb_mac_header(skb); - hlen = skb_headlen(skb) + mac_len; - xdp->data = skb->data - mac_len; - xdp->data_meta = xdp->data; - xdp->data_end = xdp->data + hlen; - xdp->data_hard_start = skb->data - skb_headroom(skb); + hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ - xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; - xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_sz = (void *)skb_end_pointer(skb) - hard_start; + frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + rxqueue = netif_get_rxqueue(skb); + xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); + xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, + skb_headlen(skb) + mac_len, true); orig_data_end = xdp->data_end; orig_data = xdp->data; @@ -4646,9 +4706,6 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; - rxqueue = netif_get_rxqueue(skb); - xdp->rxq = &rxqueue->xdp_rxq; - act = bpf_prog_run_xdp(xdp_prog, xdp); /* check if bpf_xdp_adjust_head was used */ @@ -4840,6 +4897,21 @@ int netif_rx_ni(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx_ni); +int netif_rx_any_context(struct sk_buff *skb) +{ + /* + * If invoked from contexts which do not invoke bottom half + * processing either at return from interrupt or when softrqs are + * reenabled, use netif_rx_ni() which invokes bottomhalf processing + * directly. + */ + if (in_interrupt()) + return netif_rx(skb); + else + return netif_rx_ni(skb); +} +EXPORT_SYMBOL(netif_rx_any_context); + static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -4868,8 +4940,6 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) else __kfree_skb_defer(skb); } - - __kfree_skb_flush(); } if (sd->output_queue) { @@ -4914,7 +4984,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); static inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) + struct net_device *orig_dev, bool *another) { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); @@ -4934,6 +5004,8 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, } qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_skb_cb(skb)->mru = 0; + qdisc_skb_cb(skb)->post_ct = false; skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); @@ -4958,7 +5030,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, * redirecting to another netdev */ __skb_push(skb, skb->mac_len); - skb_do_redirect(skb); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } return NULL; case TC_ACT_CONSUMED: return NULL; @@ -5119,8 +5195,7 @@ another_round: skb_reset_mac_len(skb); } - if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || - skb->protocol == cpu_to_be16(ETH_P_8021AD)) { + if (eth_type_vlan(skb->protocol)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; @@ -5147,7 +5222,12 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { - skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); + bool another = false; + + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, + &another); + if (another) + goto another_round; if (!skb) goto out; @@ -5192,15 +5272,14 @@ 
skip_classify: } } - if (unlikely(skb_vlan_tag_present(skb))) { + if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { check_vlan_id: if (skb_vlan_tag_get_id(skb)) { /* Vlan id is non 0 and vlan_do_receive() above couldn't * find vlan device. */ skb->pkt_type = PACKET_OTHERHOST; - } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || - skb->protocol == cpu_to_be16(ETH_P_8021AD)) { + } else if (eth_type_vlan(skb->protocol)) { /* Outer header is 802.1P with vlan 0, inner header is * 802.1Q or 802.1AD and vlan_do_receive() above could * not find vlan dev for vlan id 0. @@ -5441,15 +5520,20 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) if (new) { u32 i; + mutex_lock(&new->aux->used_maps_mutex); + /* generic XDP does not work with DEVMAPs that can * have a bpf_prog installed on an entry */ for (i = 0; i < new->aux->used_map_cnt; i++) { - if (dev_map_can_have_prog(new->aux->used_maps[i])) - return -EINVAL; - if (cpu_map_prog_allowed(new->aux->used_maps[i])) + if (dev_map_can_have_prog(new->aux->used_maps[i]) || + cpu_map_prog_allowed(new->aux->used_maps[i])) { + mutex_unlock(&new->aux->used_maps_mutex); return -EINVAL; + } } + + mutex_unlock(&new->aux->used_maps_mutex); } switch (xdp->command) { @@ -5621,17 +5705,60 @@ static void flush_backlog(struct work_struct *work) local_bh_enable(); } +static bool flush_required(int cpu) +{ +#if IS_ENABLED(CONFIG_RPS) + struct softnet_data *sd = &per_cpu(softnet_data, cpu); + bool do_flush; + + local_irq_disable(); + rps_lock(sd); + + /* as insertion into process_queue happens with the rps lock held, + * process_queue access may race only with dequeue + */ + do_flush = !skb_queue_empty(&sd->input_pkt_queue) || + !skb_queue_empty_lockless(&sd->process_queue); + rps_unlock(sd); + local_irq_enable(); + + return do_flush; +#endif + /* without RPS we can't safely check input_pkt_queue: during a + * concurrent remote skb_queue_splice() we can detect as empty both + * input_pkt_queue and process_queue even if the latter could end-up + * containing a lot of packets. + */ + return true; +} + static void flush_all_backlogs(void) { + static cpumask_t flush_cpus; unsigned int cpu; + /* since we are under rtnl lock protection we can use static data + * for the cpumask and avoid allocating on stack the possibly + * large mask + */ + ASSERT_RTNL(); + get_online_cpus(); - for_each_online_cpu(cpu) - queue_work_on(cpu, system_highpri_wq, - per_cpu_ptr(&flush_works, cpu)); + cpumask_clear(&flush_cpus); + for_each_online_cpu(cpu) { + if (flush_required(cpu)) { + queue_work_on(cpu, system_highpri_wq, + per_cpu_ptr(&flush_works, cpu)); + cpumask_set_cpu(cpu, &flush_cpus); + } + } - for_each_online_cpu(cpu) + /* we can have in flight packet[s] on the cpus we are not flushing, + * synchronize_net() in unregister_netdevice_many() will take care of + * them + */ + for_each_cpu(cpu, &flush_cpus) flush_work(per_cpu_ptr(&flush_works, cpu)); put_online_cpus(); @@ -5650,15 +5777,14 @@ static void gro_normal_list(struct napi_struct *napi) /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. 
*/ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) { list_add_tail(&skb->list, &napi->rx_list); - if (++napi->rx_count >= gro_normal_batch) + napi->rx_count += segs; + if (napi->rx_count >= gro_normal_batch) gro_normal_list(napi); } -INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); -INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { struct packet_offload *ptype; @@ -5692,7 +5818,7 @@ static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) } out: - gro_normal_one(napi, skb); + gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); return NET_RX_SUCCESS; } @@ -5827,10 +5953,6 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) napi_gro_complete(napi, oldest); } -INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, - struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, - struct sk_buff *)); static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); @@ -5969,31 +6091,20 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); -static void napi_skb_free_stolen_head(struct sk_buff *skb) -{ - skb_dst_drop(skb); - skb_ext_put(skb); - kmem_cache_free(skbuff_head_cache, skb); -} - static gro_result_t napi_skb_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) { switch (ret) { case GRO_NORMAL: - gro_normal_one(napi, skb); - break; - - case GRO_DROP: - kfree_skb(skb); + gro_normal_one(napi, skb, 1); break; case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); else - __kfree_skb(skb); + __kfree_skb_defer(skb); break; case GRO_HELD: @@ -6070,11 +6181,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) - gro_normal_one(napi, skb); - break; - - case GRO_DROP: - napi_reuse_skb(napi, skb); + gro_normal_one(napi, skb, 1); break; case GRO_MERGED_FREE: @@ -6138,9 +6245,6 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); - if (!skb) - return GRO_DROP; - trace_napi_gro_frags_entry(skb); ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); @@ -6293,7 +6397,7 @@ EXPORT_SYMBOL(__napi_schedule); * @n: napi context * * Test if NAPI routine is already running, and if not mark - * it as running. This is used as a condition variable + * it as running. This is used as a condition variable to * insure only one NAPI poll instance runs. We also make * sure there is no pending NAPI disable. */ @@ -6381,7 +6485,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done) WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); - new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. 
@@ -6418,10 +6523,30 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #if defined(CONFIG_NET_RX_BUSY_POLL) -#define BUSY_POLL_BUDGET 8 +static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +{ + if (!skip_schedule) { + gro_normal_list(napi); + __napi_schedule(napi); + return; + } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) + if (napi->gro_bitmask) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + napi_gro_flush(napi, HZ >= 1000); + } + + gro_normal_list(napi); + clear_bit(NAPI_STATE_SCHED, &napi->state); +} + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, + u16 budget) { + bool skip_schedule = false; + unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6438,29 +6563,33 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) local_bh_disable(); + if (prefer_busy_poll) { + napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); + timeout = READ_ONCE(napi->dev->gro_flush_timeout); + if (napi->defer_hard_irqs_count && timeout) { + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); + skip_schedule = true; + } + } + /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ - rc = napi->poll(napi, BUSY_POLL_BUDGET); + rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. */ - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) { - /* As the whole budget was spent, we still own the napi so can - * safely handle the rx_list. - */ - gro_normal_list(napi); - __napi_schedule(napi); - } + if (rc == budget) + __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg) + void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6488,17 +6617,23 @@ restart: * we avoid dirtying napi->state as much as we can. 
*/ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | - NAPIF_STATE_IN_BUSY_POLL)) + NAPIF_STATE_IN_BUSY_POLL)) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | - NAPIF_STATE_SCHED) != val) + NAPIF_STATE_SCHED) != val) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - work = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, work, BUSY_POLL_BUDGET); + work = napi_poll(napi, budget); + trace_napi_poll(napi, work, budget); gro_normal_list(napi); count: if (work > 0) @@ -6511,7 +6646,7 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6522,7 +6657,7 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); out: rcu_read_unlock(); @@ -6533,8 +6668,7 @@ EXPORT_SYMBOL(napi_busy_loop); static void napi_hash_add(struct napi_struct *napi) { - if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || - test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return; spin_lock(&napi_hash_lock); @@ -6555,20 +6689,14 @@ static void napi_hash_add(struct napi_struct *napi) /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ -bool napi_hash_del(struct napi_struct *napi) +static void napi_hash_del(struct napi_struct *napi) { - bool rcu_sync_needed = false; - spin_lock(&napi_hash_lock); - if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { - rcu_sync_needed = true; - hlist_del_rcu(&napi->napi_hash_node); - } + hlist_del_init_rcu(&napi->napi_hash_node); + spin_unlock(&napi_hash_lock); - return rcu_sync_needed; } -EXPORT_SYMBOL_GPL(napi_hash_del); static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) { @@ -6580,8 +6708,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && - !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); + } return HRTIMER_NORESTART; } @@ -6597,10 +6727,57 @@ static void init_gro_hash(struct napi_struct *napi) napi->gro_bitmask = 0; } +int dev_set_threaded(struct net_device *dev, bool threaded) +{ + struct napi_struct *napi; + int err = 0; + + if (dev->threaded == threaded) + return 0; + + if (threaded) { + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (!napi->thread) { + err = napi_kthread_create(napi); + if (err) { + threaded = false; + break; + } + } + } + } + + dev->threaded = threaded; + + /* Make sure kthread is created before THREADED bit + * is set. + */ + smp_mb__before_atomic(); + + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. 
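+ *
+ * Illustrative use (not part of this patch; names are hypothetical): a
+ * driver that wants its NAPI instances serviced by dedicated kthreads
+ * could, once all of its netif_napi_add() calls are done, simply do
+ *
+ *	err = dev_set_threaded(netdev, true);
+ *
+ * and treat an error as "stay in softirq mode", since dev->threaded is
+ * left cleared when kthread creation fails.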
+ */ + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (threaded) + set_bit(NAPI_STATE_THREADED, &napi->state); + else + clear_bit(NAPI_STATE_THREADED, &napi->state); + } + + return err; +} + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { + if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) + return; + INIT_LIST_HEAD(&napi->poll_list); + INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; init_gro_hash(napi); @@ -6620,6 +6797,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); napi_hash_add(napi); + /* Create kthread for this napi if dev->threaded is set. + * Clear dev->threaded if kthread creation failed so that + * threaded mode will not be enabled in napi_enable(). + */ + if (dev->threaded && napi_kthread_create(napi)) + dev->threaded = 0; } EXPORT_SYMBOL(netif_napi_add); @@ -6635,10 +6818,30 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); + clear_bit(NAPI_STATE_THREADED, &n->state); } EXPORT_SYMBOL(napi_disable); +/** + * napi_enable - enable NAPI scheduling + * @n: NAPI context + * + * Resume NAPI from being scheduled on this context. + * Must be paired with napi_disable. + */ +void napi_enable(struct napi_struct *n) +{ + BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + smp_mb__before_atomic(); + clear_bit(NAPI_STATE_SCHED, &n->state); + clear_bit(NAPI_STATE_NPSVC, &n->state); + if (n->dev->threaded && n->thread) + set_bit(NAPI_STATE_THREADED, &n->state); +} +EXPORT_SYMBOL(napi_enable); + static void flush_gro_hash(struct napi_struct *napi) { int i; @@ -6653,28 +6856,29 @@ static void flush_gro_hash(struct napi_struct *napi) } /* Must be called in process context */ -void netif_napi_del(struct napi_struct *napi) +void __netif_napi_del(struct napi_struct *napi) { - might_sleep(); - if (napi_hash_del(napi)) - synchronize_net(); - list_del_init(&napi->dev_list); + if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) + return; + + napi_hash_del(napi); + list_del_rcu(&napi->dev_list); napi_free_frags(napi); flush_gro_hash(napi); napi->gro_bitmask = 0; + + if (napi->thread) { + kthread_stop(napi->thread); + napi->thread = NULL; + } } -EXPORT_SYMBOL(netif_napi_del); +EXPORT_SYMBOL(__netif_napi_del); -static int napi_poll(struct napi_struct *n, struct list_head *repoll) +static int __napi_poll(struct napi_struct *n, bool *repoll) { - void *have; int work, weight; - list_del_init(&n->poll_list); - - have = netpoll_poll_lock(n); - weight = n->weight; /* This NAPI_STATE_SCHED test is for avoiding a race @@ -6694,7 +6898,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) n->poll, work, weight); if (likely(work < weight)) - goto out_unlock; + return work; /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code @@ -6703,7 +6907,20 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) */ if (unlikely(napi_disable_pending(n))) { napi_complete(n); - goto out_unlock; + return work; + } + + /* The NAPI context has more processing work, but busy-polling + * is preferred. Exit early. 
+ */ + if (napi_prefer_busy_poll(n)) { + if (napi_complete_done(n, work)) { + /* If timeout is not set, we need to make sure + * that the NAPI is re-scheduled. + */ + napi_schedule(n); + } + return work; } if (n->gro_bitmask) { @@ -6721,17 +6938,78 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) if (unlikely(!list_empty(&n->poll_list))) { pr_warn_once("%s: Budget exhausted after napi rescheduled\n", n->dev ? n->dev->name : "backlog"); - goto out_unlock; + return work; } - list_add_tail(&n->poll_list, repoll); + *repoll = true; + + return work; +} + +static int napi_poll(struct napi_struct *n, struct list_head *repoll) +{ + bool do_repoll = false; + void *have; + int work; + + list_del_init(&n->poll_list); + + have = netpoll_poll_lock(n); + + work = __napi_poll(n, &do_repoll); + + if (do_repoll) + list_add_tail(&n->poll_list, repoll); -out_unlock: netpoll_poll_unlock(have); return work; } +static int napi_thread_wait(struct napi_struct *napi) +{ + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop() && !napi_disable_pending(napi)) { + if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + WARN_ON(!list_empty(&napi->poll_list)); + __set_current_state(TASK_RUNNING); + return 0; + } + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return -1; +} + +static int napi_threaded_poll(void *data) +{ + struct napi_struct *napi = data; + void *have; + + while (!napi_thread_wait(napi)) { + for (;;) { + bool repoll = false; + + local_bh_disable(); + + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); + + local_bh_enable(); + + if (!repoll) + break; + + cond_resched(); + } + } + return 0; +} + static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -6750,7 +7028,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) if (list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) - goto out; + return; break; } @@ -6777,8 +7055,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); -out: - __kfree_skb_flush(); } struct netdev_adjacent { @@ -6844,7 +7120,7 @@ bool netdev_has_upper_dev(struct net_device *dev, EXPORT_SYMBOL(netdev_has_upper_dev); /** - * netdev_has_upper_dev_all - Check if device is linked to an upper device + * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * @@ -7990,6 +8266,39 @@ struct net_device *netdev_get_xmit_slave(struct net_device *dev, } EXPORT_SYMBOL(netdev_get_xmit_slave); +static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, + struct sock *sk) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_sk_get_lower_dev) + return NULL; + return ops->ndo_sk_get_lower_dev(dev, sk); +} + +/** + * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket + * @dev: device + * @sk: the socket + * + * %NULL is returned if no lower device is found. 
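+ *
+ * Illustrative use (not part of this patch; names are hypothetical): an
+ * offload that needs the device which will actually carry traffic for a
+ * socket on a stacked setup (e.g. TLS over a bond) could call
+ *
+ *	lowest = netdev_sk_get_lowest_dev(bond_dev, sk);
+ *
+ * which follows ndo_sk_get_lower_dev() hop by hop down the stack.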
+ */ + +struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, + struct sock *sk) +{ + struct net_device *lower; + + lower = netdev_sk_get_lower_dev(dev, sk); + while (lower) { + dev = lower; + lower = netdev_sk_get_lower_dev(dev, sk); + } + + return dev; +} +EXPORT_SYMBOL(netdev_sk_get_lowest_dev); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; @@ -8082,7 +8391,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private); /** - * netdev_lower_change - Dispatch event about lower device state change + * netdev_lower_state_changed - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * @@ -8612,6 +8921,48 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, } EXPORT_SYMBOL(dev_set_mac_address); +static DECLARE_RWSEM(dev_addr_sem); + +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack) +{ + int ret; + + down_write(&dev_addr_sem); + ret = dev_set_mac_address(dev, sa, extack); + up_write(&dev_addr_sem); + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address_user); + +int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) +{ + size_t size = sizeof(sa->sa_data); + struct net_device *dev; + int ret = 0; + + down_read(&dev_addr_sem); + rcu_read_lock(); + + dev = dev_get_by_name_rcu(net, dev_name); + if (!dev) { + ret = -ENODEV; + goto unlock; + } + if (!dev->addr_len) + memset(sa->sa_data, 0, size); + else + memcpy(sa->sa_data, dev->dev_addr, + min_t(size_t, size, dev->addr_len)); + sa->sa_family = dev->type; + +unlock: + rcu_read_unlock(); + up_read(&dev_addr_sem); + return ret; +} +EXPORT_SYMBOL(dev_get_mac_address); + /** * dev_change_carrier - Change device carrier * @dev: device @@ -8827,7 +9178,7 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) return dev->netdev_ops->ndo_bpf; default: return NULL; - }; + } } static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, @@ -8846,6 +9197,17 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev, return dev->xdp_state[mode].prog; } +static u8 dev_xdp_prog_count(struct net_device *dev) +{ + u8 count = 0; + int i; + + for (i = 0; i < __MAX_XDP_MODE; i++) + if (dev->xdp_state[i].prog || dev->xdp_state[i].link) + count++; + return count; +} + u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); @@ -8936,6 +9298,7 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack struct bpf_xdp_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog, u32 flags) { + unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; @@ -8951,11 +9314,17 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); return -EINVAL; } - /* just one XDP mode bit should be set, zero defaults to SKB mode */ - if (hweight32(flags & XDP_FLAGS_MODES) > 1) { + /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ + if (num_modes > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); return -EINVAL; } + /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ + if (!num_modes && dev_xdp_prog_count(dev) > 1) { + NL_SET_ERR_MSG(extack, + "More than one program loaded, unset mode is ambiguous"); + return -EINVAL; + } /* old_prog != NULL 
implies XDP_FLAGS_REPLACE is set */ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); @@ -9293,106 +9662,6 @@ static void net_set_todo(struct net_device *dev) dev_net(dev)->dev_unreg_count++; } -static void rollback_registered_many(struct list_head *head) -{ - struct net_device *dev, *tmp; - LIST_HEAD(close_head); - - BUG_ON(dev_boot_phase); - ASSERT_RTNL(); - - list_for_each_entry_safe(dev, tmp, head, unreg_list) { - /* Some devices call without registering - * for initialization unwind. Remove those - * devices and proceed with the remaining. - */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - pr_debug("unregister_netdevice: device %s/%p never was registered\n", - dev->name, dev); - - WARN_ON(1); - list_del(&dev->unreg_list); - continue; - } - dev->dismantle = true; - BUG_ON(dev->reg_state != NETREG_REGISTERED); - } - - /* If device is running, close it first. */ - list_for_each_entry(dev, head, unreg_list) - list_add_tail(&dev->close_list, &close_head); - dev_close_many(&close_head, true); - - list_for_each_entry(dev, head, unreg_list) { - /* And unlink it from device chain. */ - unlist_netdevice(dev); - - dev->reg_state = NETREG_UNREGISTERING; - } - flush_all_backlogs(); - - synchronize_net(); - - list_for_each_entry(dev, head, unreg_list) { - struct sk_buff *skb = NULL; - - /* Shutdown queueing discipline. */ - dev_shutdown(dev); - - dev_xdp_uninstall(dev); - - /* Notify protocols, that we are about to destroy - * this device. They should clean all the things. - */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL, NULL, 0); - - /* - * Flush the unicast and multicast chains - */ - dev_uc_flush(dev); - dev_mc_flush(dev); - - netdev_name_node_alt_flush(dev); - netdev_name_node_free(dev->name_node); - - if (dev->netdev_ops->ndo_uninit) - dev->netdev_ops->ndo_uninit(dev); - - if (skb) - rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); - - /* Notifier chain MUST detach us all upper devices. 
*/ - WARN_ON(netdev_has_any_upper_dev(dev)); - WARN_ON(netdev_has_any_lower_dev(dev)); - - /* Remove entries from kobject tree */ - netdev_unregister_kobject(dev); -#ifdef CONFIG_XPS - /* Remove XPS queueing entries */ - netif_reset_xps_queues_gt(dev, 0); -#endif - } - - synchronize_net(); - - list_for_each_entry(dev, head, unreg_list) - dev_put(dev); -} - -static void rollback_registered(struct net_device *dev) -{ - LIST_HEAD(single); - - list_add(&dev->unreg_list, &single); - rollback_registered_many(&single); - list_del(&single); -} - static netdev_features_t netdev_sync_upper_features(struct net_device *lower, struct net_device *upper, netdev_features_t features) { @@ -9513,6 +9782,22 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } + if (features & NETIF_F_HW_TLS_TX) { + bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == + (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); + bool hw_csum = features & NETIF_F_HW_CSUM; + + if (!ip_csum && !hw_csum) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; + } + } + + if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { + netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_RX; + } + return features; } @@ -9533,7 +9818,7 @@ int __netdev_update_features(struct net_device *dev) /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); - /* some features can't be enabled if they're off an an upper device */ + /* some features can't be enabled if they're off on an upper device */ netdev_for_each_upper_dev_rcu(dev, upper, iter) features = netdev_sync_upper_features(dev, upper, features); @@ -9688,7 +9973,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) rx[i].dev = dev; /* XDP RX-queue setup */ - err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); if (err < 0) goto err_rxq_info; } @@ -9849,7 +10134,7 @@ int register_netdevice(struct net_device *dev) dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; - if (dev->netdev_ops->ndo_udp_tunnel_add) { + if (dev->udp_tunnel_nic_info) { dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; } @@ -9924,17 +10209,10 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); if (ret) { - rollback_registered(dev); - rcu_barrier(); - - dev->reg_state = NETREG_UNREGISTERED; - /* We should put the kobject that hold in - * netdev_unregister_kobject(), otherwise - * the net device cannot be freed when - * driver calls free_netdev(), because the - * kobject is being hold. - */ - kobject_put(&dev->dev.kobj); + /* Expect explicit free_netdev() on failure */ + dev->needs_free_netdev = false; + unregister_netdevice_queue(dev, NULL); + goto out; } /* * Prevent userspace races by waiting until the network @@ -10037,6 +10315,8 @@ int netdev_refcnt_read(const struct net_device *dev) } EXPORT_SYMBOL(netdev_refcnt_read); +#define WAIT_REFS_MIN_MSECS 1 +#define WAIT_REFS_MAX_MSECS 250 /** * netdev_wait_allrefs - wait until all references are gone. 
* @dev: target net_device @@ -10052,7 +10332,7 @@ EXPORT_SYMBOL(netdev_refcnt_read); static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; - int refcnt; + int wait = 0, refcnt; linkwatch_forget_dev(dev); @@ -10086,7 +10366,13 @@ static void netdev_wait_allrefs(struct net_device *dev) rebroadcast_time = jiffies; } - msleep(250); + if (!wait) { + rcu_barrier(); + wait = WAIT_REFS_MIN_MSECS; + } else { + msleep(wait); + wait = min(wait << 1, WAIT_REFS_MAX_MSECS); + } refcnt = netdev_refcnt_read(dev); @@ -10134,7 +10420,7 @@ void netdev_run_todo(void) struct net_device *dev = list_first_entry(&unlink_list, struct net_device, unlink_list); - list_del(&dev->unlink_list); + list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } #endif @@ -10249,6 +10535,55 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } EXPORT_SYMBOL(dev_get_stats); +/** + * dev_fetch_sw_netstats - get per-cpu network device statistics + * @s: place to store stats + * @netstats: per-cpu network stats to read from + * + * Read per-cpu network statistics and populate the related fields in @s. + */ +void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, + const struct pcpu_sw_netstats __percpu *netstats) +{ + int cpu; + + for_each_possible_cpu(cpu) { + const struct pcpu_sw_netstats *stats; + struct pcpu_sw_netstats tmp; + unsigned int start; + + stats = per_cpu_ptr(netstats, cpu); + do { + start = u64_stats_fetch_begin_irq(&stats->syncp); + tmp.rx_packets = stats->rx_packets; + tmp.rx_bytes = stats->rx_bytes; + tmp.tx_packets = stats->tx_packets; + tmp.tx_bytes = stats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + + s->rx_packets += tmp.rx_packets; + s->rx_bytes += tmp.rx_bytes; + s->tx_packets += tmp.tx_packets; + s->tx_bytes += tmp.tx_bytes; + } +} +EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); + +/** + * dev_get_tstats64 - ndo_get_stats64 implementation + * @dev: device to get statistics from + * @s: place to store stats + * + * Populate @s from dev->stats and dev->tstats. Can be used as + * ndo_get_stats64() callback. + */ +void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_sw_netstats(s, dev->tstats); +} +EXPORT_SYMBOL_GPL(dev_get_tstats64); + struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); @@ -10421,6 +10756,17 @@ void free_netdev(struct net_device *dev) struct napi_struct *p, *n; might_sleep(); + + /* When called immediately after register_netdevice() failed the unwind + * handling may still be dismantling the device. Handle that case by + * deferring the free. 
+ */ + if (dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + dev->needs_free_netdev = true; + return; + } + netif_free_tx_queues(dev); netif_free_rx_queues(dev); @@ -10487,9 +10833,10 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) if (head) { list_move_tail(&dev->unreg_list, head); } else { - rollback_registered(dev); - /* Finish processing unregister after unlock */ - net_set_todo(dev); + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + unregister_netdevice_many(&single); } } EXPORT_SYMBOL(unregister_netdevice_queue); @@ -10503,14 +10850,100 @@ EXPORT_SYMBOL(unregister_netdevice_queue); */ void unregister_netdevice_many(struct list_head *head) { - struct net_device *dev; + struct net_device *dev, *tmp; + LIST_HEAD(close_head); + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + if (list_empty(head)) + return; + + list_for_each_entry_safe(dev, tmp, head, unreg_list) { + /* Some devices call without registering + * for initialization unwind. Remove those + * devices and proceed with the remaining. + */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never was registered\n", + dev->name, dev); + + WARN_ON(1); + list_del(&dev->unreg_list); + continue; + } + dev->dismantle = true; + BUG_ON(dev->reg_state != NETREG_REGISTERED); + } + + /* If device is running, close it first. */ + list_for_each_entry(dev, head, unreg_list) + list_add_tail(&dev->close_list, &close_head); + dev_close_many(&close_head, true); - if (!list_empty(head)) { - rollback_registered_many(head); - list_for_each_entry(dev, head, unreg_list) - net_set_todo(dev); - list_del(head); + list_for_each_entry(dev, head, unreg_list) { + /* And unlink it from device chain. */ + unlist_netdevice(dev); + + dev->reg_state = NETREG_UNREGISTERING; } + flush_all_backlogs(); + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + dev_xdp_uninstall(dev); + + /* Notify protocols, that we are about to destroy + * this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, + GFP_KERNEL, NULL, 0); + + /* + * Flush the unicast and multicast chains + */ + dev_uc_flush(dev); + dev_mc_flush(dev); + + netdev_name_node_alt_flush(dev); + netdev_name_node_free(dev->name_node); + + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + + if (skb) + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); + + /* Notifier chain MUST detach us all upper devices. 
*/ + WARN_ON(netdev_has_any_upper_dev(dev)); + WARN_ON(netdev_has_any_lower_dev(dev)); + + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); +#ifdef CONFIG_XPS + /* Remove XPS queueing entries */ + netif_reset_xps_queues_gt(dev, 0); +#endif + } + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + dev_put(dev); + net_set_todo(dev); + } + + list_del(head); } EXPORT_SYMBOL(unregister_netdevice_many); @@ -11048,8 +11481,7 @@ static int __init net_dev_init(void) INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS - sd->csd.func = rps_trigger_softirq; - sd->csd.info = sd; + INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 205e92e604ef..478d032f34ac 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -123,17 +123,6 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm ifr->ifr_mtu = dev->mtu; return 0; - case SIOCGIFHWADDR: - if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, - sizeof(ifr->ifr_hwaddr.sa_data)); - else - memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof(ifr->ifr_hwaddr.sa_data), - (size_t)dev->addr_len)); - ifr->ifr_hwaddr.sa_family = dev->type; - return 0; - case SIOCGIFSLAVE: err = -EINVAL; break; @@ -230,7 +219,7 @@ static int dev_do_ioctl(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; - int err = -EOPNOTSUPP; + int err; err = dsa_ndo_do_ioctl(dev, ifr, cmd); if (err == 0 || err != -EOPNOTSUPP) @@ -274,7 +263,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) case SIOCSIFHWADDR: if (dev->addr_len > sizeof(struct sockaddr)) return -EINVAL; - return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL); + return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) @@ -418,6 +407,12 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c */ switch (cmd) { + case SIOCGIFHWADDR: + dev_load(net, ifr->ifr_name); + ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name); + if (colon) + *colon = ':'; + return ret; /* * These ioctl calls: * - can be done by all. 
@@ -427,7 +422,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFMTU: - case SIOCGIFHWADDR: case SIOCGIFSLAVE: case SIOCGIFMAP: case SIOCGIFINDEX: diff --git a/net/core/devlink.c b/net/core/devlink.c index 80ec1cd81c64..737b61c2976e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -27,7 +27,6 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <net/devlink.h> -#include <net/drop_monitor.h> #define CREATE_TRACE_POINTS #include <trace/events/devlink.h> @@ -84,9 +83,13 @@ EXPORT_SYMBOL(devlink_dpipe_header_ipv6); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); +EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, + [DEVLINK_PORT_FN_ATTR_STATE] = + NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, + DEVLINK_PORT_FN_STATE_ACTIVE), }; static LIST_HEAD(devlink_list); @@ -347,8 +350,12 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, struct devlink_region { struct devlink *devlink; + struct devlink_port *port; struct list_head list; - const struct devlink_region_ops *ops; + union { + const struct devlink_region_ops *ops; + const struct devlink_port_region_ops *port_ops; + }; struct list_head snapshot_list; u32 max_snapshots; u32 cur_snapshots; @@ -374,6 +381,19 @@ devlink_region_get_by_name(struct devlink *devlink, const char *region_name) return NULL; } +static struct devlink_region * +devlink_port_region_get_by_name(struct devlink_port *port, + const char *region_name) +{ + struct devlink_region *region; + + list_for_each_entry(region, &port->region_list, list) + if (!strcmp(region->ops->name, region_name)) + return region; + + return NULL; +} + static struct devlink_snapshot * devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) { @@ -462,10 +482,132 @@ static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) return 0; } +struct devlink_reload_combination { + enum devlink_reload_action action; + enum devlink_reload_limit limit; +}; + +static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = { + { + /* can't reinitialize driver with no down time */ + .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + .limit = DEVLINK_RELOAD_LIMIT_NO_RESET, + }, +}; + +static bool +devlink_reload_combination_is_invalid(enum devlink_reload_action action, + enum devlink_reload_limit limit) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) + if (devlink_reload_invalid_combinations[i].action == action && + devlink_reload_invalid_combinations[i].limit == limit) + return true; + return false; +} + +static bool +devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action) +{ + return test_bit(action, &devlink->ops->reload_actions); +} + +static bool +devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit) +{ + return test_bit(limit, &devlink->ops->reload_limits); +} + +static int devlink_reload_stat_put(struct sk_buff *msg, + enum devlink_reload_limit limit, u32 value) +{ + struct nlattr *reload_stats_entry; + + reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY); + if (!reload_stats_entry) + return -EMSGSIZE; + + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || + nla_put_u32(msg, 
DEVLINK_ATTR_RELOAD_STATS_VALUE, value)) + goto nla_put_failure; + nla_nest_end(msg, reload_stats_entry); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, reload_stats_entry); + return -EMSGSIZE; +} + +static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote) +{ + struct nlattr *reload_stats_attr, *act_info, *act_stats; + int i, j, stat_idx; + u32 value; + + if (!is_remote) + reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); + else + reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS); + + if (!reload_stats_attr) + return -EMSGSIZE; + + for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { + if ((!is_remote && + !devlink_reload_action_is_supported(devlink, i)) || + i == DEVLINK_RELOAD_ACTION_UNSPEC) + continue; + act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO); + if (!act_info) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i)) + goto action_info_nest_cancel; + act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS); + if (!act_stats) + goto action_info_nest_cancel; + + for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { + /* Remote stats are shown even if not locally supported. + * Stats of actions with unspecified limit are shown + * though drivers don't need to register unspecified + * limit. + */ + if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && + !devlink_reload_limit_is_supported(devlink, j)) || + devlink_reload_combination_is_invalid(i, j)) + continue; + + stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i; + if (!is_remote) + value = devlink->stats.reload_stats[stat_idx]; + else + value = devlink->stats.remote_reload_stats[stat_idx]; + if (devlink_reload_stat_put(msg, j, value)) + goto action_stats_nest_cancel; + } + nla_nest_end(msg, act_stats); + nla_nest_end(msg, act_info); + } + nla_nest_end(msg, reload_stats_attr); + return 0; + +action_stats_nest_cancel: + nla_nest_cancel(msg, act_stats); +action_info_nest_cancel: + nla_nest_cancel(msg, act_info); +nla_put_failure: + nla_nest_cancel(msg, reload_stats_attr); + return -EMSGSIZE; +} + static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags) { + struct nlattr *dev_stats; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -477,9 +619,21 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed)) goto nla_put_failure; + dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS); + if (!dev_stats) + goto nla_put_failure; + + if (devlink_reload_stats_put(msg, devlink, false)) + goto dev_stats_nest_cancel; + if (devlink_reload_stats_put(msg, devlink, true)) + goto dev_stats_nest_cancel; + + nla_nest_end(msg, dev_stats); genlmsg_end(msg, hdr); return 0; +dev_stats_nest_cancel: + nla_nest_cancel(msg, dev_stats); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; @@ -523,15 +677,29 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return -EMSGSIZE; switch (devlink_port->attrs.flavour) { case DEVLINK_PORT_FLAVOUR_PCI_PF: - if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, - attrs->pci_pf.pf)) + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_pf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) + return -EMSGSIZE; + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) return -EMSGSIZE; break; case 
DEVLINK_PORT_FLAVOUR_PCI_VF: - if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, - attrs->pci_vf.pf) || - nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, - attrs->pci_vf.vf)) + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_vf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) + return -EMSGSIZE; + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) + return -EMSGSIZE; + break; + case DEVLINK_PORT_FLAVOUR_PCI_SF: + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_sf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, + attrs->pci_sf.pf) || + nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, + attrs->pci_sf.sf)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PHYSICAL: @@ -557,42 +725,105 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, } static int +devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *ops, + struct devlink_port *port, struct sk_buff *msg, + struct netlink_ext_ack *extack, bool *msg_updated) +{ + u8 hw_addr[MAX_ADDR_LEN]; + int hw_addr_len; + int err; + + if (!ops->port_function_hw_addr_get) + return 0; + + err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); + if (err) + return err; + *msg_updated = true; + return 0; +} + +static bool +devlink_port_fn_state_valid(enum devlink_port_fn_state state) +{ + return state == DEVLINK_PORT_FN_STATE_INACTIVE || + state == DEVLINK_PORT_FN_STATE_ACTIVE; +} + +static bool +devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) +{ + return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED || + opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; +} + +static int +devlink_port_fn_state_fill(struct devlink *devlink, + const struct devlink_ops *ops, + struct devlink_port *port, struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) +{ + enum devlink_port_fn_opstate opstate; + enum devlink_port_fn_state state; + int err; + + if (!ops->port_fn_state_get) + return 0; + + err = ops->port_fn_state_get(devlink, port, &state, &opstate, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + if (!devlink_port_fn_state_valid(state)) { + WARN_ON_ONCE(1); + NL_SET_ERR_MSG_MOD(extack, "Invalid state read from driver"); + return -EINVAL; + } + if (!devlink_port_fn_opstate_valid(opstate)) { + WARN_ON_ONCE(1); + NL_SET_ERR_MSG_MOD(extack, + "Invalid operational state read from driver"); + return -EINVAL; + } + if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || + nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate)) + return -EMSGSIZE; + *msg_updated = true; + return 0; +} + +static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { struct devlink *devlink = port->devlink; const struct devlink_ops *ops; struct nlattr *function_attr; - bool empty_nest = true; - int err = 0; + bool msg_updated = false; + int err; function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); if (!function_attr) return -EMSGSIZE; ops = devlink->ops; - if (ops->port_function_hw_addr_get) { - int hw_addr_len; - u8 hw_addr[MAX_ADDR_LEN]; - - err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack); - if (err == 
-EOPNOTSUPP) { - /* Port function attributes are optional for a port. If port doesn't - * support function attribute, returning -EOPNOTSUPP is not an error. - */ - err = 0; - goto out; - } else if (err) { - goto out; - } - err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); - if (err) - goto out; - empty_nest = false; - } - + err = devlink_port_fn_hw_addr_fill(devlink, ops, port, msg, + extack, &msg_updated); + if (err) + goto out; + err = devlink_port_fn_state_fill(devlink, ops, port, msg, extack, + &msg_updated); out: - if (err || empty_nest) + if (err || !msg_updated) nla_nest_cancel(msg, function_attr); else nla_nest_end(msg, function_attr); @@ -616,6 +847,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; + /* Hold rtnl lock while accessing port's netdev attributes. */ + rtnl_lock(); spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; @@ -624,9 +857,10 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, devlink_port->desired_type)) goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { + struct net *net = devlink_net(devlink_port->devlink); struct net_device *netdev = devlink_port->type_dev; - if (netdev && + if (netdev && net_eq(net, dev_net(netdev)) && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, netdev->ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, @@ -642,6 +876,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, goto nla_put_failure_type_locked; } spin_unlock_bh(&devlink_port->type_lock); + rtnl_unlock(); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) @@ -652,6 +887,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, nla_put_failure_type_locked: spin_unlock_bh(&devlink_port->type_lock); + rtnl_unlock(); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; @@ -806,8 +1042,6 @@ static int devlink_port_type_set(struct devlink *devlink, int err; if (devlink->ops->port_type_set) { - if (port_type == DEVLINK_PORT_TYPE_NOTSET) - return -EINVAL; if (port_type == devlink_port->type) return 0; err = devlink->ops->port_type_set(devlink_port, port_type); @@ -827,7 +1061,6 @@ devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port * const struct devlink_ops *ops; const u8 *hw_addr; int hw_addr_len; - int err; hw_addr = nla_data(attr); hw_addr_len = nla_len(attr); @@ -852,12 +1085,25 @@ devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port * return -EOPNOTSUPP; } - err = ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack); - if (err) - return err; + return ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack); +} - devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); - return 0; +static int devlink_port_fn_state_set(struct devlink *devlink, + struct devlink_port *port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + enum devlink_port_fn_state state; + const struct devlink_ops *ops; + + state = nla_get_u8(attr); + ops = devlink->ops; + if (!ops->port_fn_state_set) { + NL_SET_ERR_MSG_MOD(extack, + "Function does not support state setting"); + return -EOPNOTSUPP; + } + return ops->port_fn_state_set(devlink, port, 
state, extack); } static int @@ -875,9 +1121,21 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, } attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; - if (attr) + if (attr) { err = devlink_port_function_hw_addr_set(devlink, port, attr, extack); + if (err) + return err; + } + /* Keep this as the last function attribute set, so that when + * multiple port function attributes are set along with state, + * Those can be applied first before activating the state. + */ + attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; + if (attr) + err = devlink_port_fn_state_set(devlink, port, attr, extack); + if (!err) + devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); return err; } @@ -977,6 +1235,111 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, return devlink_port_unsplit(devlink, port_index, info->extack); } +static int devlink_port_new_notifiy(struct devlink *devlink, + unsigned int port_index, + struct genl_info *info) +{ + struct devlink_port *devlink_port; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&devlink->lock); + devlink_port = devlink_port_get_by_index(devlink, port_index); + if (!devlink_port) { + err = -ENODEV; + goto out; + } + + err = devlink_nl_port_fill(msg, devlink, devlink_port, + DEVLINK_CMD_NEW, info->snd_portid, + info->snd_seq, 0, NULL); + if (err) + goto out; + + err = genlmsg_reply(msg, info); + mutex_unlock(&devlink->lock); + return err; + +out: + mutex_unlock(&devlink->lock); + nlmsg_free(msg); + return err; +} + +static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink_port_new_attrs new_attrs = {}; + struct devlink *devlink = info->user_ptr[0]; + unsigned int new_port_index; + int err; + + if (!devlink->ops->port_new || !devlink->ops->port_del) + return -EOPNOTSUPP; + + if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || + !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { + NL_SET_ERR_MSG_MOD(extack, "Port flavour or PCI PF are not specified"); + return -EINVAL; + } + new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); + new_attrs.pfnum = + nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]); + + if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + /* Port index of the new port being created by driver. */ + new_attrs.port_index = + nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + new_attrs.port_index_valid = true; + } + if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) { + new_attrs.controller = + nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]); + new_attrs.controller_valid = true; + } + if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF && + info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { + new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); + new_attrs.sfnum_valid = true; + } + + err = devlink->ops->port_new(devlink, &new_attrs, extack, + &new_port_index); + if (err) + return err; + + err = devlink_port_new_notifiy(devlink, new_port_index, info); + if (err && err != -ENODEV) { + /* Fail to send the response; destroy newly created port. 
*/ + devlink->ops->port_del(devlink, new_port_index, extack); + } + return err; +} + +static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + unsigned int port_index; + + if (!devlink->ops->port_del) + return -EOPNOTSUPP; + + if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + NL_SET_ERR_MSG_MOD(extack, "Port index is not specified"); + return -EINVAL; + } + port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + + return devlink->ops->port_del(devlink, port_index, extack); +} + static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -1311,7 +1674,7 @@ static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index, pool_index, &cur, &max); if (err && err != -EOPNOTSUPP) - return err; + goto sb_occ_get_failure; if (!err) { if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) goto nla_put_failure; @@ -1324,8 +1687,10 @@ static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, return 0; nla_put_failure: + err = -EMSGSIZE; +sb_occ_get_failure: genlmsg_cancel(msg, hdr); - return -EMSGSIZE; + return err; } static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb, @@ -2943,9 +3308,9 @@ static void devlink_reload_netns_change(struct devlink *devlink, DEVLINK_CMD_PARAM_NEW); } -static bool devlink_reload_supported(const struct devlink *devlink) +static bool devlink_reload_supported(const struct devlink_ops *ops) { - return devlink->ops->reload_down && devlink->ops->reload_up; + return ops->reload_down && ops->reload_up; } static void devlink_reload_failed_set(struct devlink *devlink, @@ -2963,33 +3328,132 @@ bool devlink_is_reload_failed(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_is_reload_failed); +static void +__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats, + enum devlink_reload_limit limit, u32 actions_performed) +{ + unsigned long actions = actions_performed; + int stat_idx; + int action; + + for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) { + stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action; + reload_stats[stat_idx]++; + } + devlink_notify(devlink, DEVLINK_CMD_NEW); +} + +static void +devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit, + u32 actions_performed) +{ + __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit, + actions_performed); +} + +/** + * devlink_remote_reload_actions_performed - Update devlink on reload actions + * performed which are not a direct result of devlink reload call. + * + * This should be called by a driver after performing reload actions in case it was not + * a result of devlink reload call. For example fw_activate was performed as a result + * of devlink reload triggered fw_activate on another host. + * The motivation for this function is to keep data on reload actions performed on this + * function whether it was done due to direct devlink reload call or not. 
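+ *
+ * Illustrative call (not part of this patch): a driver that detects its
+ * firmware was activated by a devlink reload issued on another host could
+ * account for it with
+ *
+ *	devlink_remote_reload_actions_performed(devlink,
+ *						DEVLINK_RELOAD_LIMIT_UNSPEC,
+ *						BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
+ *
+ * so the remote reload statistics reported to user space stay accurate.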
+ * + * @devlink: devlink + * @limit: reload limit + * @actions_performed: bitmask of actions performed + */ +void devlink_remote_reload_actions_performed(struct devlink *devlink, + enum devlink_reload_limit limit, + u32 actions_performed) +{ + if (WARN_ON(!actions_performed || + actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || + actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) || + limit > DEVLINK_RELOAD_LIMIT_MAX)) + return; + + __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit, + actions_performed); +} +EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed); + static int devlink_reload(struct devlink *devlink, struct net *dest_net, - struct netlink_ext_ack *extack) + enum devlink_reload_action action, enum devlink_reload_limit limit, + u32 *actions_performed, struct netlink_ext_ack *extack) { + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; int err; if (!devlink->reload_enabled) return -EOPNOTSUPP; - err = devlink->ops->reload_down(devlink, !!dest_net, extack); + memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, + sizeof(remote_reload_stats)); + err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); if (err) return err; if (dest_net && !net_eq(dest_net, devlink_net(devlink))) devlink_reload_netns_change(devlink, dest_net); - err = devlink->ops->reload_up(devlink, extack); + err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); - return err; + if (err) + return err; + + WARN_ON(!(*actions_performed & BIT(action))); + /* Catch driver on updating the remote action within devlink reload */ + WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, + sizeof(remote_reload_stats))); + devlink_reload_stats_update(devlink, limit, *actions_performed); + return 0; +} + +static int +devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed, + enum devlink_command cmd, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd); + if (!hdr) + goto free_msg; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed, + actions_performed)) + goto nla_put_failure; + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +nla_put_failure: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return -EMSGSIZE; } static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + enum devlink_reload_action action; + enum devlink_reload_limit limit; struct net *dest_net = NULL; + u32 actions_performed; int err; - if (!devlink_reload_supported(devlink)) + if (!devlink_reload_supported(devlink->ops)) return -EOPNOTSUPP; err = devlink_resources_validate(devlink, NULL, info); @@ -3006,20 +3470,67 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(dest_net); } - err = devlink_reload(devlink, dest_net, info->extack); + if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) + action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]); + else + action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT; + + if (!devlink_reload_action_is_supported(devlink, action)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Requested reload action is not supported by 
the driver"); + return -EOPNOTSUPP; + } + + limit = DEVLINK_RELOAD_LIMIT_UNSPEC; + if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) { + struct nla_bitfield32 limits; + u32 limits_selected; + + limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]); + limits_selected = limits.value & limits.selector; + if (!limits_selected) { + NL_SET_ERR_MSG_MOD(info->extack, "Invalid limit selected"); + return -EINVAL; + } + for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++) + if (limits_selected & BIT(limit)) + break; + /* UAPI enables multiselection, but currently it is not used */ + if (limits_selected != BIT(limit)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Multiselection of limit is not supported"); + return -EOPNOTSUPP; + } + if (!devlink_reload_limit_is_supported(devlink, limit)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Requested limit is not supported by the driver"); + return -EOPNOTSUPP; + } + if (devlink_reload_combination_is_invalid(action, limit)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Requested limit is invalid for this action"); + return -EINVAL; + } + } + err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack); if (dest_net) put_net(dest_net); - return err; + if (err) + return err; + /* For backward compatibility generate reply only if attributes used by user */ + if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) + return 0; + + return devlink_nl_reload_actions_performed_snd(devlink, actions_performed, + DEVLINK_CMD_RELOAD, info); } static int devlink_nl_flash_update_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, - const char *status_msg, - const char *component, - unsigned long done, unsigned long total) + struct devlink_flash_notify *params) { void *hdr; @@ -3033,19 +3544,22 @@ static int devlink_nl_flash_update_fill(struct sk_buff *msg, if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS) goto out; - if (status_msg && + if (params->status_msg && nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, - status_msg)) + params->status_msg)) goto nla_put_failure; - if (component && + if (params->component && nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, - component)) + params->component)) goto nla_put_failure; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, - done, DEVLINK_ATTR_PAD)) + params->done, DEVLINK_ATTR_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, - total, DEVLINK_ATTR_PAD)) + params->total, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, + params->timeout, DEVLINK_ATTR_PAD)) goto nla_put_failure; out: @@ -3059,10 +3573,7 @@ nla_put_failure: static void __devlink_flash_update_notify(struct devlink *devlink, enum devlink_command cmd, - const char *status_msg, - const char *component, - unsigned long done, - unsigned long total) + struct devlink_flash_notify *params) { struct sk_buff *msg; int err; @@ -3075,8 +3586,7 @@ static void __devlink_flash_update_notify(struct devlink *devlink, if (!msg) return; - err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg, - component, done, total); + err = devlink_nl_flash_update_fill(msg, devlink, cmd, params); if (err) goto out_free_msg; @@ -3088,21 +3598,23 @@ out_free_msg: nlmsg_free(msg); } -void devlink_flash_update_begin_notify(struct devlink *devlink) +static void devlink_flash_update_begin_notify(struct devlink *devlink) { + struct devlink_flash_notify params = { 0 }; + 
__devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE, - NULL, NULL, 0, 0); + &params); } -EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify); -void devlink_flash_update_end_notify(struct devlink *devlink) +static void devlink_flash_update_end_notify(struct devlink *devlink) { + struct devlink_flash_notify params = { 0 }; + __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_END, - NULL, NULL, 0, 0); + &params); } -EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify); void devlink_flash_update_status_notify(struct devlink *devlink, const char *status_msg, @@ -3110,31 +3622,92 @@ void devlink_flash_update_status_notify(struct devlink *devlink, unsigned long done, unsigned long total) { + struct devlink_flash_notify params = { + .status_msg = status_msg, + .component = component, + .done = done, + .total = total, + }; + __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_STATUS, - status_msg, component, done, total); + &params); } EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify); +void devlink_flash_update_timeout_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long timeout) +{ + struct devlink_flash_notify params = { + .status_msg = status_msg, + .component = component, + .timeout = timeout, + }; + + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_STATUS, + &params); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); + static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { + struct nlattr *nla_component, *nla_overwrite_mask, *nla_file_name; + struct devlink_flash_update_params params = {}; struct devlink *devlink = info->user_ptr[0]; - const char *file_name, *component; - struct nlattr *nla_component; + const char *file_name; + u32 supported_params; + int ret; if (!devlink->ops->flash_update) return -EOPNOTSUPP; if (!info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]) return -EINVAL; - file_name = nla_data(info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]); + + supported_params = devlink->ops->supported_flash_update_params; nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]; - component = nla_component ? 
nla_data(nla_component) : NULL; + if (nla_component) { + if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT)) { + NL_SET_ERR_MSG_ATTR(info->extack, nla_component, + "component update is not supported by this device"); + return -EOPNOTSUPP; + } + params.component = nla_data(nla_component); + } - return devlink->ops->flash_update(devlink, file_name, component, - info->extack); + nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK]; + if (nla_overwrite_mask) { + struct nla_bitfield32 sections; + + if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) { + NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask, + "overwrite settings are not supported by this device"); + return -EOPNOTSUPP; + } + sections = nla_get_bitfield32(nla_overwrite_mask); + params.overwrite_mask = sections.value & sections.selector; + } + + nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]; + file_name = nla_data(nla_file_name); + ret = request_firmware(&params.fw, file_name, devlink->dev); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, "failed to locate the requested firmware file"); + return ret; + } + + devlink_flash_update_begin_notify(devlink); + ret = devlink->ops->flash_update(devlink, &params, info->extack); + devlink_flash_update_end_notify(devlink); + + release_firmware(params.fw); + + return ret; } static const struct devlink_param devlink_param_generic[] = { @@ -3188,6 +3761,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET, + .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -3772,7 +4350,7 @@ out: static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink_param_item *param_item; struct sk_buff *msg; int err; @@ -3801,7 +4379,7 @@ static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; return __devlink_nl_cmd_param_set_doit(devlink_port->devlink, devlink_port->index, @@ -3875,6 +4453,13 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto nla_put_failure; + if (region->port) { + err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, + region->port->index); + if (err) + goto nla_put_failure; + } + err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); if (err) goto nla_put_failure; @@ -3922,6 +4507,13 @@ devlink_nl_region_notify_build(struct devlink_region *region, if (err) goto out_cancel_msg; + if (region->port) { + err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, + region->port->index); + if (err) + goto out_cancel_msg; + } + err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); if (err) @@ -4168,16 +4760,30 @@ static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct devlink_port *port = NULL; struct devlink_region *region; const char *region_name; struct sk_buff *msg; + 
unsigned int index; int err; if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) return -EINVAL; + if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + + port = devlink_port_get_by_index(devlink, index); + if (!port) + return -ENODEV; + } + region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); - region = devlink_region_get_by_name(devlink, region_name); + if (port) + region = devlink_port_region_get_by_name(port, region_name); + else + region = devlink_region_get_by_name(devlink, region_name); + if (!region) return -EINVAL; @@ -4196,10 +4802,75 @@ static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } +static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg, + struct netlink_callback *cb, + struct devlink_port *port, + int *idx, + int start) +{ + struct devlink_region *region; + int err = 0; + + list_for_each_entry(region, &port->region_list, list) { + if (*idx < start) { + (*idx)++; + continue; + } + err = devlink_nl_region_fill(msg, port->devlink, + DEVLINK_CMD_REGION_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, region); + if (err) + goto out; + (*idx)++; + } + +out: + return err; +} + +static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg, + struct netlink_callback *cb, + struct devlink *devlink, + int *idx, + int start) +{ + struct devlink_region *region; + struct devlink_port *port; + int err = 0; + + mutex_lock(&devlink->lock); + list_for_each_entry(region, &devlink->region_list, list) { + if (*idx < start) { + (*idx)++; + continue; + } + err = devlink_nl_region_fill(msg, devlink, + DEVLINK_CMD_REGION_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, region); + if (err) + goto out; + (*idx)++; + } + + list_for_each_entry(port, &devlink->port_list, list) { + err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, idx, + start); + if (err) + goto out; + } + +out: + mutex_unlock(&devlink->lock); + return err; +} + static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { - struct devlink_region *region; struct devlink *devlink; int start = cb->args[0]; int idx = 0; @@ -4209,25 +4880,10 @@ static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) continue; - - mutex_lock(&devlink->lock); - list_for_each_entry(region, &devlink->region_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_region_fill(msg, devlink, - DEVLINK_CMD_REGION_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, region); - if (err) { - mutex_unlock(&devlink->lock); - goto out; - } - idx++; - } - mutex_unlock(&devlink->lock); + err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink, + &idx, start); + if (err) + goto out; } out: mutex_unlock(&devlink_mutex); @@ -4240,8 +4896,10 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; struct devlink_snapshot *snapshot; + struct devlink_port *port = NULL; struct devlink_region *region; const char *region_name; + unsigned int index; u32 snapshot_id; if (!info->attrs[DEVLINK_ATTR_REGION_NAME] || @@ -4251,7 +4909,19 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb, region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); - region = 
devlink_region_get_by_name(devlink, region_name); + if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + + port = devlink_port_get_by_index(devlink, index); + if (!port) + return -ENODEV; + } + + if (port) + region = devlink_port_region_get_by_name(port, region_name); + else + region = devlink_region_get_by_name(devlink, region_name); + if (!region) return -EINVAL; @@ -4268,9 +4938,11 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_snapshot *snapshot; + struct devlink_port *port = NULL; struct nlattr *snapshot_id_attr; struct devlink_region *region; const char *region_name; + unsigned int index; u32 snapshot_id; u8 *data; int err; @@ -4281,7 +4953,20 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) } region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); - region = devlink_region_get_by_name(devlink, region_name); + + if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + + port = devlink_port_get_by_index(devlink, index); + if (!port) + return -ENODEV; + } + + if (port) + region = devlink_port_region_get_by_name(port, region_name); + else + region = devlink_region_get_by_name(devlink, region_name); + if (!region) { NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not exist"); return -EINVAL; @@ -4317,7 +5002,12 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) } } - err = region->ops->snapshot(devlink, info->extack, &data); + if (port) + err = region->port_ops->snapshot(port, region->port_ops, + info->extack, &data); + else + err = region->ops->snapshot(devlink, region->ops, + info->extack, &data); if (err) goto err_snapshot_capture; @@ -4439,10 +5129,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, const struct genl_dumpit_info *info = genl_dumpit_info(cb); u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; + struct devlink_port *port = NULL; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; + unsigned int index; void *hdr; int err; @@ -4463,8 +5155,23 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } + if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + + port = devlink_port_get_by_index(devlink, index); + if (!port) { + err = -ENODEV; + goto out_unlock; + } + } + region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]); - region = devlink_region_get_by_name(devlink, region_name); + + if (port) + region = devlink_port_region_get_by_name(port, region_name); + else + region = devlink_region_get_by_name(devlink, region_name); + if (!region) { err = -EINVAL; goto out_unlock; @@ -4501,6 +5208,13 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, if (err) goto nla_put_failure; + if (region->port) { + err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX, + region->port->index); + if (err) + goto nla_put_failure; + } + err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name); if (err) goto nla_put_failure; @@ -5895,6 +6609,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) continue; + mutex_lock(&devlink->lock); list_for_each_entry(port, &devlink->port_list, list) { 
mutex_lock(&port->reporters_lock); list_for_each_entry(reporter, &port->reporter_list, list) { @@ -5909,12 +6624,14 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&port->reporters_lock); + mutex_unlock(&devlink->lock); goto out; } idx++; } mutex_unlock(&port->reporters_lock); } + mutex_unlock(&devlink->lock); } out: mutex_unlock(&devlink_mutex); @@ -6088,6 +6805,28 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, return 0; } +static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + int err; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->test) { + devlink_health_reporter_put(reporter); + return -EOPNOTSUPP; + } + + err = reporter->ops->test(reporter, info->extack); + + devlink_health_reporter_put(reporter); + return err; +} + struct devlink_stats { u64 rx_bytes; u64 rx_packets; @@ -6458,7 +7197,6 @@ static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_item *trap_item; - int err; if (list_empty(&devlink->trap_list)) return -EOPNOTSUPP; @@ -6469,11 +7207,7 @@ static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, return -ENOENT; } - err = devlink_trap_action_set(devlink, trap_item, info); - if (err) - return err; - - return 0; + return devlink_trap_action_set(devlink, trap_item, info); } static struct devlink_trap_group_item * @@ -6644,6 +7378,24 @@ __devlink_trap_group_action_set(struct devlink *devlink, struct devlink_trap_item *trap_item; int err; + if (devlink->ops->trap_group_action_set) { + err = devlink->ops->trap_group_action_set(devlink, group_item->group, + trap_action, extack); + if (err) + return err; + + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (strcmp(trap_item->group_item->group->name, group_name)) + continue; + if (trap_item->action != trap_action && + trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) + continue; + trap_item->action = trap_action; + } + + return 0; + } + list_for_each_entry(trap_item, &devlink->trap_list, list) { if (strcmp(trap_item->group_item->group->name, group_name)) continue; @@ -7000,7 +7752,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_TYPE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_PORT_TYPE_AUTO, + DEVLINK_PORT_TYPE_IB), [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 }, [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 }, @@ -7009,7 +7762,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, - [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_ESWITCH_MODE_LEGACY, + DEVLINK_ESWITCH_MODE_SWITCHDEV), [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = 
NLA_NUL_STRING }, @@ -7028,6 +7782,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] = + NLA_POLICY_BITFIELD32(DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS), [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, @@ -7039,9 +7795,16 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, + [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_ACTION_MAX), + [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK), + [DEVLINK_ATTR_PORT_FLAVOUR] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, }; -static const struct genl_ops devlink_nl_ops[] = { +static const struct genl_small_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -7079,6 +7842,18 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { + .cmd = DEVLINK_CMD_PORT_NEW, + .doit = devlink_nl_cmd_port_new_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, + }, + { + .cmd = DEVLINK_CMD_PORT_DEL, + .doit = devlink_nl_cmd_port_del_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, + }, + { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_get_doit, @@ -7309,6 +8084,14 @@ static const struct genl_ops devlink_nl_ops[] = { DEVLINK_NL_FLAG_NO_LOCK, }, { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_health_reporter_test_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | + DEVLINK_NL_FLAG_NO_LOCK, + }, + { .cmd = DEVLINK_CMD_FLASH_UPDATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_flash_update, @@ -7358,12 +8141,41 @@ static struct genl_family devlink_nl_family __ro_after_init = { .pre_doit = devlink_nl_pre_doit, .post_doit = devlink_nl_post_doit, .module = THIS_MODULE, - .ops = devlink_nl_ops, - .n_ops = ARRAY_SIZE(devlink_nl_ops), + .small_ops = devlink_nl_ops, + .n_small_ops = ARRAY_SIZE(devlink_nl_ops), .mcgrps = devlink_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), }; +static bool devlink_reload_actions_valid(const struct devlink_ops *ops) +{ + const struct devlink_reload_combination *comb; + int i; + + if (!devlink_reload_supported(ops)) { + if (WARN_ON(ops->reload_actions)) + return false; + return true; + } + + if (WARN_ON(!ops->reload_actions || + ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || + ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX))) + return false; + + if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) || + ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX))) + return 
false; + + for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) { + comb = &devlink_reload_invalid_combinations[i]; + if (ops->reload_actions == BIT(comb->action) && + ops->reload_limits == BIT(comb->limit)) + return false; + } + return true; +} + /** * devlink_alloc - Allocate new devlink instance resources * @@ -7380,6 +8192,9 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) if (WARN_ON(!ops)) return NULL; + if (!devlink_reload_actions_valid(ops)) + return NULL; + devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); if (!devlink) return NULL; @@ -7428,7 +8243,7 @@ EXPORT_SYMBOL_GPL(devlink_register); void devlink_unregister(struct devlink *devlink) { mutex_lock(&devlink_mutex); - WARN_ON(devlink_reload_supported(devlink) && + WARN_ON(devlink_reload_supported(devlink->ops) && devlink->reload_enabled); devlink_notify(devlink, DEVLINK_CMD_DEL); list_del(&devlink->list); @@ -7506,7 +8321,8 @@ static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) { /* Ignore CPU and DSA flavours. */ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA; + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA && + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED; } #define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) @@ -7555,11 +8371,12 @@ int devlink_port_register(struct devlink *devlink, devlink_port->index = port_index; devlink_port->registered = true; spin_lock_init(&devlink_port->type_lock); + INIT_LIST_HEAD(&devlink_port->reporter_list); + mutex_init(&devlink_port->reporters_lock); list_add_tail(&devlink_port->list, &devlink->port_list); INIT_LIST_HEAD(&devlink_port->param_list); + INIT_LIST_HEAD(&devlink_port->region_list); mutex_unlock(&devlink->lock); - INIT_LIST_HEAD(&devlink_port->reporter_list); - mutex_init(&devlink_port->reporters_lock); INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); @@ -7576,13 +8393,14 @@ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; - WARN_ON(!list_empty(&devlink_port->reporter_list)); - mutex_destroy(&devlink_port->reporters_lock); devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); mutex_lock(&devlink->lock); list_del(&devlink_port->list); mutex_unlock(&devlink->lock); + WARN_ON(!list_empty(&devlink_port->reporter_list)); + WARN_ON(!list_empty(&devlink_port->region_list)); + mutex_destroy(&devlink_port->reporters_lock); } EXPORT_SYMBOL_GPL(devlink_port_unregister); @@ -7600,14 +8418,8 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } -/** - * devlink_port_type_eth_set - Set port type to Ethernet - * - * @devlink_port: devlink port - * @netdev: related netdevice - */ -void devlink_port_type_eth_set(struct devlink_port *devlink_port, - struct net_device *netdev) +static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, + struct net_device *netdev) { const struct net_device_ops *ops = netdev->netdev_ops; @@ -7641,6 +8453,24 @@ void devlink_port_type_eth_set(struct devlink_port *devlink_port, err = ops->ndo_get_port_parent_id(netdev, &ppid); WARN_ON(err != -EOPNOTSUPP); } +} + +/** + * devlink_port_type_eth_set - Set port type to Ethernet + * + * @devlink_port: devlink 
port + * @netdev: related netdevice + */ +void devlink_port_type_eth_set(struct devlink_port *devlink_port, + struct net_device *netdev) +{ + if (netdev) + devlink_port_type_netdev_checks(devlink_port, netdev); + else + dev_warn(devlink_port->devlink->dev, + "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", + devlink_port->index); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, netdev); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); @@ -7675,8 +8505,6 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port, { struct devlink_port_attrs *attrs = &devlink_port->attrs; - if (WARN_ON(devlink_port->registered)) - return -EEXIST; devlink_port->attrs_set = true; attrs->flavour = flavour; if (attrs->switch_id.id_len) { @@ -7700,6 +8528,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, { int ret; + if (WARN_ON(devlink_port->registered)) + return; devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) @@ -7712,19 +8542,25 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set); * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes * * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance + * @external: indicates if the port is for an external controller */ -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf) +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; + if (WARN_ON(devlink_port->registered)) + return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) return; - + attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; + attrs->pci_pf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); @@ -7732,24 +8568,56 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes * * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @vf: associated VF of a PF for the devlink port instance + * @external: indicates if the port is for an external controller */ -void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, - u16 pf, u16 vf) +void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; + if (WARN_ON(devlink_port->registered)) + return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) return; + attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; + attrs->pci_vf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); +/** + * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes + * + * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance + * @pf: associated PF for the devlink port instance + * @sf: associated SF of a PF for the devlink port instance + */ +void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, u32 sf) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int ret; + + if (WARN_ON(devlink_port->registered)) + return; + ret = 
__devlink_port_attrs_set(devlink_port, + DEVLINK_PORT_FLAVOUR_PCI_SF); + if (ret) + return; + attrs->pci_sf.controller = controller; + attrs->pci_sf.pf = pf; + attrs->pci_sf.sf = sf; +} +EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); + static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { @@ -7771,18 +8639,37 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: + case DEVLINK_PORT_FLAVOUR_UNUSED: /* As CPU and DSA ports do not have a netdevice associated * case should not ever happen. */ WARN_ON(1); return -EINVAL; case DEVLINK_PORT_FLAVOUR_PCI_PF: + if (attrs->pci_pf.external) { + n = snprintf(name, len, "c%u", attrs->pci_pf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); break; case DEVLINK_PORT_FLAVOUR_PCI_VF: + if (attrs->pci_vf.external) { + n = snprintf(name, len, "c%u", attrs->pci_vf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } n = snprintf(name, len, "pf%uvf%u", attrs->pci_vf.pf, attrs->pci_vf.vf); break; + case DEVLINK_PORT_FLAVOUR_PCI_SF: + n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, + attrs->pci_sf.sf); + break; } if (n >= len) @@ -7980,6 +8867,10 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); * @resource_id: resource's id * @parent_resource_id: resource's parent id * @size_params: size parameters + * + * Generic resources should reuse the same names across drivers. + * Please see the generic resources list at: + * Documentation/networking/devlink/devlink-resource.rst */ int devlink_resource_register(struct devlink *devlink, const char *resource_name, @@ -8431,7 +9322,7 @@ __devlink_param_driverinit_value_set(struct devlink *devlink, int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, union devlink_param_value *init_val) { - if (!devlink_reload_supported(devlink)) + if (!devlink_reload_supported(devlink->ops)) return -EOPNOTSUPP; return __devlink_param_driverinit_value_get(&devlink->param_list, @@ -8478,7 +9369,7 @@ int devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port, { struct devlink *devlink = devlink_port->devlink; - if (!devlink_reload_supported(devlink)) + if (!devlink_reload_supported(devlink->ops)) return -EOPNOTSUPP; return __devlink_param_driverinit_value_get(&devlink_port->param_list, @@ -8627,6 +9518,57 @@ unlock: EXPORT_SYMBOL_GPL(devlink_region_create); /** + * devlink_port_region_create - create a new address region for a port + * + * @port: devlink port + * @ops: region operations and name + * @region_max_snapshots: Maximum supported number of snapshots for region + * @region_size: size of region + */ +struct devlink_region * +devlink_port_region_create(struct devlink_port *port, + const struct devlink_port_region_ops *ops, + u32 region_max_snapshots, u64 region_size) +{ + struct devlink *devlink = port->devlink; + struct devlink_region *region; + int err = 0; + + if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) + return ERR_PTR(-EINVAL); + + mutex_lock(&devlink->lock); + + if (devlink_port_region_get_by_name(port, ops->name)) { + err = -EEXIST; + goto unlock; + } + + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) { + err = -ENOMEM; + goto unlock; + } + + region->devlink = devlink; + region->port = port; + region->max_snapshots = region_max_snapshots; + region->port_ops = ops; + region->size = region_size; + 
INIT_LIST_HEAD(&region->snapshot_list); + list_add_tail(&region->list, &port->region_list); + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); + + mutex_unlock(&devlink->lock); + return region; + +unlock: + mutex_unlock(&devlink->lock); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(devlink_port_region_create); + +/** * devlink_region_destroy - destroy address region * * @region: devlink region to destroy @@ -8803,6 +9745,24 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), DEVLINK_TRAP(EARLY_DROP, DROP), + DEVLINK_TRAP(VXLAN_PARSING, DROP), + DEVLINK_TRAP(LLC_SNAP_PARSING, DROP), + DEVLINK_TRAP(VLAN_PARSING, DROP), + DEVLINK_TRAP(PPPOE_PPP_PARSING, DROP), + DEVLINK_TRAP(MPLS_PARSING, DROP), + DEVLINK_TRAP(ARP_PARSING, DROP), + DEVLINK_TRAP(IP_1_PARSING, DROP), + DEVLINK_TRAP(IP_N_PARSING, DROP), + DEVLINK_TRAP(GRE_PARSING, DROP), + DEVLINK_TRAP(UDP_PARSING, DROP), + DEVLINK_TRAP(TCP_PARSING, DROP), + DEVLINK_TRAP(IPSEC_PARSING, DROP), + DEVLINK_TRAP(SCTP_PARSING, DROP), + DEVLINK_TRAP(DCCP_PARSING, DROP), + DEVLINK_TRAP(GTP_PARSING, DROP), + DEVLINK_TRAP(ESP_PARSING, DROP), + DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), + DEVLINK_TRAP(DMAC_FILTER, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8837,6 +9797,7 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(PTP_GENERAL), DEVLINK_TRAP_GROUP(ACL_SAMPLE), DEVLINK_TRAP_GROUP(ACL_TRAP), + DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) @@ -9139,20 +10100,19 @@ devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats, } static void -devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata, - const struct devlink_trap_item *trap_item, - struct devlink_port *in_devlink_port, - const struct flow_action_cookie *fa_cookie) +devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata, + const struct devlink_trap_item *trap_item, + struct devlink_port *in_devlink_port, + const struct flow_action_cookie *fa_cookie) { - struct devlink_trap_group_item *group_item = trap_item->group_item; - - hw_metadata->trap_group_name = group_item->group->name; - hw_metadata->trap_name = trap_item->trap->name; - hw_metadata->fa_cookie = fa_cookie; + metadata->trap_name = trap_item->trap->name; + metadata->trap_group_name = trap_item->group_item->group->name; + metadata->fa_cookie = fa_cookie; + metadata->trap_type = trap_item->trap->type; spin_lock(&in_devlink_port->type_lock); if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH) - hw_metadata->input_dev = in_devlink_port->type_dev; + metadata->input_dev = in_devlink_port->type_dev; spin_unlock(&in_devlink_port->type_lock); } @@ -9170,21 +10130,17 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, { struct devlink_trap_item *trap_item = trap_ctx; - struct net_dm_hw_metadata hw_metadata = {}; devlink_trap_stats_update(trap_item->stats, skb->len); devlink_trap_stats_update(trap_item->group_item->stats, skb->len); - /* Control packets were not dropped by the device or encountered an - * exception during forwarding and therefore should not be reported to - * the kernel's drop monitor. 
- */ - if (trap_item->trap->type == DEVLINK_TRAP_TYPE_CONTROL) - return; + if (trace_devlink_trap_report_enabled()) { + struct devlink_trap_metadata metadata = {}; - devlink_trap_report_metadata_fill(&hw_metadata, trap_item, - in_devlink_port, fa_cookie); - net_dm_hw_report(skb, &hw_metadata); + devlink_trap_report_metadata_set(&metadata, trap_item, + in_devlink_port, fa_cookie); + trace_devlink_trap_report(devlink, skb, &metadata); + } } EXPORT_SYMBOL_GPL(devlink_trap_report); @@ -9543,6 +10499,7 @@ out: int devlink_compat_flash_update(struct net_device *dev, const char *file_name) { + struct devlink_flash_update_params params = {}; struct devlink *devlink; int ret; @@ -9555,10 +10512,18 @@ int devlink_compat_flash_update(struct net_device *dev, const char *file_name) goto out; } + ret = request_firmware(&params.fw, file_name, devlink->dev); + if (ret) + goto out; + mutex_lock(&devlink->lock); - ret = devlink->ops->flash_update(devlink, file_name, NULL, NULL); + devlink_flash_update_begin_notify(devlink); + ret = devlink->ops->flash_update(devlink, &params, NULL); + devlink_flash_update_end_notify(devlink); mutex_unlock(&devlink->lock); + release_firmware(params.fw); + out: rtnl_lock(); dev_put(dev); @@ -9605,6 +10570,7 @@ int devlink_compat_switch_id_get(struct net_device *dev, static void __net_exit devlink_pernet_pre_exit(struct net *net) { struct devlink *devlink; + u32 actions_performed; int err; /* In case network namespace is getting destroyed, reload @@ -9613,9 +10579,12 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (net_eq(devlink_net(devlink), net)) { - if (WARN_ON(!devlink_reload_supported(devlink))) + if (WARN_ON(!devlink_reload_supported(devlink->ops))) continue; - err = devlink_reload(devlink, &init_net, NULL); + err = devlink_reload(devlink, &init_net, + DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_LIMIT_UNSPEC, + &actions_performed, NULL); if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); } diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 9704522b0872..571f191c06d9 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -26,13 +26,14 @@ #include <linux/bitops.h> #include <linux/slab.h> #include <linux/module.h> -#include <net/drop_monitor.h> #include <net/genetlink.h> #include <net/netevent.h> #include <net/flow_offload.h> +#include <net/devlink.h> #include <trace/events/skb.h> #include <trace/events/napi.h> +#include <trace/events/devlink.h> #include <asm/unaligned.h> @@ -114,13 +115,14 @@ struct net_dm_alert_ops { int work, int budget); void (*work_item_func)(struct work_struct *work); void (*hw_work_item_func)(struct work_struct *work); - void (*hw_probe)(struct sk_buff *skb, - const struct net_dm_hw_metadata *hw_metadata); + void (*hw_trap_probe)(void *ignore, const struct devlink *devlink, + struct sk_buff *skb, + const struct devlink_trap_metadata *metadata); }; struct net_dm_skb_cb { union { - struct net_dm_hw_metadata *hw_metadata; + struct devlink_trap_metadata *hw_metadata; void *pc; }; }; @@ -432,8 +434,9 @@ out: } static void -net_dm_hw_summary_probe(struct sk_buff *skb, - const struct net_dm_hw_metadata *hw_metadata) +net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, + struct sk_buff *skb, + const struct devlink_trap_metadata *metadata) { struct net_dm_hw_entries *hw_entries; struct net_dm_hw_entry *hw_entry; @@ -441,6 +444,9 @@ 
net_dm_hw_summary_probe(struct sk_buff *skb, unsigned long flags; int i; + if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) + return; + hw_data = this_cpu_ptr(&dm_hw_cpu_data); spin_lock_irqsave(&hw_data->lock, flags); hw_entries = hw_data->hw_entries; @@ -450,7 +456,7 @@ net_dm_hw_summary_probe(struct sk_buff *skb, for (i = 0; i < hw_entries->num_entries; i++) { hw_entry = &hw_entries->entries[i]; - if (!strncmp(hw_entry->trap_name, hw_metadata->trap_name, + if (!strncmp(hw_entry->trap_name, metadata->trap_name, NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) { hw_entry->count++; goto out; @@ -460,7 +466,7 @@ net_dm_hw_summary_probe(struct sk_buff *skb, goto out; hw_entry = &hw_entries->entries[hw_entries->num_entries]; - strlcpy(hw_entry->trap_name, hw_metadata->trap_name, + strlcpy(hw_entry->trap_name, metadata->trap_name, NET_DM_MAX_HW_TRAP_NAME_LEN - 1); hw_entry->count = 1; hw_entries->num_entries++; @@ -479,7 +485,7 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = { .napi_poll_probe = trace_napi_poll_hit, .work_item_func = send_dm_alert, .hw_work_item_func = net_dm_hw_summary_work, - .hw_probe = net_dm_hw_summary_probe, + .hw_trap_probe = net_dm_hw_trap_summary_probe, }; static void net_dm_packet_trace_kfree_skb_hit(void *ignore, @@ -705,7 +711,7 @@ static void net_dm_packet_work(struct work_struct *work) } static size_t -net_dm_flow_action_cookie_size(const struct net_dm_hw_metadata *hw_metadata) +net_dm_flow_action_cookie_size(const struct devlink_trap_metadata *hw_metadata) { return hw_metadata->fa_cookie ? nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0; @@ -713,7 +719,7 @@ net_dm_flow_action_cookie_size(const struct net_dm_hw_metadata *hw_metadata) static size_t net_dm_hw_packet_report_size(size_t payload_len, - const struct net_dm_hw_metadata *hw_metadata) + const struct devlink_trap_metadata *hw_metadata) { size_t size; @@ -743,7 +749,7 @@ net_dm_hw_packet_report_size(size_t payload_len, static int net_dm_hw_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { - struct net_dm_hw_metadata *hw_metadata; + struct devlink_trap_metadata *hw_metadata; struct nlattr *attr; void *hdr; @@ -810,56 +816,56 @@ nla_put_failure: return -EMSGSIZE; } -static struct net_dm_hw_metadata * -net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata) +static struct devlink_trap_metadata * +net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) { const struct flow_action_cookie *fa_cookie; - struct net_dm_hw_metadata *n_hw_metadata; + struct devlink_trap_metadata *hw_metadata; const char *trap_group_name; const char *trap_name; - n_hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC); - if (!n_hw_metadata) + hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC); + if (!hw_metadata) return NULL; - trap_group_name = kstrdup(hw_metadata->trap_group_name, GFP_ATOMIC); + trap_group_name = kstrdup(metadata->trap_group_name, GFP_ATOMIC); if (!trap_group_name) goto free_hw_metadata; - n_hw_metadata->trap_group_name = trap_group_name; + hw_metadata->trap_group_name = trap_group_name; - trap_name = kstrdup(hw_metadata->trap_name, GFP_ATOMIC); + trap_name = kstrdup(metadata->trap_name, GFP_ATOMIC); if (!trap_name) goto free_trap_group; - n_hw_metadata->trap_name = trap_name; + hw_metadata->trap_name = trap_name; - if (hw_metadata->fa_cookie) { + if (metadata->fa_cookie) { size_t cookie_size = sizeof(*fa_cookie) + - hw_metadata->fa_cookie->cookie_len; + metadata->fa_cookie->cookie_len; - fa_cookie = kmemdup(hw_metadata->fa_cookie, 
cookie_size, + fa_cookie = kmemdup(metadata->fa_cookie, cookie_size, GFP_ATOMIC); if (!fa_cookie) goto free_trap_name; - n_hw_metadata->fa_cookie = fa_cookie; + hw_metadata->fa_cookie = fa_cookie; } - n_hw_metadata->input_dev = hw_metadata->input_dev; - if (n_hw_metadata->input_dev) - dev_hold(n_hw_metadata->input_dev); + hw_metadata->input_dev = metadata->input_dev; + if (hw_metadata->input_dev) + dev_hold(hw_metadata->input_dev); - return n_hw_metadata; + return hw_metadata; free_trap_name: kfree(trap_name); free_trap_group: kfree(trap_group_name); free_hw_metadata: - kfree(n_hw_metadata); + kfree(hw_metadata); return NULL; } static void -net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata) +net_dm_hw_metadata_free(const struct devlink_trap_metadata *hw_metadata) { if (hw_metadata->input_dev) dev_put(hw_metadata->input_dev); @@ -871,7 +877,7 @@ net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata) static void net_dm_hw_packet_report(struct sk_buff *skb) { - struct net_dm_hw_metadata *hw_metadata; + struct devlink_trap_metadata *hw_metadata; struct sk_buff *msg; size_t payload_len; int rc; @@ -924,15 +930,19 @@ static void net_dm_hw_packet_work(struct work_struct *work) } static void -net_dm_hw_packet_probe(struct sk_buff *skb, - const struct net_dm_hw_metadata *hw_metadata) +net_dm_hw_trap_packet_probe(void *ignore, const struct devlink *devlink, + struct sk_buff *skb, + const struct devlink_trap_metadata *metadata) { - struct net_dm_hw_metadata *n_hw_metadata; + struct devlink_trap_metadata *n_hw_metadata; ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *hw_data; struct sk_buff *nskb; unsigned long flags; + if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) + return; + if (!skb_mac_header_was_set(skb)) return; @@ -940,7 +950,7 @@ net_dm_hw_packet_probe(struct sk_buff *skb, if (!nskb) return; - n_hw_metadata = net_dm_hw_metadata_clone(hw_metadata); + n_hw_metadata = net_dm_hw_metadata_copy(metadata); if (!n_hw_metadata) goto free; @@ -975,7 +985,7 @@ static const struct net_dm_alert_ops net_dm_alert_packet_ops = { .napi_poll_probe = net_dm_packet_trace_napi_poll_hit, .work_item_func = net_dm_packet_work, .hw_work_item_func = net_dm_hw_packet_work, - .hw_probe = net_dm_hw_packet_probe, + .hw_trap_probe = net_dm_hw_trap_packet_probe, }; static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = { @@ -983,25 +993,32 @@ static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = { [NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops, }; -void net_dm_hw_report(struct sk_buff *skb, - const struct net_dm_hw_metadata *hw_metadata) +#if IS_ENABLED(CONFIG_NET_DEVLINK) +static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) { - rcu_read_lock(); - - if (!monitor_hw) - goto out; + return register_trace_devlink_trap_report(ops->hw_trap_probe, NULL); +} - net_dm_alert_ops_arr[net_dm_alert_mode]->hw_probe(skb, hw_metadata); +static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) +{ + unregister_trace_devlink_trap_report(ops->hw_trap_probe, NULL); + tracepoint_synchronize_unregister(); +} +#else +static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) +{ + return -EOPNOTSUPP; +} -out: - rcu_read_unlock(); +static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) +{ } -EXPORT_SYMBOL_GPL(net_dm_hw_report); +#endif static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; - int cpu; + int cpu, rc; if (monitor_hw) { 
NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled"); @@ -1025,13 +1042,24 @@ static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack) kfree(hw_entries); } + rc = net_dm_hw_probe_register(ops); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to devlink_trap_probe() tracepoint"); + goto err_module_put; + } + monitor_hw = true; return 0; + +err_module_put: + module_put(THIS_MODULE); + return rc; } static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) { + const struct net_dm_alert_ops *ops; int cpu; if (!monitor_hw) { @@ -1039,12 +1067,11 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) return; } + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + monitor_hw = false; - /* After this call returns we are guaranteed that no CPU is processing - * any hardware drops. - */ - synchronize_rcu(); + net_dm_hw_probe_unregister(ops); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); @@ -1053,7 +1080,7 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) del_timer_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { - struct net_dm_hw_metadata *hw_metadata; + struct devlink_trap_metadata *hw_metadata; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; net_dm_hw_metadata_free(hw_metadata); @@ -1548,7 +1575,7 @@ static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = { [NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG }, }; -static const struct genl_ops dropmon_ops[] = { +static const struct genl_small_ops dropmon_ops[] = { { .cmd = NET_DM_CMD_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -1598,8 +1625,8 @@ static struct genl_family net_drop_monitor_family __ro_after_init = { .pre_doit = net_dm_nl_pre_doit, .post_doit = net_dm_nl_post_doit, .module = THIS_MODULE, - .ops = dropmon_ops, - .n_ops = ARRAY_SIZE(dropmon_ops), + .small_ops = dropmon_ops, + .n_small_ops = ARRAY_SIZE(dropmon_ops), .mcgrps = dropmon_mcgrps, .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 7bcfb16854cb..cd80ffed6d26 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -563,7 +563,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; nlrule->iifindex = -1; - nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->iifname); if (dev) nlrule->iifindex = dev->ifindex; @@ -573,7 +573,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; nlrule->oifindex = -1; - nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->oifname); if (dev) nlrule->oifindex = dev->ifindex; diff --git a/net/core/filter.c b/net/core/filter.c index b5f3faac5e3b..adfdad234674 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -76,6 +76,10 @@ #include <net/bpf_sk_storage.h> #include <net/transp_v6.h> #include <linux/btf_ids.h> +#include <net/tls.h> + +static const struct bpf_func_proto * +bpf_sk_base_func_proto(enum bpf_func_id func_id); int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { @@ -2079,13 +2083,13 @@ static const struct bpf_func_proto bpf_csum_level_proto = { static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { - return 
dev_forward_skb(dev, skb); + return dev_forward_skb_nomtu(dev, skb); } static inline int __bpf_rx_skb_no_mac(struct net_device *dev, struct sk_buff *skb) { - int ret = ____dev_forward_skb(dev, skb); + int ret = ____dev_forward_skb(dev, skb, false); if (likely(!ret)) { skb->dev = dev; @@ -2160,13 +2164,266 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, return __bpf_redirect_no_mac(skb, dev, flags); } +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) +{ + u32 hh_len = LL_RESERVED_SPACE(dev); + const struct in6_addr *nexthop; + struct dst_entry *dst = NULL; + struct neighbour *neigh; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + goto out_drop; + } + + skb->dev = dev; + skb->tstamp = 0; + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + struct sk_buff *skb2; + + skb2 = skb_realloc_headroom(skb, hh_len); + if (unlikely(!skb2)) { + kfree_skb(skb); + return -ENOMEM; + } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + consume_skb(skb); + skb = skb2; + } + + rcu_read_lock_bh(); + if (!nh) { + dst = skb_dst(skb); + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + &ipv6_hdr(skb)->daddr); + } else { + nexthop = &nh->ipv6_nh; + } + neigh = ip_neigh_gw6(dev, nexthop); + if (likely(!IS_ERR(neigh))) { + int ret; + + sock_confirm_neigh(skb, neigh); + dev_xmit_recursion_inc(); + ret = neigh_output(neigh, skb, false); + dev_xmit_recursion_dec(); + rcu_read_unlock_bh(); + return ret; + } + rcu_read_unlock_bh(); + if (dst) + IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); +out_drop: + kfree_skb(skb); + return -ENETDOWN; +} + +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct net *net = dev_net(dev); + int err, ret = NET_XMIT_DROP; + + if (!nh) { + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowi6_mark = skb->mark, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_oif = dev->ifindex, + .flowi6_proto = ip6h->nexthdr, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + }; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + goto out_drop; + + skb_dst_set(skb, dst); + } else if (nh->nh_family != AF_INET6) { + goto out_drop; + } + + err = bpf_out_neigh_v6(net, skb, dev, nh); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out_xmit; +out_drop: + dev->stats.tx_errors++; + kfree_skb(skb); +out_xmit: + return ret; +} +#else +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} +#endif /* CONFIG_IPV6 */ + +#if IS_ENABLED(CONFIG_INET) +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) +{ + u32 hh_len = LL_RESERVED_SPACE(dev); + struct neighbour *neigh; + bool is_v6gw = false; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + goto out_drop; + } + + skb->dev = dev; + skb->tstamp = 0; + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + struct sk_buff *skb2; + + skb2 = skb_realloc_headroom(skb, hh_len); + if (unlikely(!skb2)) { + kfree_skb(skb); + return -ENOMEM; + 
} + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + consume_skb(skb); + skb = skb2; + } + + rcu_read_lock_bh(); + if (!nh) { + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = container_of(dst, struct rtable, dst); + + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + } else if (nh->nh_family == AF_INET6) { + neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); + is_v6gw = true; + } else if (nh->nh_family == AF_INET) { + neigh = ip_neigh_gw4(dev, nh->ipv4_nh); + } else { + rcu_read_unlock_bh(); + goto out_drop; + } + + if (likely(!IS_ERR(neigh))) { + int ret; + + sock_confirm_neigh(skb, neigh); + dev_xmit_recursion_inc(); + ret = neigh_output(neigh, skb, is_v6gw); + dev_xmit_recursion_dec(); + rcu_read_unlock_bh(); + return ret; + } + rcu_read_unlock_bh(); +out_drop: + kfree_skb(skb); + return -ENETDOWN; +} + +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + const struct iphdr *ip4h = ip_hdr(skb); + struct net *net = dev_net(dev); + int err, ret = NET_XMIT_DROP; + + if (!nh) { + struct flowi4 fl4 = { + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_oif = dev->ifindex, + .flowi4_proto = ip4h->protocol, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + struct rtable *rt; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto out_drop; + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto out_drop; + } + + skb_dst_set(skb, &rt->dst); + } + + err = bpf_out_neigh_v4(net, skb, dev, nh); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out_xmit; +out_drop: + dev->stats.tx_errors++; + kfree_skb(skb); +out_xmit: + return ret; +} +#else +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} +#endif /* CONFIG_INET */ + +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + struct ethhdr *ethh = eth_hdr(skb); + + if (unlikely(skb->mac_header >= skb->network_header)) + goto out; + bpf_push_mac_rcsum(skb); + if (is_multicast_ether_addr(ethh->h_dest)) + goto out; + + skb_pull(skb, sizeof(*ethh)); + skb_unset_mac_header(skb); + skb_reset_network_header(skb); + + if (skb->protocol == htons(ETH_P_IP)) + return __bpf_redirect_neigh_v4(skb, dev, nh); + else if (skb->protocol == htons(ETH_P_IPV6)) + return __bpf_redirect_neigh_v6(skb, dev, nh); +out: + kfree_skb(skb); + return -ENOTSUPP; +} + +/* Internal, non-exposed redirect flags. 
*/ +enum { + BPF_F_NEIGH = (1ULL << 1), + BPF_F_PEER = (1ULL << 2), + BPF_F_NEXTHOP = (1ULL << 3), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) +}; + BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; struct sk_buff *clone; int ret; - if (unlikely(flags & ~(BPF_F_INGRESS))) + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); @@ -2203,11 +2460,46 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); +int skb_do_redirect(struct sk_buff *skb) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct net *net = dev_net(skb->dev); + struct net_device *dev; + u32 flags = ri->flags; + + dev = dev_get_by_index_rcu(net, ri->tgt_index); + ri->tgt_index = 0; + ri->flags = 0; + if (unlikely(!dev)) + goto out_drop; + if (flags & BPF_F_PEER) { + const struct net_device_ops *ops = dev->netdev_ops; + + if (unlikely(!ops->ndo_get_peer_dev || + !skb_at_tc_ingress(skb))) + goto out_drop; + dev = ops->ndo_get_peer_dev(dev); + if (unlikely(!dev || + !(dev->flags & IFF_UP) || + net_eq(net, dev_net(dev)))) + goto out_drop; + skb->dev = dev; + return -EAGAIN; + } + return flags & BPF_F_NEIGH ? + __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? + &ri->nh : NULL) : + __bpf_redirect(skb, dev, flags); +out_drop: + kfree_skb(skb); + return -EINVAL; +} + BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - if (unlikely(flags & ~(BPF_F_INGRESS))) + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return TC_ACT_SHOT; ri->flags = flags; @@ -2216,29 +2508,63 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) return TC_ACT_REDIRECT; } -int skb_do_redirect(struct sk_buff *skb) +static const struct bpf_func_proto bpf_redirect_proto = { + .func = bpf_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct net_device *dev; - dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); - ri->tgt_index = 0; - if (unlikely(!dev)) { - kfree_skb(skb); - return -EINVAL; - } + if (unlikely(flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_PEER; + ri->tgt_index = ifindex; - return __bpf_redirect(skb, dev, ri->flags); + return TC_ACT_REDIRECT; } -static const struct bpf_func_proto bpf_redirect_proto = { - .func = bpf_redirect, +static const struct bpf_func_proto bpf_redirect_peer_proto = { + .func = bpf_redirect_peer, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, + int, plen, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely((plen && plen < sizeof(*params)) || flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_NEIGH | (plen ? 
BPF_F_NEXTHOP : 0); + ri->tgt_index = ifindex; + + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); + if (plen) + memcpy(&ri->nh, params, sizeof(ri->nh)); + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_neigh_proto = { + .func = bpf_redirect_neigh, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) { msg->apply_bytes = bytes; @@ -2704,6 +3030,23 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb) +{ + struct sock *sk = skb_to_full_sk(skb); + + if (!sk || !sk_fullsock(sk)) + return 0; + + return sock_cgroup_classid(&sk->sk_cgrp_data); +} + +static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = { + .func = bpf_skb_cgroup_classid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; #endif BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) @@ -3209,18 +3552,56 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, return 0; } -static u32 __bpf_skb_max_len(const struct sk_buff *skb) +#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC + +BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) { - return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len : - SKB_MAX_ALLOC; + u32 len_diff_abs = abs(len_diff); + bool shrink = len_diff < 0; + int ret = 0; + + if (unlikely(flags || mode)) + return -EINVAL; + if (unlikely(len_diff_abs > 0xfffU)) + return -EFAULT; + + if (!shrink) { + ret = skb_cow(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + __skb_push(skb, len_diff_abs); + memset(skb->data, 0, len_diff_abs); + } else { + if (unlikely(!pskb_may_pull(skb, len_diff_abs))) + return -ENOMEM; + __skb_pull(skb, len_diff_abs); + } + bpf_compute_data_end_sk_skb(skb); + if (tls_sw_has_ctx_rx(skb->sk)) { + struct strp_msg *rxm = strp_msg(skb); + + rxm->full_len += len_diff; + } + return ret; } +static const struct bpf_func_proto sk_skb_adjust_room_proto = { + .func = sk_skb_adjust_room, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); - u32 len_max = __bpf_skb_max_len(skb); + u32 len_max = BPF_SKB_MAX_LEN; __be16 proto = skb->protocol; bool shrink = len_diff < 0; u32 off; @@ -3303,7 +3684,7 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, u64 flags) { - u32 max_len = __bpf_skb_max_len(skb); + u32 max_len = BPF_SKB_MAX_LEN; u32 min_len = __bpf_skb_min_len(skb); int ret; @@ -3379,7 +3760,7 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = { static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { - u32 max_len = __bpf_skb_max_len(skb); + u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; @@ -3803,19 +4184,18 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; 
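The redirect_neigh and redirect_peer helpers added above are intended as drop-in alternatives to bpf_redirect() for tc (BPF_PROG_TYPE_SCHED_CLS) programs: redirect_neigh resolves the L2 address through the neighbour subsystem (optionally taking an explicit struct bpf_redir_neigh nexthop), while redirect_peer hands the skb to the peer of a device implementing ndo_get_peer_dev (e.g. veth) in another netns. A minimal, purely illustrative sketch of the FIB-resolved form -- assuming an egress device index of IFINDEX_UPLINK and a libbpf recent enough to provide the four-argument helper definition and the SEC("tc") convention -- might look like:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch only, not part of this patch. */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#define IFINDEX_UPLINK 2	/* assumption: ifindex of the egress device */

SEC("tc")
int redirect_uplink(struct __sk_buff *skb)
{
	/* params == NULL, plen == 0: the kernel performs the FIB lookup on
	 * the skb itself and fills in the Ethernet header from the neighbour
	 * table. Returns TC_ACT_REDIRECT on success, TC_ACT_SHOT on error.
	 */
	return bpf_redirect_neigh(IFINDEX_UPLINK, NULL, 0, 0);
}

char _license[] SEC("license") = "GPL";

Such an object would typically be attached at tc ingress with something along the lines of "tc filter add dev eth0 ingress bpf da obj prog.o sec tc".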
-BTF_ID_LIST(bpf_skb_output_btf_ids) -BTF_ID(struct, sk_buff) +BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff) const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_skb_output_btf_ids, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -4086,18 +4466,17 @@ static inline u64 __bpf_sk_cgroup_id(struct sock *sk) { struct cgroup *cgrp; + sk = sk_to_full_sk(sk); + if (!sk || !sk_fullsock(sk)) + return 0; + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return cgroup_id(cgrp); } BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { - struct sock *sk = skb_to_full_sk(skb); - - if (!sk || !sk_fullsock(sk)) - return 0; - - return __bpf_sk_cgroup_id(sk); + return __bpf_sk_cgroup_id(skb->sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4113,6 +4492,10 @@ static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, struct cgroup *ancestor; struct cgroup *cgrp; + sk = sk_to_full_sk(sk); + if (!sk || !sk_fullsock(sk)) + return 0; + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) @@ -4124,12 +4507,7 @@ static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, ancestor_level) { - struct sock *sk = skb_to_full_sk(skb); - - if (!sk || !sk_fullsock(sk)) - return 0; - - return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); + return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { @@ -4149,7 +4527,7 @@ static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { .func = bpf_sk_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCKET, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) @@ -4161,7 +4539,7 @@ static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { .func = bpf_sk_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCKET, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, }; #endif @@ -4199,24 +4577,23 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -BTF_ID_LIST(bpf_xdp_output_btf_ids) -BTF_ID(struct, xdp_buff) +BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_xdp_output_btf_ids, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) { - return skb->sk ? sock_gen_cookie(skb->sk) : 0; + return skb->sk ? 
__sock_gen_cookie(skb->sk) : 0; } static const struct bpf_func_proto bpf_get_socket_cookie_proto = { @@ -4228,7 +4605,7 @@ static const struct bpf_func_proto bpf_get_socket_cookie_proto = { BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { - return sock_gen_cookie(ctx->sk); + return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { @@ -4240,7 +4617,7 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx) { - return sock_gen_cookie(ctx); + return __sock_gen_cookie(ctx); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { @@ -4250,9 +4627,21 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) +{ + return sk ? sock_gen_cookie(sk) : 0; +} + +const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { + .func = bpf_get_socket_ptr_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { - return sock_gen_cookie(ctx->sk); + return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { @@ -4264,11 +4653,9 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { static u64 __bpf_get_netns_cookie(struct sock *sk) { -#ifdef CONFIG_NET_NS - return net_gen_cookie(sk ? sk->sk_net.net : &init_net); -#else - return 0; -#endif + const struct net *net = sk ? sock_net(sk) : &init_net; + + return net->net_cookie; } BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) @@ -4313,10 +4700,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -#define SOCKOPT_CC_REINIT (1 << 0) - static int _bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen, u32 flags) + char *optval, int optlen) { char devname[IFNAMSIZ]; int val, valbool; @@ -4354,7 +4739,8 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); - sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val; + sk->sk_max_pacing_rate = (val == ~0U) ? 
+ ~0UL : (unsigned int)val; sk->sk_pacing_rate = min(sk->sk_pacing_rate, sk->sk_max_pacing_rate); break; @@ -4390,6 +4776,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, ifindex = dev->ifindex; dev_put(dev); } + fallthrough; + case SO_BINDTOIFINDEX: + if (optname == SO_BINDTOIFINDEX) + ifindex = val; ret = sock_bindtoindex(sk, ifindex, false); break; case SO_KEEPALIVE: @@ -4449,16 +4839,15 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, - reinit, true); + ret = tcp_set_congestion_control(sk, name, false, true); } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; if (optlen != sizeof(int)) return -EINVAL; @@ -4480,6 +4869,20 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, tp->snd_ssthresh = val; } break; + case TCP_BPF_DELACK_MAX: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_DELACK_MAX || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_delack_max = timeout; + break; + case TCP_BPF_RTO_MIN: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_RTO_MIN || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_rto_min = timeout; + break; case TCP_SAVE_SYN: if (val < 0 || val > 1) ret = -EINVAL; @@ -4513,6 +4916,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, else icsk->icsk_user_timeout = val; break; + case TCP_NOTSENT_LOWAT: + tp->notsent_lowat = val; + sk->sk_write_space(sk); + break; + case TCP_WINDOW_CLAMP: + ret = tcp_set_window_clamp(sk, val); + break; default: ret = -EINVAL; } @@ -4532,8 +4942,25 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, sock_owned_by_me(sk); + if (level == SOL_SOCKET) { + if (optlen != sizeof(int)) + goto err_clear; + + switch (optname) { + case SO_MARK: + *((int *)optval) = sk->sk_mark; + break; + case SO_PRIORITY: + *((int *)optval) = sk->sk_priority; + break; + case SO_BINDTOIFINDEX: + *((int *)optval) = sk->sk_bound_dev_if; + break; + default: + goto err_clear; + } #ifdef CONFIG_INET - if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { + } else if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { struct inet_connection_sock *icsk; struct tcp_sock *tp; @@ -4550,9 +4977,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, tp = tcp_sk(sk); if (optlen <= 0 || !tp->saved_syn || - optlen > tp->saved_syn[0]) + optlen > tcp_saved_syn_len(tp->saved_syn)) goto err_clear; - memcpy(optval, tp->saved_syn + 1, optlen); + memcpy(optval, tp->saved_syn->data, optlen); break; default: goto err_clear; @@ -4587,11 +5014,11 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, goto err_clear; } #endif +#endif } else { goto err_clear; } return 0; -#endif err_clear: memset(optval, 0, optlen); return -EINVAL; @@ -4600,9 +5027,7 @@ err_clear: BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { - u32 flags = 0; - return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, - flags); + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { @@ 
-4636,11 +5061,7 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { - u32 flags = 0; - if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) - flags |= SOCKOPT_CC_REINIT; - return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, - flags); + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { @@ -4654,9 +5075,99 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, + int optname, const u8 **start) +{ + struct sk_buff *syn_skb = bpf_sock->syn_skb; + const u8 *hdr_start; + int ret; + + if (syn_skb) { + /* sk is a request_sock here */ + + if (optname == TCP_BPF_SYN) { + hdr_start = syn_skb->data; + ret = tcp_hdrlen(syn_skb); + } else if (optname == TCP_BPF_SYN_IP) { + hdr_start = skb_network_header(syn_skb); + ret = skb_network_header_len(syn_skb) + + tcp_hdrlen(syn_skb); + } else { + /* optname == TCP_BPF_SYN_MAC */ + hdr_start = skb_mac_header(syn_skb); + ret = skb_mac_header_len(syn_skb) + + skb_network_header_len(syn_skb) + + tcp_hdrlen(syn_skb); + } + } else { + struct sock *sk = bpf_sock->sk; + struct saved_syn *saved_syn; + + if (sk->sk_state == TCP_NEW_SYN_RECV) + /* synack retransmit. bpf_sock->syn_skb will + * not be available. It has to resort to + * saved_syn (if it is saved). + */ + saved_syn = inet_reqsk(sk)->saved_syn; + else + saved_syn = tcp_sk(sk)->saved_syn; + + if (!saved_syn) + return -ENOENT; + + if (optname == TCP_BPF_SYN) { + hdr_start = saved_syn->data + + saved_syn->mac_hdrlen + + saved_syn->network_hdrlen; + ret = saved_syn->tcp_hdrlen; + } else if (optname == TCP_BPF_SYN_IP) { + hdr_start = saved_syn->data + + saved_syn->mac_hdrlen; + ret = saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; + } else { + /* optname == TCP_BPF_SYN_MAC */ + + /* TCP_SAVE_SYN may not have saved the mac hdr */ + if (!saved_syn->mac_hdrlen) + return -ENOENT; + + hdr_start = saved_syn->data; + ret = saved_syn->mac_hdrlen + + saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; + } + } + + *start = hdr_start; + return ret; +} + BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && + optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { + int ret, copy_len = 0; + const u8 *start; + + ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start); + if (ret > 0) { + copy_len = ret; + if (optlen < copy_len) { + copy_len = optlen; + ret = -ENOSPC; + } + + memcpy(optval, start, copy_len); + } + + /* Zero out unused buffer at the end */ + memset(optval + copy_len, 0, optlen - copy_len); + + return ret; + } + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); } @@ -4788,13 +5299,14 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, const struct neighbour *neigh, - const struct net_device *dev) + const struct net_device *dev, u32 mtu) { memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; - params->ifindex = dev->ifindex; + if (mtu) + params->mtu_result = mtu; /* union with 
tot_len */ return 0; } @@ -4810,8 +5322,8 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct net_device *dev; struct fib_result res; struct flowi4 fl4; + u32 mtu = 0; int err; - u32 mtu; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -4878,8 +5390,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); - if (params->tot_len > mtu) + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } } nhc = res.nhc; @@ -4891,6 +5405,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, dev = nhc->nhc_dev; params->rt_metric = res.fi->fib_priority; + params->ifindex = dev->ifindex; /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here @@ -4912,7 +5427,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; - return bpf_fib_set_fwd_params(params, neigh, dev); + return bpf_fib_set_fwd_params(params, neigh, dev, mtu); } #endif @@ -4929,7 +5444,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct flowi6 fl6; int strict = 0; int oif, err; - u32 mtu; + u32 mtu = 0; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) @@ -5004,8 +5519,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src); - if (params->tot_len > mtu) + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } } if (res.nh->fib_nh_lws) @@ -5016,6 +5533,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, dev = res.nh->fib_nh_dev; params->rt_metric = res.f6i->fib6_metric; + params->ifindex = dev->ifindex; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. 
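The tot_len/mtu_result union used in these fib-lookup hunks is driven from the BPF side: a non-zero tot_len asks the kernel to compare it against the route MTU, and on BPF_FIB_LKUP_RET_FRAG_NEEDED the same field comes back holding that MTU. A hedged sketch of a tc program exercising this -- AF_INET is defined locally only to keep the fragment self-contained, and the IPv4 parsing is deliberately minimal -- might read:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch only, not part of this patch. */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#define AF_INET 2	/* normally from <sys/socket.h>; repeated here for self-containment */

SEC("tc")
int fib_mtu_check(struct __sk_buff *skb)
{
	void *data_end = (void *)(long)skb->data_end;
	void *data = (void *)(long)skb->data;
	struct ethhdr *eth = data;
	struct iphdr *iph = data + sizeof(*eth);
	struct bpf_fib_lookup params = {};
	int rc;

	if ((void *)(iph + 1) > data_end || eth->h_proto != bpf_htons(ETH_P_IP))
		return TC_ACT_OK;

	params.family   = AF_INET;
	params.ifindex  = skb->ingress_ifindex;
	params.ipv4_src = iph->saddr;
	params.ipv4_dst = iph->daddr;
	params.tot_len  = bpf_ntohs(iph->tot_len);	/* non-zero => kernel performs the MTU check */

	rc = bpf_fib_lookup(skb, &params, sizeof(params), 0);
	if (rc == BPF_FIB_LKUP_RET_FRAG_NEEDED) {
		/* The same union member now carries the MTU of the FIB result. */
		bpf_printk("frag needed, route mtu %d", params.mtu_result);
		return TC_ACT_SHOT;
	}
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";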
@@ -5024,7 +5542,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; - return bpf_fib_set_fwd_params(params, neigh, dev); + return bpf_fib_set_fwd_params(params, neigh, dev, mtu); } #endif @@ -5067,6 +5585,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, { struct net *net = dev_net(skb->dev); int rc = -EAFNOSUPPORT; + bool check_mtu = false; if (plen < sizeof(*params)) return -EINVAL; @@ -5074,25 +5593,33 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) return -EINVAL; + if (params->tot_len) + check_mtu = true; + switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - rc = bpf_ipv4_fib_lookup(net, params, flags, false); + rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - rc = bpf_ipv6_fib_lookup(net, params, flags, false); + rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); break; #endif } - if (!rc) { + if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { struct net_device *dev; + /* When tot_len isn't provided by user, check skb + * against MTU of FIB lookup resulting net_device + */ dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; + + params->mtu_result = dev->mtu; /* union with tot_len */ } return rc; @@ -5108,6 +5635,116 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .arg4_type = ARG_ANYTHING, }; +static struct net_device *__dev_via_ifindex(struct net_device *dev_curr, + u32 ifindex) +{ + struct net *netns = dev_net(dev_curr); + + /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */ + if (ifindex == 0) + return dev_curr; + + return dev_get_by_index_rcu(netns, ifindex); +} + +BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + struct net_device *dev = skb->dev; + int skb_len, dev_len; + int mtu; + + if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) + return -EINVAL; + + if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + dev_len = mtu + dev->hard_header_len; + skb_len = skb->len + len_diff; /* minus result pass check */ + if (skb_len <= dev_len) { + ret = BPF_MTU_CHK_RET_SUCCESS; + goto out; + } + /* At this point, skb->len exceed MTU, but as it include length of all + * segments, it can still be below MTU. The SKB can possibly get + * re-segmented in transmit path (see validate_xmit_skb). Thus, user + * must choose if segs are to be MTU checked. 
+ */ + if (skb_is_gso(skb)) { + ret = BPF_MTU_CHK_RET_SUCCESS; + + if (flags & BPF_MTU_CHK_SEGS && + !skb_gso_validate_network_len(skb, mtu)) + ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + } +out: + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + struct net_device *dev = xdp->rxq->dev; + int xdp_len = xdp->data_end - xdp->data; + int ret = BPF_MTU_CHK_RET_SUCCESS; + int mtu, dev_len; + + /* XDP variant doesn't support multi-buffer segment check (yet) */ + if (unlikely(flags)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + /* Add L2-header as dev MTU is L3 size */ + dev_len = mtu + dev->hard_header_len; + + xdp_len += len_diff; /* minus result pass check */ + if (xdp_len > dev_len) + ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +static const struct bpf_func_proto bpf_skb_check_mtu_proto = { + .func = bpf_skb_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { + .func = bpf_xdp_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) { @@ -5601,7 +6238,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { BPF_CALL_1(bpf_sk_release, struct sock *, sk) { - if (sk_is_refcounted(sk)) + if (sk && sk_is_refcounted(sk)) sock_gen_put(sk); return 0; } @@ -5610,7 +6247,7 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .func = bpf_sk_release, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, @@ -5992,7 +6629,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len u32 cookie; int ret; - if (unlikely(th_len < sizeof(*th))) + if (unlikely(!sk || th_len < sizeof(*th))) return -EINVAL; /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. 
*/ @@ -6045,7 +6682,7 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .gpl_only = true, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM, @@ -6059,7 +6696,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, u32 cookie; u16 mss; - if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) @@ -6114,7 +6751,7 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM, @@ -6123,7 +6760,7 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) { - if (flags != 0) + if (!sk || flags != 0) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EOPNOTSUPP; @@ -6147,7 +6784,233 @@ static const struct bpf_func_proto bpf_sk_assign_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg3_type = ARG_ANYTHING, +}; + +static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend, + u8 search_kind, const u8 *magic, + u8 magic_len, bool *eol) +{ + u8 kind, kind_len; + + *eol = false; + + while (op < opend) { + kind = op[0]; + + if (kind == TCPOPT_EOL) { + *eol = true; + return ERR_PTR(-ENOMSG); + } else if (kind == TCPOPT_NOP) { + op++; + continue; + } + + if (opend - op < 2 || opend - op < op[1] || op[1] < 2) + /* Something is wrong in the received header. + * Follow the TCP stack's tcp_parse_options() + * and just bail here. + */ + return ERR_PTR(-EFAULT); + + kind_len = op[1]; + if (search_kind == kind) { + if (!magic_len) + return op; + + if (magic_len > kind_len - 2) + return ERR_PTR(-ENOMSG); + + if (!memcmp(&op[2], magic, magic_len)) + return op; + } + + op += kind_len; + } + + return ERR_PTR(-ENOMSG); +} + +BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + void *, search_res, u32, len, u64, flags) +{ + bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN; + const u8 *op, *opend, *magic, *search = search_res; + u8 search_kind, search_len, copy_len, magic_len; + int ret; + + /* 2 byte is the minimal option len except TCPOPT_NOP and + * TCPOPT_EOL which are useless for the bpf prog to learn + * and this helper disallow loading them also. + */ + if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN) + return -EINVAL; + + search_kind = search[0]; + search_len = search[1]; + + if (search_len > len || search_kind == TCPOPT_NOP || + search_kind == TCPOPT_EOL) + return -EINVAL; + + if (search_kind == TCPOPT_EXP || search_kind == 253) { + /* 16 or 32 bit magic. 
+2 for kind and kind length */ + if (search_len != 4 && search_len != 6) + return -EINVAL; + magic = &search[2]; + magic_len = search_len - 2; + } else { + if (search_len) + return -EINVAL; + magic = NULL; + magic_len = 0; + } + + if (load_syn) { + ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op); + if (ret < 0) + return ret; + + opend = op + ret; + op += sizeof(struct tcphdr); + } else { + if (!bpf_sock->skb || + bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) + /* This bpf_sock->op cannot call this helper */ + return -EPERM; + + opend = bpf_sock->skb_data_end; + op = bpf_sock->skb->data + sizeof(struct tcphdr); + } + + op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len, + &eol); + if (IS_ERR(op)) + return PTR_ERR(op); + + copy_len = op[1]; + ret = copy_len; + if (copy_len > len) { + ret = -ENOSPC; + copy_len = len; + } + + memcpy(search_res, op, copy_len); + return ret; +} + +static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { + .func = bpf_sock_ops_load_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + const void *, from, u32, len, u64, flags) +{ + u8 new_kind, new_kind_len, magic_len = 0, *opend; + const u8 *op, *new_op, *magic = NULL; + struct sk_buff *skb; + bool eol; + + if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB) + return -EPERM; + + if (len < 2 || flags) + return -EINVAL; + + new_op = from; + new_kind = new_op[0]; + new_kind_len = new_op[1]; + + if (new_kind_len > len || new_kind == TCPOPT_NOP || + new_kind == TCPOPT_EOL) + return -EINVAL; + + if (new_kind_len > bpf_sock->remaining_opt_len) + return -ENOSPC; + + /* 253 is another experimental kind */ + if (new_kind == TCPOPT_EXP || new_kind == 253) { + if (new_kind_len < 4) + return -EINVAL; + /* Match for the 2 byte magic also. + * RFC 6994: the magic could be 2 or 4 bytes. + * Hence, matching by 2 byte only is on the + * conservative side but it is the right + * thing to do for the 'search-for-duplication' + * purpose. + */ + magic = &new_op[2]; + magic_len = 2; + } + + /* Check for duplication */ + skb = bpf_sock->skb; + op = skb->data + sizeof(struct tcphdr); + opend = bpf_sock->skb_data_end; + + op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len, + &eol); + if (!IS_ERR(op)) + return -EEXIST; + + if (PTR_ERR(op) != -ENOMSG) + return PTR_ERR(op); + + if (eol) + /* The option has been ended. Treat it as no more + * header option can be written. + */ + return -ENOSPC; + + /* No duplication found. Store the header option. 
*/ + memcpy(opend, from, new_kind_len); + + bpf_sock->remaining_opt_len -= new_kind_len; + bpf_sock->skb_data_end += new_kind_len; + + return 0; +} + +static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { + .func = bpf_sock_ops_store_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + u32, len, u64, flags) +{ + if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB) + return -EPERM; + + if (flags || len < 2) + return -EINVAL; + + if (len > bpf_sock->remaining_opt_len) + return -ENOSPC; + + bpf_sock->remaining_opt_len -= len; + + return 0; +} + +static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { + .func = bpf_sock_ops_reserve_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; @@ -6164,6 +7027,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_change_tail || func == sk_skb_change_tail || func == bpf_skb_adjust_room || + func == sk_skb_adjust_room || func == bpf_skb_pull_data || func == sk_skb_pull_data || func == bpf_clone_redirect || @@ -6180,6 +7044,9 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_lwt_seg6_adjust_srh || func == bpf_lwt_seg6_action || #endif +#ifdef CONFIG_INET + func == bpf_sock_ops_store_hdr_opt || +#endif func == bpf_lwt_in_push_encap || func == bpf_lwt_xmit_push_encap) return true; @@ -6283,22 +7150,42 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: return &bpf_sock_addr_setsockopt_proto; default: return NULL; } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: return &bpf_sock_addr_getsockopt_proto; default: return NULL; } default: - return bpf_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id); } } @@ -6317,7 +7204,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: - return bpf_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id); } } @@ -6419,6 +7306,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; + case BPF_FUNC_redirect_neigh: + return &bpf_redirect_neigh_proto; + case BPF_FUNC_redirect_peer: + return &bpf_redirect_peer_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case 
BPF_FUNC_get_hash_recalc: @@ -6439,6 +7330,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_skb_check_mtu_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: @@ -6449,6 +7342,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_skb_cgroup_classid: + return &bpf_skb_cgroup_classid_proto; +#endif #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; @@ -6478,7 +7375,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_assign_proto; #endif default: - return bpf_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id); } } @@ -6504,6 +7401,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_xdp_check_mtu_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_udp: return &bpf_xdp_sk_lookup_udp_proto; @@ -6519,7 +7418,7 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_gen_syncookie_proto; #endif default: - return bpf_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id); } } @@ -6551,11 +7450,17 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; #ifdef CONFIG_INET + case BPF_FUNC_load_hdr_opt: + return &bpf_sock_ops_load_hdr_opt_proto; + case BPF_FUNC_store_hdr_opt: + return &bpf_sock_ops_store_hdr_opt_proto; + case BPF_FUNC_reserve_hdr_opt: + return &bpf_sock_ops_reserve_hdr_opt_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: - return bpf_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id); } } @@ -6601,7 +7506,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cgroup_classid_curr_proto; #endif default: - return bpf_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id); } } @@ -6622,6 +7527,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &sk_skb_change_head_proto; + case BPF_FUNC_skb_adjust_room: + return &sk_skb_adjust_room_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: @@ -6643,7 +7550,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_lookup_tcp_proto; #endif default: - return bpf_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id); } } @@ -6654,7 +7561,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: - return bpf_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id); } } @@ -6681,7 +7588,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: - return bpf_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id); } } @@ -7350,6 +8257,20 @@ static bool 
sock_ops_is_valid_access(int off, int size, return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; + case offsetof(struct bpf_sock_ops, skb_data): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct bpf_sock_ops, skb_data_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sock_ops, skb_tcp_flags): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, + size_default); default: if (size != size_default) return false; @@ -8046,7 +8967,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, target_size)); break; case offsetof(struct bpf_sock, rx_queue_mapping): -#ifdef CONFIG_XPS +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), si->dst_reg, si->src_reg, @@ -8451,17 +9372,22 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; switch (si->off) { - case offsetof(struct bpf_sock_ops, op) ... + case offsetof(struct bpf_sock_ops, op): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + op), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, op)); + break; + + case offsetof(struct bpf_sock_ops, replylong[0]) ... offsetof(struct bpf_sock_ops, replylong[3]): - BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, op) != - sizeof_field(struct bpf_sock_ops_kern, op)); BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) != sizeof_field(struct bpf_sock_ops_kern, reply)); BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) != sizeof_field(struct bpf_sock_ops_kern, replylong)); off = si->off; - off -= offsetof(struct bpf_sock_ops, op); - off += offsetof(struct bpf_sock_ops_kern, op); + off -= offsetof(struct bpf_sock_ops, replylong[0]); + off += offsetof(struct bpf_sock_ops_kern, replylong[0]); if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, off); @@ -8682,6 +9608,49 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, sk): SOCK_OPS_GET_SK(); break; + case offsetof(struct bpf_sock_ops, skb_data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb_data_end), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb_data_end)); + break; + case offsetof(struct bpf_sock_ops, skb_data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->dst_reg, + offsetof(struct sk_buff, data)); + break; + case offsetof(struct bpf_sock_ops, skb_len): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + si->dst_reg, si->dst_reg, + offsetof(struct sk_buff, len)); + break; + case offsetof(struct bpf_sock_ops, skb_tcp_flags): + off = offsetof(struct sk_buff, cb); + off += offsetof(struct tcp_skb_cb, tcp_flags); + *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, 
si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, + tcp_flags), + si->dst_reg, si->dst_reg, off); + break; } return insn - insn_buf; } @@ -9356,7 +10325,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: - return bpf_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id); } } @@ -9506,17 +10475,6 @@ BTF_SOCK_TYPE_xxx u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; #endif -static bool check_arg_btf_id(u32 btf_id, u32 arg) -{ - int i; - - /* only one argument, no need to check arg */ - for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) - if (btf_sock_ids[i] == btf_id) - return true; - return false; -} - BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) { /* tcp6_sock type is not generated in dwarf and hence btf, @@ -9534,8 +10492,7 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .func = bpf_skc_to_tcp6_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, - .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; @@ -9551,8 +10508,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { .func = bpf_skc_to_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, - .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], }; @@ -9581,8 +10537,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { .func = bpf_skc_to_tcp_timewait_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, - .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], }; @@ -9605,8 +10560,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .func = bpf_skc_to_tcp_request_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, - .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; @@ -9627,7 +10581,55 @@ const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .func = bpf_skc_to_udp6_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, - .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; + +BPF_CALL_1(bpf_sock_from_file, struct file *, file) +{ + return (unsigned long)sock_from_file(file); +} + +BTF_ID_LIST(bpf_sock_from_file_btf_ids) +BTF_ID(struct, socket) +BTF_ID(struct, file) + +const struct bpf_func_proto bpf_sock_from_file_proto = { + .func = bpf_sock_from_file, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &bpf_sock_from_file_btf_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], +}; + +static const struct bpf_func_proto * +bpf_sk_base_func_proto(enum bpf_func_id func_id) +{ + const struct bpf_func_proto *func; + + switch (func_id) { + case BPF_FUNC_skc_to_tcp6_sock: + func = &bpf_skc_to_tcp6_sock_proto; + break; + case BPF_FUNC_skc_to_tcp_sock: + func = &bpf_skc_to_tcp_sock_proto; + break; + case BPF_FUNC_skc_to_tcp_timewait_sock: + func = &bpf_skc_to_tcp_timewait_sock_proto; + break; + case BPF_FUNC_skc_to_tcp_request_sock: + func = &bpf_skc_to_tcp_request_sock_proto; + break; + case 
BPF_FUNC_skc_to_udp6_sock: + func = &bpf_skc_to_udp6_sock_proto; + break; + default: + return bpf_base_func_proto(func_id); + } + + if (!perfmon_capable()) + return NULL; + + return func; +} diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 29806eb765cf..2ef2224b3bff 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -23,6 +23,7 @@ #include <linux/if_ether.h> #include <linux/mpls.h> #include <linux/tcp.h> +#include <linux/ptp_classify.h> #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> #include <uapi/linux/batadv_packet.h> @@ -48,7 +49,7 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, memset(flow_dissector, 0, sizeof(*flow_dissector)); for (i = 0; i < key_count; i++, key++) { - /* User should make sure that every key target offset is withing + /* User should make sure that every key target offset is within * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); @@ -236,9 +237,8 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - u16 *ctinfo_map, - size_t mapsize) + void *target_container, u16 *ctinfo_map, + size_t mapsize, bool post_ct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; @@ -250,13 +250,19 @@ skb_flow_dissect_ct(const struct sk_buff *skb, return; ct = nf_ct_get(skb, &ctinfo); - if (!ct) + if (!ct && !post_ct) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CT, target_container); + if (!ct) { + key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_INVALID; + return; + } + if (ctinfo < mapsize) key->ct_state = ctinfo_map[ctinfo]; #if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) @@ -932,8 +938,14 @@ bool __skb_flow_dissect(const struct net *net, int offset = 0; ops = skb->dev->dsa_ptr->tag_ops; - if (ops->flow_dissect && - !ops->flow_dissect(skb, &proto, &offset)) { + /* Tail taggers don't break flow dissection */ + if (!ops->tail_tag) { + if (ops->flow_dissect) + ops->flow_dissect(skb, &proto, &offset); + else + dsa_tag_generic_flow_dissect(skb, + &proto, + &offset); hlen -= offset; nhoff += offset; } @@ -1044,6 +1056,9 @@ proto_again: key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } + __skb_flow_dissect_ipv4(skb, flow_dissector, + target_container, data, iph); + if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; @@ -1060,9 +1075,6 @@ proto_again: } } - __skb_flow_dissect_ipv4(skb, flow_dissector, - target_container, data, iph); - break; } case htons(ETH_P_IPV6): { @@ -1245,6 +1257,21 @@ proto_again: &proto, &nhoff, hlen, flags); break; + case htons(ETH_P_1588): { + struct ptp_header *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + nhoff += ntohs(hdr->message_length); + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index d4474c812b64..715b67f6c62f 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -381,10 +381,8 @@ static void __flow_block_indr_cleanup(void (*release)(void *cb_priv), list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) { if (this->release == release && - this->indr.cb_priv == cb_priv) { + this->indr.cb_priv == cb_priv) list_move(&this->indr.list, cleanup_list); - return; - } } } diff 
--git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 80dbf2f4016e..8e582e29a41e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -80,11 +80,11 @@ static void est_timer(struct timer_list *t) u64 rate, brate; est_fetch_counters(est, &b); - brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); - brate -= (est->avbps >> est->ewma_log); + brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); + brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); - rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); - rate -= (est->avpps >> est->ewma_log); + rate = (b.packets - est->last_packets) << (10 - est->intvl_log); + rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); est->avbps += brate; @@ -143,6 +143,9 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (parm->interval < -2 || parm->interval > 3) return -EINVAL; + if (parm->ewma_log == 0 || parm->ewma_log >= 31) + return -EINVAL; + est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) return -ENOBUFS; diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index e095fb871d91..6eb2e5ec2c50 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -99,9 +99,14 @@ void gro_cells_destroy(struct gro_cells *gcells) struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); napi_disable(&cell->napi); - netif_napi_del(&cell->napi); + __netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } + /* This barrier is needed because netpoll could access dev->napi_list + * under rcu protection. + */ + synchronize_net(); + free_percpu(gcells->cells); gcells->cells = NULL; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 7d3438215f32..2f7940bcf715 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -39,12 +39,11 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, { int ret; - /* Preempt disable is needed to protect per-cpu redirect_info between - * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and - * access to maps strictly require a rcu_read_lock() for protection, - * mixing with BH RCU lock doesn't work. + /* Migration disable and BH disable are needed to protect per-cpu + * redirect_info between BPF prog and skb_do_redirect(). */ - preempt_disable(); + migrate_disable(); + local_bh_disable(); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); @@ -78,7 +77,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, break; } - preempt_enable(); + local_bh_enable(); + migrate_enable(); return ret; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8e39e28b0a8d..e2982b3970b8 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -41,7 +41,6 @@ #include <trace/events/neigh.h> -#define DEBUG #define NEIGH_DEBUG 1 #define neigh_dbg(level, fmt, ...) 
\ do { \ @@ -235,6 +234,8 @@ static int neigh_forced_gc(struct neigh_table *tbl) write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || + (tbl->is_multicast && + tbl->is_multicast(n->primary_key)) || time_after(tref, n->updated)) remove = true; write_unlock(&n->lock); @@ -1243,13 +1244,14 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, old = neigh->nud_state; err = -EPERM; - if (!(flags & NEIGH_UPDATE_F_ADMIN) && - (old & (NUD_NOARP | NUD_PERMANENT))) - goto out; if (neigh->dead) { NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); + new = old; goto out; } + if (!(flags & NEIGH_UPDATE_F_ADMIN) && + (old & (NUD_NOARP | NUD_PERMANENT))) + goto out; ext_learn_change = neigh_update_ext_learned(neigh, flags, ¬ify); @@ -1567,10 +1569,8 @@ static void neigh_proxy_process(struct timer_list *t) void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { - unsigned long now = jiffies; - - unsigned long sched_next = now + (prandom_u32() % - NEIGH_VAR(p, PROXY_DELAY)); + unsigned long sched_next = jiffies + + prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY)); if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 6bbd06f7dc7d..c714e6a9dad4 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -116,6 +116,12 @@ static int dev_seq_show(struct seq_file *seq, void *v) return 0; } +static u32 softnet_backlog_len(struct softnet_data *sd) +{ + return skb_queue_len_lockless(&sd->input_pkt_queue) + + skb_queue_len_lockless(&sd->process_queue); +} + static struct softnet_data *softnet_get_online(loff_t *pos) { struct softnet_data *sd = NULL; @@ -159,12 +165,17 @@ static int softnet_seq_show(struct seq_file *seq, void *v) rcu_read_unlock(); #endif + /* the index is the CPU id owing this sd. 
Since offline CPUs are not + * displayed, it would be othrwise not trivial for the user-space + * mapping the data a specific CPU + */ seq_printf(seq, - "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", sd->processed, sd->dropped, sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ - sd->received_rps, flow_limit_count); + sd->received_rps, flow_limit_count, + softnet_backlog_len(sd), (int)seq->index); return 0; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index efec66fa78b7..307628fdf380 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -538,6 +538,45 @@ static ssize_t phys_switch_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_switch_id); +static ssize_t threaded_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) + ret = sprintf(buf, fmt_dec, netdev->threaded); + + rtnl_unlock(); + return ret; +} + +static int modify_napi_threaded(struct net_device *dev, unsigned long val) +{ + int ret; + + if (list_empty(&dev->napi_list)) + return -EOPNOTSUPP; + + if (val != 0 && val != 1) + return -EOPNOTSUPP; + + ret = dev_set_threaded(dev, val); + + return ret; +} + +static ssize_t threaded_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, modify_napi_threaded); +} +static DEVICE_ATTR_RW(threaded); + static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, @@ -570,6 +609,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_proto_down.attr, &dev_attr_carrier_up_count.attr, &dev_attr_carrier_down_count.attr, + &dev_attr_threaded.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); @@ -1027,7 +1067,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct kobject *kobj = &dev->_rx[i].kobj; - if (!refcount_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->ns.count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -1136,18 +1176,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; + int num_tc, tc; int index; - int tc; if (!netif_is_multiqueue(dev)) return -ENOENT; + if (!rtnl_trylock()) + return restart_syscall(); + index = get_netdev_queue_index(queue); /* If queue belongs to subordinate dev use its TC mapping */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + num_tc = dev->num_tc; tc = netdev_txq_to_tc(dev, index); + + rtnl_unlock(); + if (tc < 0) return -EINVAL; @@ -1158,8 +1205,8 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, * belongs to the root device it will be reported with just the * traffic class, so just "0" for TC 0 for example. */ - return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) : - sprintf(buf, "%u\n", tc); + return num_tc < 0 ? 
sprintf(buf, "%d%d\n", tc, num_tc) : + sprintf(buf, "%d\n", tc); } #ifdef CONFIG_XPS @@ -1317,8 +1364,8 @@ static const struct attribute_group dql_group = { static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { + int cpu, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; - int cpu, len, num_tc = 1, tc = 0; struct xps_dev_maps *dev_maps; cpumask_var_t mask; unsigned long index; @@ -1328,22 +1375,31 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; - if (num_tc < 0) - return -EINVAL; + if (num_tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_cpus_map); @@ -1366,9 +1422,15 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } rcu_read_unlock(); + rtnl_unlock(); + len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_cpus_store(struct netdev_queue *queue, @@ -1396,7 +1458,13 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } + if (!rtnl_trylock()) { + free_cpumask_var(mask); + return restart_syscall(); + } + err = netif_set_xps_queue(dev, mask, index); + rtnl_unlock(); free_cpumask_var(mask); @@ -1408,22 +1476,29 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { + int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; unsigned long *mask, index; - int j, len, num_tc = 1, tc = 0; index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { num_tc = dev->num_tc; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); - if (!mask) - return -ENOMEM; + if (!mask) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_rxqs_map); @@ -1449,10 +1524,16 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) out_no_maps: rcu_read_unlock(); + rtnl_unlock(); + len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, @@ -1478,10 +1559,17 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } + if (!rtnl_trylock()) { + bitmap_free(mask); + return restart_syscall(); + } + cpus_read_lock(); err = __netif_set_xps_queue(dev, mask, index, true); cpus_read_unlock(); + rtnl_unlock(); + bitmap_free(mask); return err ? 
: len; } @@ -1605,7 +1693,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!refcount_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->ns.count)) queue->kobj.uevent_suppress = 1; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1852,7 +1940,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!refcount_read(&dev_net(ndev)->count)) + if (!refcount_read(&dev_net(ndev)->ns.count)) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 944ab214e5ae..43b6ac4c4439 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -19,6 +19,7 @@ #include <linux/net_namespace.h> #include <linux/sched/task.h> #include <linux/uidgid.h> +#include <linux/cookie.h> #include <net/sock.h> #include <net/netlink.h> @@ -44,7 +45,7 @@ static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) }; #endif struct net init_net = { - .count = REFCOUNT_INIT(1), + .ns.count = REFCOUNT_INIT(1), .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), #ifdef CONFIG_KEYS .key_domain = &init_net_key_domain, @@ -69,19 +70,7 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem); static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; -static atomic64_t cookie_gen; - -u64 net_gen_cookie(struct net *net) -{ - while (1) { - u64 res = atomic64_read(&net->net_cookie); - - if (res) - return res; - res = atomic64_inc_return(&cookie_gen); - atomic64_cmpxchg(&net->net_cookie, 0, res); - } -} +DEFINE_COOKIE(net_cookie); static struct net_generic *net_alloc_generic(void) { @@ -248,7 +237,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; - if (refcount_read(&net->count) == 0) + if (refcount_read(&net->ns.count) == 0) return NETNSA_NSID_NOT_ASSIGNED; spin_lock_bh(&net->nsid_lock); @@ -328,9 +317,12 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) int error = 0; LIST_HEAD(net_exit_list); - refcount_set(&net->count, 1); + refcount_set(&net->ns.count, 1); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); + preempt_disable(); + net->net_cookie = gen_cookie_next(&net_cookie); + preempt_enable(); net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); @@ -1101,7 +1093,6 @@ static int __init net_ns_init(void) panic("Could not allocate generic netns"); rcu_assign_pointer(init_net.gen, ng); - net_gen_cookie(&init_net); down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 41b24cd31562..b49c57d35a88 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -68,9 +68,8 @@ struct update_classid_context { static int update_classid_sock(const void *v, struct file *file, unsigned n) { - int err; struct update_classid_context *ctx = (void *)v; - struct socket *sock = sock_from_file(file, &err); + struct socket *sock = sock_from_file(file); if (sock) { spin_lock(&cgroup_sk_update_lock); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2338753e936b..c310c7c1cef7 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -297,7 +297,7 @@ static int netpoll_owner_active(struct net_device *dev) { struct napi_struct *napi; - list_for_each_entry(napi, &dev->napi_list, dev_list) { + list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if 
(napi->poll_owner == smp_processor_id()) return 1; } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 9bd4cab7d510..99a431c56f23 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -220,8 +220,7 @@ static ssize_t write_priomap(struct kernfs_open_file *of, static int update_netprio(const void *v, struct file *file, unsigned n) { - int err; - struct socket *sock = sock_from_file(file, &err); + struct socket *sock = sock_from_file(file); if (sock) { spin_lock(&cgroup_sk_update_lock); sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, diff --git a/net/core/page_pool.c b/net/core/page_pool.c index ef98372facf6..ad8b0707af04 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -11,6 +11,8 @@ #include <linux/device.h> #include <net/page_pool.h> +#include <net/xdp.h> + #include <linux/dma-direction.h> #include <linux/dma-mapping.h> #include <linux/page-flags.h> @@ -348,46 +350,38 @@ static bool page_pool_recycle_in_cache(struct page *page, return true; } -/* page is NOT reusable when: - * 1) allocated when system is under some pressure. (page_is_pfmemalloc) - */ -static bool pool_page_reusable(struct page_pool *pool, struct page *page) -{ - return !page_is_pfmemalloc(page); -} - /* If the page refcnt == 1, this will try to recycle the page. * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +static __always_inline struct page * +__page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. * * refcnt == 1 means page_pool owns page, and can recycle it. + * + * page is NOT reusable when allocated when system is under + * some pressure. (page_is_pfmemalloc) */ - if (likely(page_ref_count(page) == 1 && - pool_page_reusable(pool, page))) { + if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { /* Read barrier done in page_ref_count / READ_ONCE */ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_serving_softirq()) - if (page_pool_recycle_in_cache(page, pool)) - return; + if (allow_direct && in_serving_softirq() && + page_pool_recycle_in_cache(page, pool)) + return NULL; - if (!page_pool_recycle_in_ring(pool, page)) { - /* Cache full, fallback to free pages */ - page_pool_return_page(pool, page); - } - return; + /* Page found as candidate for recycling */ + return page; } /* Fallback/non-XDP mode: API user have elevated refcnt. 
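
/* Illustrative sketch, not part of this diff: how a driver's NAPI receive
 * completion might hand a page back through page_pool_put_page() as
 * refactored above.  The mydrv_* name is hypothetical; only the
 * page_pool_put_page() call follows the signature in this file.
 */
static void mydrv_recycle_rx_page(struct page_pool *pool, struct page *page)
{
	/* dma_sync_size of -1 requests the maximum sync length, i.e.
	 * min(dma_sync_size, pool->max_len) when PP_FLAG_DMA_SYNC_DEV is set.
	 * allow_direct may only be true from softirq (NAPI) context, where
	 * the lockless per-CPU cache can be used, per the checks above.
	 */
	page_pool_put_page(pool, page, -1, true);
}
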
* @@ -405,9 +399,59 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, /* Do not replace this with page_pool_return_page() */ page_pool_release_page(pool, page); put_page(page); + + return NULL; +} + +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); + if (page && !page_pool_recycle_in_ring(pool, page)) { + /* Cache full, fallback to free pages */ + page_pool_return_page(pool, page); + } } EXPORT_SYMBOL(page_pool_put_page); +/* Caller must not use data area after call, as this function overwrites it */ +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ + int i, bulk_len = 0; + + for (i = 0; i < count; i++) { + struct page *page = virt_to_head_page(data[i]); + + page = __page_pool_put_page(pool, page, -1, false); + /* Approved for bulk recycling in ptr_ring cache */ + if (page) + data[bulk_len++] = page; + } + + if (unlikely(!bulk_len)) + return; + + /* Bulk producer into ptr_ring page_pool cache */ + page_pool_ring_lock(pool); + for (i = 0; i < bulk_len; i++) { + if (__ptr_ring_produce(&pool->ring, data[i])) + break; /* ring full */ + } + page_pool_ring_unlock(pool); + + /* Hopefully all pages was return into ptr_ring */ + if (likely(i == bulk_len)) + return; + + /* ptr_ring cache full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation + */ + for (; i < bulk_len; i++) + page_pool_return_page(pool, data[i]); +} +EXPORT_SYMBOL(page_pool_put_page_bulk); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 44fdbb9c6e53..3fba429f1f57 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -922,7 +922,7 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->min_pkt_size = value; pkt_dev->cur_pkt_size = value; } - sprintf(pg_result, "OK: min_pkt_size=%u", + sprintf(pg_result, "OK: min_pkt_size=%d", pkt_dev->min_pkt_size); return count; } @@ -939,7 +939,7 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->max_pkt_size = value; pkt_dev->cur_pkt_size = value; } - sprintf(pg_result, "OK: max_pkt_size=%u", + sprintf(pg_result, "OK: max_pkt_size=%d", pkt_dev->max_pkt_size); return count; } @@ -959,7 +959,7 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->max_pkt_size = value; pkt_dev->cur_pkt_size = value; } - sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size); + sprintf(pg_result, "OK: pkt_size=%d", pkt_dev->min_pkt_size); return count; } @@ -981,7 +981,7 @@ static ssize_t pktgen_if_write(struct file *file, i += len; pkt_dev->nfrags = value; - sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags); + sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags); return count; } if (!strcmp(name, "delay")) { @@ -1146,7 +1146,7 @@ static ssize_t pktgen_if_write(struct file *file, (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))))) return -ENOTSUPP; pkt_dev->burst = value < 1 ? 
1 : value; - sprintf(pg_result, "OK: burst=%d", pkt_dev->burst); + sprintf(pg_result, "OK: burst=%u", pkt_dev->burst); return count; } if (!strcmp(name, "node")) { @@ -3464,7 +3464,7 @@ static int pktgen_thread_worker(void *arg) struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; - BUG_ON(smp_processor_id() != cpu); + WARN_ON(smp_processor_id() != cpu); init_waitqueue_head(&t->queue); complete(&t->start_done); diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index d964a5147f22..e33fde06d528 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -107,6 +107,36 @@ unsigned int ptp_classify_raw(const struct sk_buff *skb) } EXPORT_SYMBOL_GPL(ptp_classify_raw); +struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type) +{ + u8 *ptr = skb_mac_header(skb); + + if (type & PTP_CLASS_VLAN) + ptr += VLAN_HLEN; + + switch (type & PTP_CLASS_PMASK) { + case PTP_CLASS_IPV4: + ptr += IPV4_HLEN(ptr) + UDP_HLEN; + break; + case PTP_CLASS_IPV6: + ptr += IP6_HLEN + UDP_HLEN; + break; + case PTP_CLASS_L2: + break; + default: + return NULL; + } + + ptr += ETH_HLEN; + + /* Ensure that the entire header is present in this packet. */ + if (ptr + sizeof(struct ptp_header) > skb->data + skb->len) + return NULL; + + return (struct ptp_header *)ptr; +} +EXPORT_SYMBOL_GPL(ptp_parse_header); + void __init ptp_classifier_init(void) { static struct sock_filter ptp_filter[] __initdata = { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 68e0682450c6..0edc0b2baaa4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -55,7 +55,7 @@ #include <net/net_namespace.h> #define RTNL_MAX_TYPE 50 -#define RTNL_SLAVE_MAX_TYPE 36 +#define RTNL_SLAVE_MAX_TYPE 40 struct rtnl_link { rtnl_doit_func doit; @@ -139,7 +139,7 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ -static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; +static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { @@ -157,7 +157,7 @@ static inline int rtm_msgindex(int msgtype) static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) { - struct rtnl_link **tab; + struct rtnl_link __rcu **tab; if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) protocol = PF_UNSPEC; @@ -166,7 +166,7 @@ static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) if (!tab) tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); - return tab[msgtype]; + return rcu_dereference_rtnl(tab[msgtype]); } static int rtnl_register_internal(struct module *owner, @@ -183,7 +183,7 @@ static int rtnl_register_internal(struct module *owner, msgindex = rtm_msgindex(msgtype); rtnl_lock(); - tab = rtnl_msg_handlers[protocol]; + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (tab == NULL) { tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); if (!tab) @@ -286,7 +286,8 @@ void rtnl_register(int protocol, int msgtype, */ int rtnl_unregister(int protocol, int msgtype) { - struct rtnl_link **tab, *link; + struct rtnl_link __rcu **tab; + struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); @@ -299,7 +300,7 @@ int rtnl_unregister(int protocol, int msgtype) return -ENOENT; } - link = tab[msgindex]; + link = rtnl_dereference(tab[msgindex]); rcu_assign_pointer(tab[msgindex], NULL); rtnl_unlock(); @@ -318,20 +319,21 @@ EXPORT_SYMBOL_GPL(rtnl_unregister); */ void rtnl_unregister_all(int protocol) { - struct rtnl_link **tab, *link; + 
struct rtnl_link __rcu **tab; + struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); - tab = rtnl_msg_handlers[protocol]; + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return; } RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { - link = tab[msgindex]; + link = rtnl_dereference(tab[msgindex]); if (!link) continue; @@ -1939,7 +1941,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; - nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); + nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } @@ -2658,7 +2660,7 @@ static int do_setlink(const struct sk_buff *skb, sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); - err = dev_set_mac_address(dev, sa, extack); + err = dev_set_mac_address_user(dev, sa, extack); kfree(sa); if (err) goto errout; @@ -2953,9 +2955,9 @@ static struct net_device *rtnl_dev_get(struct net *net, if (!ifname) { ifname = buffer; if (ifname_attr) - nla_strlcpy(ifname, ifname_attr, IFNAMSIZ); + nla_strscpy(ifname, ifname_attr, IFNAMSIZ); else if (altifname_attr) - nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ); + nla_strscpy(ifname, altifname_attr, ALTIFNAMSIZ); else return NULL; } @@ -2983,7 +2985,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; @@ -3264,7 +3266,7 @@ replay: return err; if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; @@ -3296,7 +3298,7 @@ replay: memset(linkinfo, 0, sizeof(linkinfo)); if (linkinfo[IFLA_INFO_KIND]) { - nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); + nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } else { kind[0] = '\0'; @@ -3437,26 +3439,15 @@ replay: dev->ifindex = ifm->ifi_index; - if (ops->newlink) { + if (ops->newlink) err = ops->newlink(link_net ? : net, dev, tb, data, extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. - */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED || - dev->reg_state == NETREG_UNREGISTERED) - free_netdev(dev); - goto out; - } - } else { + else err = register_netdevice(dev); - if (err < 0) { - free_netdev(dev); - goto out; - } + if (err < 0) { + free_netdev(dev); + goto out; } + err = rtnl_configure_link(dev, ifm); if (err < 0) goto out_unregister; @@ -3709,13 +3700,13 @@ static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); } -static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) +static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - struct net_device *dev; + size_t min_ifinfo_dump_size = 0; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; - u16 min_ifinfo_dump_size = 0; + struct net_device *dev; int hdrlen; /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. 
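
/* Illustrative sketch, not part of this diff: the sparse-clean access
 * pattern the rtnetlink changes above converge on.  Both levels of the
 * handler table carry __rcu annotations, so each level is loaded with an
 * explicit rcu_dereference_rtnl()/rtnl_dereference() instead of a plain
 * pointer read.  MY_FAMILY_MAX, my_handlers and struct my_link are
 * hypothetical stand-ins for the rtnetlink types.
 */
#define MY_FAMILY_MAX 7
static struct my_link __rcu * __rcu *my_handlers[MY_FAMILY_MAX + 1];

static struct my_link *my_get_link(int family, int msgindex)
{
	struct my_link __rcu **tab;

	tab = rcu_dereference_rtnl(my_handlers[family]);	/* level 1 */
	if (!tab)
		return NULL;
	return rcu_dereference_rtnl(tab[msgindex]);		/* level 2 */
}
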
*/ @@ -3735,9 +3726,8 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) */ rcu_read_lock(); for_each_netdev_rcu(net, dev) { - min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size, - if_nlmsg_size(dev, - ext_filter_mask)); + min_ifinfo_dump_size = max(min_ifinfo_dump_size, + if_nlmsg_size(dev, ext_filter_mask)); } rcu_read_unlock(); @@ -3755,7 +3745,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { - struct rtnl_link **tab; + struct rtnl_link __rcu **tab; struct rtnl_link *link; rtnl_dumpit_func dumpit; @@ -3769,7 +3759,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) if (!tab) continue; - link = tab[type]; + link = rcu_dereference_rtnl(tab[type]); if (!link) continue; @@ -5494,7 +5484,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { struct sock *rtnl; rtnl_dumpit_func dumpit; - u16 min_dump_alloc = 0; + u32 min_dump_alloc = 0; link = rtnl_get_link(family, type); if (!link || !link->dumpit) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5cd6d48bb77b..545a472273a5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -119,148 +119,75 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) skb_panic(skb, sz, addr, __func__); } -/* - * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells - * the caller if emergency pfmemalloc reserves are being used. If it is and - * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves - * may be used. Otherwise, the packet data may be discarded until enough - * memory is free - */ -#define kmalloc_reserve(size, gfp, node, pfmemalloc) \ - __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc) - -static void *__kmalloc_reserve(size_t size, gfp_t flags, int node, - unsigned long ip, bool *pfmemalloc) -{ - void *obj; - bool ret_pfmemalloc = false; +#define NAPI_SKB_CACHE_SIZE 64 +#define NAPI_SKB_CACHE_BULK 16 +#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) - /* - * Try a regular allocation, when that fails and we're not entitled - * to the reserves, fail. - */ - obj = kmalloc_node_track_caller(size, - flags | __GFP_NOMEMALLOC | __GFP_NOWARN, - node); - if (obj || !(gfp_pfmemalloc_allowed(flags))) - goto out; +struct napi_alloc_cache { + struct page_frag_cache page; + unsigned int skb_count; + void *skb_cache[NAPI_SKB_CACHE_SIZE]; +}; - /* Try again but now we are using pfmemalloc reserves */ - ret_pfmemalloc = true; - obj = kmalloc_node_track_caller(size, flags, node); +static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); -out: - if (pfmemalloc) - *pfmemalloc = ret_pfmemalloc; +static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask, + unsigned int align_mask) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return obj; + return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask); } -/* Allocate a new skbuff. We do this ourselves so we can fill in a few - * 'private' fields and also do memory statistics to find all the - * [BEEP] leaks. - * - */ - -/** - * __alloc_skb - allocate a network buffer - * @size: size to allocate - * @gfp_mask: allocation mask - * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache - * instead of head cache and allocate a cloned (child) skb. 
- * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for - * allocations in case the data is required for writeback - * @node: numa node to allocate memory on - * - * Allocate a new &sk_buff. The returned buffer has no headroom and a - * tail room of at least size bytes. The object has a reference count - * of one. The return is the buffer. On a failure the return is %NULL. - * - * Buffers may only be allocated from interrupts using a @gfp_mask of - * %GFP_ATOMIC. - */ -struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, - int flags, int node) +void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { - struct kmem_cache *cache; - struct skb_shared_info *shinfo; - struct sk_buff *skb; - u8 *data; - bool pfmemalloc; - - cache = (flags & SKB_ALLOC_FCLONE) - ? skbuff_fclone_cache : skbuff_head_cache; - - if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) - gfp_mask |= __GFP_MEMALLOC; - - /* Get the HEAD */ - skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); - if (!skb) - goto out; - prefetchw(skb); + fragsz = SKB_DATA_ALIGN(fragsz); - /* We do our best to align skb_shared_info on a separate cache - * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives - * aligned memory blocks, unless SLUB/SLAB debug is enabled. - * Both skb->head and skb_shared_info are cache line aligned. - */ - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); - if (!data) - goto nodata; - /* kmalloc(size) might give us more room than requested. - * Put skb_shared_info exactly at the end of allocated zone, - * to allow max possible filling before reallocation. - */ - size = SKB_WITH_OVERHEAD(ksize(data)); - prefetchw(data + size); + return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask); +} +EXPORT_SYMBOL(__napi_alloc_frag_align); - /* - * Only clear those fields we need to clear, not those that we will - * actually initialise below. Hence, don't put any more fields after - * the tail pointer in struct sk_buff! 
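
/* Illustrative sketch, not part of this diff: consuming the new *_align
 * frag allocators.  The plain napi_alloc_frag()/netdev_alloc_frag()
 * wrappers live in the header (not shown in this hunk) and pass ~0U,
 * i.e. no extra alignment; mydrv_rx_buf is a hypothetical helper.
 * Buffers obtained this way are released with skb_free_frag().
 */
static void *mydrv_rx_buf(unsigned int len)
{
	/* NAPI/softirq context only: uses the per-CPU napi_alloc_cache
	 * without disabling BH, matching __napi_alloc_frag_align() above.
	 */
	return __napi_alloc_frag_align(len, ~0U);
}
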
- */ - memset(skb, 0, offsetof(struct sk_buff, tail)); - /* Account for allocated memory : skb + skb->head */ - skb->truesize = SKB_TRUESIZE(size); - skb->pfmemalloc = pfmemalloc; - refcount_set(&skb->users, 1); - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->mac_header = (typeof(skb->mac_header))~0U; - skb->transport_header = (typeof(skb->transport_header))~0U; +void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) +{ + struct page_frag_cache *nc; + void *data; - /* make sure we initialize shinfo sequentially */ - shinfo = skb_shinfo(skb); - memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); - atomic_set(&shinfo->dataref, 1); + fragsz = SKB_DATA_ALIGN(fragsz); + if (in_irq() || irqs_disabled()) { + nc = this_cpu_ptr(&netdev_alloc_cache); + data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); + } else { + local_bh_disable(); + data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask); + local_bh_enable(); + } + return data; +} +EXPORT_SYMBOL(__netdev_alloc_frag_align); - if (flags & SKB_ALLOC_FCLONE) { - struct sk_buff_fclones *fclones; +static struct sk_buff *napi_skb_cache_get(void) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct sk_buff *skb; - fclones = container_of(skb, struct sk_buff_fclones, skb1); + if (unlikely(!nc->skb_count)) + nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache, + GFP_ATOMIC, + NAPI_SKB_CACHE_BULK, + nc->skb_cache); + if (unlikely(!nc->skb_count)) + return NULL; - skb->fclone = SKB_FCLONE_ORIG; - refcount_set(&fclones->fclone_ref, 1); + skb = nc->skb_cache[--nc->skb_count]; + kasan_unpoison_object_data(skbuff_head_cache, skb); - fclones->skb2.fclone = SKB_FCLONE_CLONE; - } -out: return skb; -nodata: - kmem_cache_free(cache, skb); - skb = NULL; - goto out; } -EXPORT_SYMBOL(__alloc_skb); /* Caller must provide SKB that is memset cleared */ -static struct sk_buff *__build_skb_around(struct sk_buff *skb, - void *data, unsigned int frag_size) +static void __build_skb_around(struct sk_buff *skb, void *data, + unsigned int frag_size) { struct skb_shared_info *shinfo; unsigned int size = frag_size ? : ksize(data); @@ -282,7 +209,7 @@ static struct sk_buff *__build_skb_around(struct sk_buff *skb, memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - return skb; + skb_set_kcov_handle(skb, kcov_common_handle()); } /** @@ -313,8 +240,9 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, frag_size); - return __build_skb_around(skb, data, frag_size); + return skb; } /* build_skb() is wrapper over __build_skb(), that specifically @@ -347,9 +275,9 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, if (unlikely(!skb)) return NULL; - skb = __build_skb_around(skb, data, frag_size); + __build_skb_around(skb, data, frag_size); - if (skb && frag_size) { + if (frag_size) { skb->head_frag = 1; if (page_is_pfmemalloc(virt_to_head_page(data))) skb->pfmemalloc = 1; @@ -358,56 +286,178 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, } EXPORT_SYMBOL(build_skb_around); -#define NAPI_SKB_CACHE_SIZE 64 +/** + * __napi_build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of data, or 0 if head was kmalloced + * + * Version of __build_skb() that uses NAPI percpu caches to obtain + * skbuff_head instead of inplace allocation. 
+ * + * Returns a new &sk_buff on success, %NULL on allocation failure. + */ +static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb; -struct napi_alloc_cache { - struct page_frag_cache page; - unsigned int skb_count; - void *skb_cache[NAPI_SKB_CACHE_SIZE]; -}; + skb = napi_skb_cache_get(); + if (unlikely(!skb)) + return NULL; -static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); + memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, frag_size); + + return skb; +} -static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +/** + * napi_build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of data, or 0 if head was kmalloced + * + * Version of __napi_build_skb() that takes care of skb->head_frag + * and skb->pfmemalloc when the data is a page or page fragment. + * + * Returns a new &sk_buff on success, %NULL on allocation failure. + */ +struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) { - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct sk_buff *skb = __napi_build_skb(data, frag_size); - return page_frag_alloc(&nc->page, fragsz, gfp_mask); + if (likely(skb) && frag_size) { + skb->head_frag = 1; + skb_propagate_pfmemalloc(virt_to_head_page(data), skb); + } + + return skb; } +EXPORT_SYMBOL(napi_build_skb); -void *napi_alloc_frag(unsigned int fragsz) +/* + * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells + * the caller if emergency pfmemalloc reserves are being used. If it is and + * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves + * may be used. Otherwise, the packet data may be discarded until enough + * memory is free + */ +static void *kmalloc_reserve(size_t size, gfp_t flags, int node, + bool *pfmemalloc) { - fragsz = SKB_DATA_ALIGN(fragsz); + void *obj; + bool ret_pfmemalloc = false; + + /* + * Try a regular allocation, when that fails and we're not entitled + * to the reserves, fail. + */ + obj = kmalloc_node_track_caller(size, + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, + node); + if (obj || !(gfp_pfmemalloc_allowed(flags))) + goto out; + + /* Try again but now we are using pfmemalloc reserves */ + ret_pfmemalloc = true; + obj = kmalloc_node_track_caller(size, flags, node); + +out: + if (pfmemalloc) + *pfmemalloc = ret_pfmemalloc; - return __napi_alloc_frag(fragsz, GFP_ATOMIC); + return obj; } -EXPORT_SYMBOL(napi_alloc_frag); + +/* Allocate a new skbuff. We do this ourselves so we can fill in a few + * 'private' fields and also do memory statistics to find all the + * [BEEP] leaks. + * + */ /** - * netdev_alloc_frag - allocate a page fragment - * @fragsz: fragment size + * __alloc_skb - allocate a network buffer + * @size: size to allocate + * @gfp_mask: allocation mask + * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache + * instead of head cache and allocate a cloned (child) skb. + * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for + * allocations in case the data is required for writeback + * @node: numa node to allocate memory on + * + * Allocate a new &sk_buff. The returned buffer has no headroom and a + * tail room of at least size bytes. The object has a reference count + * of one. The return is the buffer. On a failure the return is %NULL. * - * Allocates a frag from a page for receive buffer. - * Uses GFP_ATOMIC allocations. 
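
/* Illustrative sketch, not part of this diff: typical driver use of the
 * new napi_build_skb() for a frag that already holds received data.
 * The mydrv_* name, the NET_SKB_PAD headroom layout and the napi/netdev
 * handles are hypothetical; the napi_build_skb()/skb_free_frag() calls
 * match the definitions in this file.
 */
static void mydrv_rx_one(struct napi_struct *napi, struct net_device *netdev,
			 void *data, unsigned int buf_len, unsigned int pkt_len)
{
	struct sk_buff *skb;

	skb = napi_build_skb(data, buf_len);	/* head taken from a page frag */
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return;
	}

	skb_reserve(skb, NET_SKB_PAD);		/* headroom left by the driver */
	skb_put(skb, pkt_len);
	skb->protocol = eth_type_trans(skb, netdev);
	napi_gro_receive(napi, skb);
}
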
+ * Buffers may only be allocated from interrupts using a @gfp_mask of + * %GFP_ATOMIC. */ -void *netdev_alloc_frag(unsigned int fragsz) +struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + int flags, int node) { - struct page_frag_cache *nc; - void *data; + struct kmem_cache *cache; + struct sk_buff *skb; + u8 *data; + bool pfmemalloc; - fragsz = SKB_DATA_ALIGN(fragsz); - if (in_irq() || irqs_disabled()) { - nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc(nc, fragsz, GFP_ATOMIC); - } else { - local_bh_disable(); - data = __napi_alloc_frag(fragsz, GFP_ATOMIC); - local_bh_enable(); + cache = (flags & SKB_ALLOC_FCLONE) + ? skbuff_fclone_cache : skbuff_head_cache; + + if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) + gfp_mask |= __GFP_MEMALLOC; + + /* Get the HEAD */ + if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && + likely(node == NUMA_NO_NODE || node == numa_mem_id())) + skb = napi_skb_cache_get(); + else + skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); + if (unlikely(!skb)) + return NULL; + prefetchw(skb); + + /* We do our best to align skb_shared_info on a separate cache + * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives + * aligned memory blocks, unless SLUB/SLAB debug is enabled. + * Both skb->head and skb_shared_info are cache line aligned. + */ + size = SKB_DATA_ALIGN(size); + size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); + if (unlikely(!data)) + goto nodata; + /* kmalloc(size) might give us more room than requested. + * Put skb_shared_info exactly at the end of allocated zone, + * to allow max possible filling before reallocation. + */ + size = SKB_WITH_OVERHEAD(ksize(data)); + prefetchw(data + size); + + /* + * Only clear those fields we need to clear, not those that we will + * actually initialise below. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! + */ + memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, 0); + skb->pfmemalloc = pfmemalloc; + + if (flags & SKB_ALLOC_FCLONE) { + struct sk_buff_fclones *fclones; + + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + skb->fclone = SKB_FCLONE_ORIG; + refcount_set(&fclones->fclone_ref, 1); + + fclones->skb2.fclone = SKB_FCLONE_CLONE; } - return data; + + return skb; + +nodata: + kmem_cache_free(cache, skb); + return NULL; } -EXPORT_SYMBOL(netdev_alloc_frag); +EXPORT_SYMBOL(__alloc_skb); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device @@ -432,7 +482,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, len += NET_SKB_PAD; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) @@ -496,20 +550,26 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc; struct sk_buff *skb; void *data; len += NET_SKB_PAD + NET_IP_ALIGN; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. 
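
/* Illustrative sketch, not part of this diff: a copy-break receive path
 * that benefits from the "too small => kmalloc'ed skb->head" rule added
 * above.  napi_alloc_skb() is the GFP_ATOMIC wrapper around
 * __napi_alloc_skb() from the header (not shown in this hunk); the
 * mydrv_* name is hypothetical.
 */
static struct sk_buff *mydrv_copybreak(struct napi_struct *napi,
				       const void *frame, unsigned int len)
{
	struct sk_buff *skb;

	skb = napi_alloc_skb(napi, len);	/* small len => kmalloc'ed head */
	if (unlikely(!skb))
		return NULL;

	skb_put_data(skb, frame, len);		/* copy; the HW buffer is reused */
	return skb;
}
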
+ */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { - skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, + NUMA_NO_NODE); if (!skb) goto skb_fail; goto skb_success; } + nc = this_cpu_ptr(&napi_alloc_cache); len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); len = SKB_DATA_ALIGN(len); @@ -520,7 +580,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (unlikely(!data)) return NULL; - skb = __build_skb(data, len); + skb = __napi_build_skb(data, len); if (unlikely(!skb)) { skb_free_frag(data); return NULL; @@ -600,13 +660,14 @@ static void skb_release_data(struct sk_buff *skb) &shinfo->dataref)) return; + skb_zcopy_clear(skb, true); + for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i]); if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); - skb_zcopy_clear(skb, true); skb_free_head(skb); } @@ -712,11 +773,10 @@ EXPORT_SYMBOL(kfree_skb_list); * * Must only be called from net_ratelimit()-ed paths. * - * Dumps up to can_dump_full whole packets if full_pkt, headers otherwise. + * Dumps whole packets if full_pkt, only headers otherwise. */ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) { - static atomic_t can_dump_full = ATOMIC_INIT(5); struct skb_shared_info *sh = skb_shinfo(skb); struct net_device *dev = skb->dev; struct sock *sk = skb->sk; @@ -726,9 +786,6 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) int i, len, seg_len; if (full_pkt) - full_pkt = atomic_dec_if_positive(&can_dump_full) >= 0; - - if (full_pkt) len = skb->len; else len = min_t(int, skb->len, MAX_HEADER + 128); @@ -841,7 +898,7 @@ EXPORT_SYMBOL(consume_skb); #endif /** - * consume_stateless_skb - free an skbuff, assuming it is stateless + * __consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * * Alike consume_skb(), but this variant assumes that this is the last @@ -854,56 +911,48 @@ void __consume_stateless_skb(struct sk_buff *skb) kfree_skbmem(skb); } -void __kfree_skb_flush(void) +static void napi_skb_cache_put(struct sk_buff *skb) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + u32 i; - /* flush skb_cache if containing objects */ - if (nc->skb_count) { - kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count, - nc->skb_cache); - nc->skb_count = 0; - } -} - -static inline void _kfree_skb_defer(struct sk_buff *skb) -{ - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - - /* drop skb->head and call any destructors for packet */ - skb_release_all(skb); - - /* record skb to CPU local list */ + kasan_poison_object_data(skbuff_head_cache, skb); nc->skb_cache[nc->skb_count++] = skb; -#ifdef CONFIG_SLUB - /* SLUB writes into objects when freeing */ - prefetchw(skb); -#endif - - /* flush skb_cache if it is filled */ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { - kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE, - nc->skb_cache); - nc->skb_count = 0; + for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) + kasan_unpoison_object_data(skbuff_head_cache, + nc->skb_cache[i]); + + kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF, + nc->skb_cache + NAPI_SKB_CACHE_HALF); + nc->skb_count = NAPI_SKB_CACHE_HALF; } } + void __kfree_skb_defer(struct sk_buff *skb) { - _kfree_skb_defer(skb); + skb_release_all(skb); + napi_skb_cache_put(skb); } -void 
napi_consume_skb(struct sk_buff *skb, int budget) +void napi_skb_free_stolen_head(struct sk_buff *skb) { - if (unlikely(!skb)) - return; + skb_dst_drop(skb); + skb_ext_put(skb); + napi_skb_cache_put(skb); +} +void napi_consume_skb(struct sk_buff *skb, int budget) +{ /* Zero budget indicate non-NAPI context called us, like netpoll */ if (unlikely(!budget)) { dev_consume_skb_any(skb); return; } + lockdep_assert_in_softirq(); + if (!skb_unref(skb)) return; @@ -916,7 +965,8 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - _kfree_skb_defer(skb); + skb_release_all(skb); + napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); @@ -1093,7 +1143,7 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) +struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) { struct ubuf_info *uarg; struct sk_buff *skb; @@ -1113,25 +1163,26 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) return NULL; } - uarg->callback = sock_zerocopy_callback; + uarg->callback = msg_zerocopy_callback; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; + uarg->flags = SKBFL_ZEROCOPY_FRAG; refcount_set(&uarg->refcnt, 1); sock_hold(sk); return uarg; } -EXPORT_SYMBOL_GPL(sock_zerocopy_alloc); +EXPORT_SYMBOL_GPL(msg_zerocopy_alloc); static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) { return container_of((void *)uarg, struct sk_buff, cb); } -struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg) +struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg) { if (uarg) { const u32 byte_limit = 1 << 19; /* limit to a few TSO */ @@ -1163,16 +1214,16 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, /* no extra ref when appending to datagram (MSG_MORE) */ if (sk->sk_type == SOCK_STREAM) - sock_zerocopy_get(uarg); + net_zcopy_get(uarg); return uarg; } } new_alloc: - return sock_zerocopy_alloc(sk, size); + return msg_zerocopy_alloc(sk, size); } -EXPORT_SYMBOL_GPL(sock_zerocopy_realloc); +EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) { @@ -1194,7 +1245,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) return true; } -void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) +static void __msg_zerocopy_callback(struct ubuf_info *uarg) { struct sk_buff *tail, *skb = skb_from_uarg(uarg); struct sock_exterr_skb *serr; @@ -1222,7 +1273,7 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; serr->ee.ee_data = hi; serr->ee.ee_info = lo; - if (!success) + if (!uarg->zerocopy) serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; q = &sk->sk_error_queue; @@ -1241,32 +1292,28 @@ release: consume_skb(skb); sock_put(sk); } -EXPORT_SYMBOL_GPL(sock_zerocopy_callback); -void sock_zerocopy_put(struct ubuf_info *uarg) +void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { - if (uarg && refcount_dec_and_test(&uarg->refcnt)) { - if (uarg->callback) - uarg->callback(uarg, uarg->zerocopy); - else - consume_skb(skb_from_uarg(uarg)); - } + uarg->zerocopy = uarg->zerocopy & success; + + if (refcount_dec_and_test(&uarg->refcnt)) + __msg_zerocopy_callback(uarg); } -EXPORT_SYMBOL_GPL(sock_zerocopy_put); 
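
/* Illustrative sketch, not part of this diff: how a MSG_ZEROCOPY sendmsg
 * path would use the renamed helpers above.  my_queue_data() is a
 * hypothetical routine that builds skbs, attaches the uarg and transmits;
 * everything else follows the functions defined in this hunk.
 */
static int my_zerocopy_send(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct ubuf_info *uarg;
	int copied;

	/* NULL: nothing to extend, so this falls through to
	 * msg_zerocopy_alloc(), which returns with one reference held.
	 */
	uarg = msg_zerocopy_realloc(sk, len, NULL);
	if (!uarg)
		return -ENOBUFS;

	copied = my_queue_data(sk, msg, len, uarg);	/* hypothetical */
	if (copied < 0)
		/* Nothing was queued: cancel the pending notification.
		 * have_uref=true drops the reference taken at allocation,
		 * via msg_zerocopy_put_abort() just below.
		 */
		msg_zerocopy_put_abort(uarg, true);

	return copied;
}
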
+EXPORT_SYMBOL_GPL(msg_zerocopy_callback); -void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) +void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { - if (uarg) { - struct sock *sk = skb_from_uarg(uarg)->sk; + struct sock *sk = skb_from_uarg(uarg)->sk; - atomic_dec(&sk->sk_zckey); - uarg->len--; + atomic_dec(&sk->sk_zckey); + uarg->len--; - if (have_uref) - sock_zerocopy_put(uarg); - } + if (have_uref) + msg_zerocopy_callback(NULL, uarg, true); } -EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); +EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { @@ -1330,7 +1377,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, * @skb: the skb to modify * @gfp_mask: allocation priority * - * This must be called on SKBTX_DEV_ZEROCOPY skb. + * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. * It will copy all frags into kernel and drop the reference * to userspace pages. * @@ -2018,6 +2065,12 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) skb->csum = csum_block_sub(skb->csum, skb_checksum(skb, len, delta, 0), len); + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; + int offset = skb_checksum_start_offset(skb) + skb->csum_offset; + + if (offset + sizeof(__sum16) > hdlen) + return -EINVAL; } return __pskb_trim(skb, len); } @@ -3261,8 +3314,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); - skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & - SKBTX_SHARED_FRAG; + skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); @@ -3277,7 +3329,19 @@ EXPORT_SYMBOL(skb_split); */ static int skb_prepare_for_shift(struct sk_buff *skb) { - return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + int ret = 0; + + if (skb_cloned(skb)) { + /* Save and restore truesize: pskb_expand_head() may reallocate + * memory where ksize(kmalloc(S)) != ksize(kmalloc(S)), but we + * cannot change truesize at this point. 
+ */ + unsigned int save_truesize = skb->truesize; + + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + skb->truesize = save_truesize; + } + return ret; } /** @@ -3436,6 +3500,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, st->root_skb = st->cur_skb = skb; st->frag_idx = st->stepped_offset = 0; st->frag_data = NULL; + st->frag_off = 0; } EXPORT_SYMBOL(skb_prepare_seq_read); @@ -3490,14 +3555,27 @@ next_skb: st->stepped_offset += skb_headlen(st->cur_skb); while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + unsigned int pg_idx, pg_off, pg_sz; + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; - block_limit = skb_frag_size(frag) + st->stepped_offset; + pg_idx = 0; + pg_off = skb_frag_off(frag); + pg_sz = skb_frag_size(frag); + + if (skb_frag_must_loop(skb_frag_page(frag))) { + pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; + pg_off = offset_in_page(pg_off + st->frag_off); + pg_sz = min_t(unsigned int, pg_sz - st->frag_off, + PAGE_SIZE - pg_off); + } + + block_limit = pg_sz + st->stepped_offset; if (abs_offset < block_limit) { if (!st->frag_data) - st->frag_data = kmap_atomic(skb_frag_page(frag)); + st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); - *data = (u8 *) st->frag_data + skb_frag_off(frag) + + *data = (u8 *)st->frag_data + pg_off + (abs_offset - st->stepped_offset); return block_limit - abs_offset; @@ -3508,8 +3586,12 @@ next_skb: st->frag_data = NULL; } - st->frag_idx++; - st->stepped_offset += skb_frag_size(frag); + st->stepped_offset += pg_sz; + st->frag_off += pg_sz; + if (st->frag_off == skb_frag_size(frag)) { + st->frag_off = 0; + st->frag_idx++; + } } if (st->frag_data) { @@ -3649,7 +3731,8 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int delta_truesize = 0; unsigned int delta_len = 0; struct sk_buff *tail = NULL; - struct sk_buff *nskb; + struct sk_buff *nskb, *tmp; + int err; skb_push(skb, -skb_network_offset(skb) + offset); @@ -3659,11 +3742,28 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, nskb = list_skb; list_skb = list_skb->next; + err = 0; + if (skb_shared(nskb)) { + tmp = skb_clone(nskb, GFP_ATOMIC); + if (tmp) { + consume_skb(nskb); + nskb = tmp; + err = skb_unclone(nskb, GFP_ATOMIC); + } else { + err = -ENOMEM; + } + } + if (!tail) skb->next = nskb; else tail->next = nskb; + if (unlikely(err)) { + nskb->next = list_skb; + goto err_linearize; + } + tail = nskb; delta_len += nskb->len; @@ -3850,12 +3950,8 @@ normal: } hsize = skb_headlen(head_skb) - offset; - if (hsize < 0) - hsize = 0; - if (hsize > len || !sg) - hsize = len; - if (!hsize && i >= nfrags && skb_headlen(list_skb) && + if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && (skb_headlen(list_skb) == len || sg)) { BUG_ON(skb_headlen(list_skb) > len); @@ -3898,6 +3994,11 @@ normal: skb_release_head_state(nskb); __skb_push(nskb, doffset); } else { + if (hsize < 0) + hsize = 0; + if (hsize > len || !sg) + hsize = len; + nskb = __alloc_skb(hsize + doffset + headroom, GFP_ATOMIC, skb_alloc_rx_flag(head_skb), NUMA_NO_NODE); @@ -3951,8 +4052,8 @@ normal: skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); - skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & - SKBTX_SHARED_FRAG; + skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & + SKBFL_SHARED_FRAG; if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) @@ -4556,7 +4657,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) if (skb && (skb_next = skb_peek(q))) { icmp_next = 
is_icmp_err_skb(skb_next); if (icmp_next) - sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin; + sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; } spin_unlock_irqrestore(&q->lock, flags); @@ -4672,6 +4773,7 @@ err: EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); void __skb_tstamp_tx(struct sk_buff *orig_skb, + const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype) { @@ -4694,7 +4796,8 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { - skb = tcp_get_timestamping_opt_stats(sk, orig_skb); + skb = tcp_get_timestamping_opt_stats(sk, orig_skb, + ack_skb); opt_stats = true; } else #endif @@ -4723,7 +4826,7 @@ EXPORT_SYMBOL_GPL(__skb_tstamp_tx); void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps) { - return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, + return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, SCM_TSTAMP_SND); } EXPORT_SYMBOL_GPL(skb_tstamp_tx); @@ -5437,7 +5540,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb) goto err_free; skb_reset_network_header(skb); - skb_reset_transport_header(skb); + if (!skb_transport_header_was_set(skb)) + skb_reset_transport_header(skb); skb_reset_mac_len(skb); return skb; @@ -5562,6 +5666,73 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) } EXPORT_SYMBOL(skb_vlan_push); +/** + * skb_eth_pop() - Drop the Ethernet header at the head of a packet + * + * @skb: Socket buffer to modify + * + * Drop the Ethernet header of @skb. + * + * Expects that skb->data points to the mac header and that no VLAN tags are + * present. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_eth_pop(struct sk_buff *skb) +{ + if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || + skb_network_offset(skb) < ETH_HLEN) + return -EPROTO; + + skb_pull_rcsum(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + return 0; +} +EXPORT_SYMBOL(skb_eth_pop); + +/** + * skb_eth_push() - Add a new Ethernet header at the head of a packet + * + * @skb: Socket buffer to modify + * @dst: Destination MAC address of the new header + * @src: Source MAC address of the new header + * + * Prepend @skb with a new Ethernet header. + * + * Expects that skb->data points to the mac header, which must be empty. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, + const unsigned char *src) +{ + struct ethhdr *eth; + int err; + + if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) + return -EPROTO; + + err = skb_cow_head(skb, sizeof(*eth)); + if (err < 0) + return err; + + skb_push(skb, sizeof(*eth)); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + eth = eth_hdr(skb); + ether_addr_copy(eth->h_dest, dst); + ether_addr_copy(eth->h_source, src); + eth->h_proto = skb->protocol; + + skb_postpush_rcsum(skb, eth, sizeof(*eth)); + + return 0; +} +EXPORT_SYMBOL(skb_eth_push); + /* Update the ethertype of hdr and the skb csum value if required. 
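
/* Illustrative sketch, not part of this diff: rewriting the outer Ethernet
 * header of a forwarded frame with the new helpers above, roughly what an
 * openvswitch/tc-style action would do.  The new_dst/new_src parameters
 * and the my_* name are hypothetical.
 */
static int my_rewrite_eth(struct sk_buff *skb,
			  const unsigned char *new_dst,
			  const unsigned char *new_src)
{
	int err;

	err = skb_eth_pop(skb);		/* expects skb->data at the mac header */
	if (err)
		return err;

	/* ... e.g. adjust skb->protocol or encap state here ... */

	/* skb->data is now at the (empty) mac header, as skb_eth_push()
	 * requires; it prepends the new header and fixes up the csum.
	 */
	return skb_eth_push(skb, new_dst, new_src);
}
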
*/ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, __be16 ethertype) @@ -5726,6 +5897,9 @@ int skb_mpls_dec_ttl(struct sk_buff *skb) if (unlikely(!eth_p_mpls(skb->protocol))) return -EINVAL; + if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) + return -ENOMEM; + lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; if (!--ttl) @@ -5956,8 +6130,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, size = SKB_WITH_OVERHEAD(ksize(data)); memcpy((struct skb_shared_info *)(data + size), - skb_shinfo(skb), offsetof(struct skb_shared_info, - frags[skb_shinfo(skb)->nr_frags])); + skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); if (skb_orphan_frags(skb, gfp_mask)) { kfree(data); return -ENOMEM; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 649583158983..1261512d6807 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -170,10 +170,12 @@ static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, struct scatterlist *sge = sk_msg_elem(msg, i); u32 len = sge->length; - if (charge) - sk_mem_uncharge(sk, len); - if (!msg->skb) + /* When the skb owns the memory we free it from consume_skb path. */ + if (!msg->skb) { + if (charge) + sk_mem_uncharge(sk, len); put_page(sg_page(sge)); + } memset(sge, 0, sizeof(*sge)); return len; } @@ -397,28 +399,45 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) +static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, + struct sk_buff *skb) { - struct sock *sk = psock->sk; - int copied = 0, num_sge; struct sk_msg *msg; + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) + return NULL; + + if (!sk_rmem_schedule(sk, skb, skb->truesize)) + return NULL; + msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); if (unlikely(!msg)) - return -EAGAIN; - if (!sk_rmem_schedule(sk, skb, skb->len)) { - kfree(msg); - return -EAGAIN; - } + return NULL; sk_msg_init(msg); + return msg; +} + +static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, + struct sk_psock *psock, + struct sock *sk, + struct sk_msg *msg) +{ + int num_sge, copied; + + /* skb linearize may fail with ENOMEM, but lets simply try again + * later if this happens. Under memory pressure we don't want to + * drop the skb. We need to linearize the skb so that the mapping + * in skb_to_sgvec can not error. + */ + if (skb_linearize(skb)) + return -EAGAIN; num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len); if (unlikely(num_sge < 0)) { kfree(msg); return num_sge; } - sk_mem_charge(sk, skb->len); copied = skb->len; msg->sg.start = 0; msg->sg.size = copied; @@ -430,13 +449,57 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) return copied; } +static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb); + +static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) +{ + struct sock *sk = psock->sk; + struct sk_msg *msg; + + /* If we are receiving on the same sock skb->sk is already assigned, + * skip memory accounting and owner transition seeing it already set + * correctly. 
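
/* Illustrative sketch, not part of this diff: the receive-memory admission
 * test that sk_psock_create_ingress_msg() above now performs before an skb
 * is charged to the target socket.  my_can_ingress() is a hypothetical
 * standalone version of that check.
 */
static bool my_can_ingress(struct sock *sk, struct sk_buff *skb)
{
	/* hard cap first: never exceed the socket's receive buffer */
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
		return false;

	/* then ask the memory accounting layer for skb->truesize bytes */
	return sk_rmem_schedule(sk, skb, skb->truesize);
}
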
+ */ + if (unlikely(skb->sk == sk)) + return sk_psock_skb_ingress_self(psock, skb); + msg = sk_psock_create_ingress_msg(sk, skb); + if (!msg) + return -EAGAIN; + + /* This will transition ownership of the data from the socket where + * the BPF program was run initiating the redirect to the socket + * we will eventually receive this data on. The data will be released + * from skb_consume found in __tcp_bpf_recvmsg() after its been copied + * into user buffers. + */ + skb_set_owner_r(skb, sk); + return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg); +} + +/* Puts an skb on the ingress queue of the socket already assigned to the + * skb. In this case we do not need to check memory limits or skb_set_owner_r + * because the skb is already accounted for here. + */ +static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb) +{ + struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + struct sock *sk = psock->sk; + + if (unlikely(!msg)) + return -EAGAIN; + sk_msg_init(msg); + return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg); +} + static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { - if (ingress) - return sk_psock_skb_ingress(psock, skb); - else + if (!ingress) { + if (!sock_writeable(psock->sk)) + return -EAGAIN; return skb_send_sock_locked(psock->sk, skb, off, len); + } + return sk_psock_skb_ingress(psock, skb); } static void sk_psock_backlog(struct work_struct *work) @@ -494,14 +557,34 @@ end: struct sk_psock *sk_psock_init(struct sock *sk, int node) { - struct sk_psock *psock = kzalloc_node(sizeof(*psock), - GFP_ATOMIC | __GFP_NOWARN, - node); - if (!psock) - return NULL; + struct sk_psock *psock; + struct proto *prot; + + write_lock_bh(&sk->sk_callback_lock); + + if (inet_csk_has_ulp(sk)) { + psock = ERR_PTR(-EINVAL); + goto out; + } + if (sk->sk_user_data) { + psock = ERR_PTR(-EBUSY); + goto out; + } + + psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node); + if (!psock) { + psock = ERR_PTR(-ENOMEM); + goto out; + } + + prot = READ_ONCE(sk->sk_prot); psock->sk = sk; - psock->eval = __SK_NONE; + psock->eval = __SK_NONE; + psock->sk_proto = prot; + psock->saved_unhash = prot->unhash; + psock->saved_close = prot->close; + psock->saved_write_space = sk->sk_write_space; INIT_LIST_HEAD(&psock->link); spin_lock_init(&psock->link_lock); @@ -516,6 +599,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) rcu_assign_sk_user_data_nocopy(sk, psock); sock_hold(sk); +out: + write_unlock_bh(&sk->sk_callback_lock); return psock; } EXPORT_SYMBOL_GPL(sk_psock_init); @@ -584,14 +669,13 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) kfree(psock); } -void sk_psock_destroy(struct rcu_head *rcu) +static void sk_psock_destroy(struct rcu_head *rcu) { struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); INIT_WORK(&psock->gc, sk_psock_destroy_deferred); schedule_work(&psock->gc); } -EXPORT_SYMBOL_GPL(sk_psock_destroy); void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { @@ -603,6 +687,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) rcu_assign_sk_user_data(sk, NULL); if (psock->progs.skb_parser) sk_psock_stop_strp(sk, psock); + else if (psock->progs.skb_verdict) + sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); @@ -660,19 +746,8 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, struct 
sk_buff *skb) { - int ret; - - skb->sk = psock->sk; bpf_compute_data_end_sk_skb(skb); - ret = bpf_prog_run_pin_on_cpu(prog, skb); - /* strparser clones the skb before handing it to a upper layer, - * meaning skb_orphan has been called. We NULL sk on the way out - * to ensure we don't trigger a BUG_ON() in skb/sk operations - * later and because we are not charging the memory of this skb - * to any socket yet. - */ - skb->sk = NULL; - return ret; + return bpf_prog_run_pin_on_cpu(prog, skb); } static struct sk_psock *sk_psock_from_strp(struct strparser *strp) @@ -687,38 +762,35 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; - bool ingress; sk_other = tcp_skb_bpf_redirect_fetch(skb); + /* This error is a buggy BPF program, it returned a redirect + * return code, but then didn't set a redirect interface. + */ if (unlikely(!sk_other)) { kfree_skb(skb); return; } psock_other = sk_psock(sk_other); + /* This error indicates the socket is being torn down or had another + * error that caused the pipe to break. We can't send a packet on + * a socket that is in this state so we drop the skb. + */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { kfree_skb(skb); return; } - ingress = tcp_skb_bpf_ingress(skb); - if ((!ingress && sock_writeable(sk_other)) || - (ingress && - atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf)) { - if (!ingress) - skb_set_owner_w(skb, sk_other); - skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); - } else { - kfree_skb(skb); - } + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); } -static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict) +static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) { switch (verdict) { case __SK_REDIRECT: + skb_set_owner_r(skb, sk); sk_psock_skb_redirect(skb); break; case __SK_PASS: @@ -736,11 +808,17 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) rcu_read_lock(); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { + /* We skip full set_owner_r here because if we do a SK_PASS + * or SK_DROP we can skip skb memory accounting and use the + * TLS context. + */ + skb->sk = psock->sk; tcp_skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + skb->sk = NULL; } - sk_psock_tls_verdict_apply(skb, ret); + sk_psock_tls_verdict_apply(skb, psock->sk, ret); rcu_read_unlock(); return ret; } @@ -749,7 +827,9 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { + struct tcp_skb_cb *tcp; struct sock *sk_other; + int err = -EIO; switch (verdict) { case __SK_PASS: @@ -758,16 +838,24 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { goto out_free; } - if (atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf) { - struct tcp_skb_cb *tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; + tcp = TCP_SKB_CB(skb); + tcp->bpf.flags |= BPF_F_INGRESS; + + /* If the queue is empty then we can submit directly + * into the msg queue. If its not empty we have to + * queue work otherwise we may get OOO data. Otherwise, + * if sk_psock_skb_ingress errors will be handled by + * retrying later from workqueue. 
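
/* Illustrative sketch, not part of this diff: the kind of BPF verdict
 * program whose return value sk_psock_verdict_apply() and the redirect
 * path above interpret.  The map name, the "sk_skb/stream_verdict"
 * section and the libbpf-style map definition are assumptions (libbpf
 * conventions), not from this patch.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_SOCKMAP);
	__uint(max_entries, 16);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
} sock_map SEC(".maps");

SEC("sk_skb/stream_verdict")
int my_verdict(struct __sk_buff *skb)
{
	__u32 key = 0;

	if (skb->len == 0)
		return SK_DROP;

	/* deliver to the socket stored at 'key', on its ingress path
	 * (BPF_F_INGRESS maps to __SK_REDIRECT + ingress in the code above)
	 */
	return bpf_sk_redirect_map(skb, &sock_map, key, BPF_F_INGRESS);
}
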
+ */ + if (skb_queue_empty(&psock->ingress_skb)) { + err = sk_psock_skb_ingress_self(psock, skb); + } + if (err < 0) { skb_queue_tail(&psock->ingress_skb, skb); schedule_work(&psock->work); - break; } - goto out_free; + break; case __SK_REDIRECT: sk_psock_skb_redirect(skb); break; @@ -792,9 +880,9 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) kfree_skb(skb); goto out; } + skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - skb_orphan(skb); tcp_skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); @@ -817,8 +905,11 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) rcu_read_lock(); prog = READ_ONCE(psock->progs.skb_parser); - if (likely(prog)) + if (likely(prog)) { + skb->sk = psock->sk; ret = sk_psock_bpf_run(psock, prog, skb); + skb->sk = NULL; + } rcu_read_unlock(); return ret; } @@ -842,6 +933,57 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_unlock(); } +static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t orig_len) +{ + struct sock *sk = (struct sock *)desc->arg.data; + struct sk_psock *psock; + struct bpf_prog *prog; + int ret = __SK_DROP; + int len = skb->len; + + /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) { + desc->error = -ENOMEM; + return 0; + } + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + len = 0; + kfree_skb(skb); + goto out; + } + skb_set_owner_r(skb, sk); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + sk_psock_verdict_apply(psock, skb, ret); +out: + rcu_read_unlock(); + return len; +} + +static void sk_psock_verdict_data_ready(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + read_descriptor_t desc; + + if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) + return; + + desc.arg.data = sk; + desc.error = 0; + desc.count = 1; + + sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); +} + static void sk_psock_write_space(struct sock *sk) { struct sk_psock *psock; @@ -871,6 +1013,19 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) return strp_init(&psock->parser.strp, sk, &cb); } +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_verdict_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; @@ -896,3 +1051,15 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) strp_stop(&parser->strp); parser->enabled = false; } + +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + parser->enabled = false; +} diff --git a/net/core/sock.c b/net/core/sock.c index 6c5c6b18eff4..0ed98f20448a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -413,18 +413,6 @@ static int sock_set_timeout(long 
*timeo_p, sockptr_t optval, int optlen, return 0; } -static void sock_warn_obsolete_bsdism(const char *name) -{ - static int warned; - static char warncomm[TASK_COMM_LEN]; - if (strcmp(warncomm, current->comm) && warned < 5) { - strcpy(warncomm, current->comm); - pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n", - warncomm, name); - warned++; - } -} - static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { @@ -538,11 +526,17 @@ discard_and_relse: } EXPORT_SYMBOL(__sk_receive_skb); +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, + u32)); +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, + u32)); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + if (dst && dst->obsolete && + INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, + dst, cookie) == NULL) { sk_tx_queue_clear(sk); sk->sk_dst_pending_confirm = 0; RCU_INIT_POINTER(sk->sk_dst_cache, NULL); @@ -558,7 +552,9 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + if (dst && dst->obsolete && + INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, + dst, cookie) == NULL) { sk_dst_reset(sk); dst_release(dst); return NULL; @@ -769,7 +765,6 @@ static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) } else { sock_reset_flag(sk, SOCK_RCVTSTAMP); sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - sock_reset_flag(sk, SOCK_TSTAMP_NEW); } } @@ -984,7 +979,6 @@ set_sndbuf: break; case SO_BSDCOMPAT: - sock_warn_obsolete_bsdism("setsockopt"); break; case SO_PASSCRED: @@ -1007,8 +1001,6 @@ set_sndbuf: __sock_set_timestamps(sk, valbool, true, true); break; case SO_TIMESTAMPING_NEW: - sock_set_flag(sk, SOCK_TSTAMP_NEW); - fallthrough; case SO_TIMESTAMPING_OLD: if (val & ~SOF_TIMESTAMPING_MASK) { ret = -EINVAL; @@ -1037,16 +1029,14 @@ set_sndbuf: } sk->sk_tsflags = val; + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); - else { - if (optname == SO_TIMESTAMPING_NEW) - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - + else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); - } break; case SO_RCVLOWAT: @@ -1177,11 +1167,27 @@ set_sndbuf: sk->sk_ll_usec = val; } break; + case SO_PREFER_BUSY_POLL: + if (valbool && !capable(CAP_NET_ADMIN)) + ret = -EPERM; + else + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + break; + case SO_BUSY_POLL_BUDGET: + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { + ret = -EPERM; + } else { + if (val < 0 || val > U16_MAX) + ret = -EINVAL; + else + WRITE_ONCE(sk->sk_busy_poll_budget, val); + } + break; #endif case SO_MAX_PACING_RATE: { - unsigned long ulval = (val == ~0U) ? ~0UL : val; + unsigned long ulval = (val == ~0U) ? 
~0UL : (unsigned int)val; if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && @@ -1387,7 +1393,6 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_BSDCOMPAT: - sock_warn_obsolete_bsdism("getsockopt"); break; case SO_TIMESTAMP_OLD: @@ -1542,6 +1547,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_BUSY_POLL: v.val = sk->sk_ll_usec; break; + case SO_PREFER_BUSY_POLL: + v.val = READ_ONCE(sk->sk_prefer_busy_poll); + break; #endif case SO_MAX_PACING_RATE: @@ -1657,6 +1665,16 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif + + /* If we move sk_tx_queue_mapping out of the private section, + * we must check if sk_tx_queue_clear() is called after + * sock_copy() in sk_clone_lock(). + */ + BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < + offsetof(struct sock, sk_dontcopy_begin) || + offsetof(struct sock, sk_tx_queue_mapping) >= + offsetof(struct sock, sk_dontcopy_end)); + memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, @@ -1690,7 +1708,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; - sk_tx_queue_clear(sk); } return sk; @@ -1876,123 +1893,120 @@ static void sk_init_common(struct sock *sk) struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct proto *prot = READ_ONCE(sk->sk_prot); - struct sock *newsk; + struct sk_filter *filter; bool is_charged = true; + struct sock *newsk; newsk = sk_prot_alloc(prot, priority, sk->sk_family); - if (newsk != NULL) { - struct sk_filter *filter; + if (!newsk) + goto out; - sock_copy(newsk, sk); + sock_copy(newsk, sk); - newsk->sk_prot_creator = prot; + newsk->sk_prot_creator = prot; - /* SANITY */ - if (likely(newsk->sk_net_refcnt)) - get_net(sock_net(newsk)); - sk_node_init(&newsk->sk_node); - sock_lock_init(newsk); - bh_lock_sock(newsk); - newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; - newsk->sk_backlog.len = 0; + /* SANITY */ + if (likely(newsk->sk_net_refcnt)) + get_net(sock_net(newsk)); + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); + bh_lock_sock(newsk); + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_backlog.len = 0; - atomic_set(&newsk->sk_rmem_alloc, 0); - /* - * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) - */ - refcount_set(&newsk->sk_wmem_alloc, 1); - atomic_set(&newsk->sk_omem_alloc, 0); - sk_init_common(newsk); + atomic_set(&newsk->sk_rmem_alloc, 0); - newsk->sk_dst_cache = NULL; - newsk->sk_dst_pending_confirm = 0; - newsk->sk_wmem_queued = 0; - newsk->sk_forward_alloc = 0; - atomic_set(&newsk->sk_drops, 0); - newsk->sk_send_head = NULL; - newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; - atomic_set(&newsk->sk_zckey, 0); + /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ + refcount_set(&newsk->sk_wmem_alloc, 1); - sock_reset_flag(newsk, SOCK_DONE); + atomic_set(&newsk->sk_omem_alloc, 0); + sk_init_common(newsk); - /* sk->sk_memcg will be populated at accept() time */ - newsk->sk_memcg = NULL; + newsk->sk_dst_cache = NULL; + newsk->sk_dst_pending_confirm = 0; + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + atomic_set(&newsk->sk_drops, 0); + newsk->sk_send_head = NULL; + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + atomic_set(&newsk->sk_zckey, 0); - cgroup_sk_clone(&newsk->sk_cgrp_data); + 
sock_reset_flag(newsk, SOCK_DONE); - rcu_read_lock(); - filter = rcu_dereference(sk->sk_filter); - if (filter != NULL) - /* though it's an empty new sock, the charging may fail - * if sysctl_optmem_max was changed between creation of - * original socket and cloning - */ - is_charged = sk_filter_charge(newsk, filter); - RCU_INIT_POINTER(newsk->sk_filter, filter); - rcu_read_unlock(); + /* sk->sk_memcg will be populated at accept() time */ + newsk->sk_memcg = NULL; - if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { - /* We need to make sure that we don't uncharge the new - * socket if we couldn't charge it in the first place - * as otherwise we uncharge the parent's filter. - */ - if (!is_charged) - RCU_INIT_POINTER(newsk->sk_filter, NULL); - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } - RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); + cgroup_sk_clone(&newsk->sk_cgrp_data); - if (bpf_sk_storage_clone(sk, newsk)) { - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); + if (filter != NULL) + /* though it's an empty new sock, the charging may fail + * if sysctl_optmem_max was changed between creation of + * original socket and cloning + */ + is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); - /* Clear sk_user_data if parent had the pointer tagged - * as not suitable for copying when cloning. + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { + /* We need to make sure that we don't uncharge the new + * socket if we couldn't charge it in the first place + * as otherwise we uncharge the parent's filter. */ - if (sk_user_data_is_nocopy(newsk)) - newsk->sk_user_data = NULL; + if (!is_charged) + RCU_INIT_POINTER(newsk->sk_filter, NULL); + sk_free_unlock_clone(newsk); + newsk = NULL; + goto out; + } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); - newsk->sk_err = 0; - newsk->sk_err_soft = 0; - newsk->sk_priority = 0; - newsk->sk_incoming_cpu = raw_smp_processor_id(); - if (likely(newsk->sk_net_refcnt)) - sock_inuse_add(sock_net(newsk), 1); + if (bpf_sk_storage_clone(sk, newsk)) { + sk_free_unlock_clone(newsk); + newsk = NULL; + goto out; + } - /* - * Before updating sk_refcnt, we must commit prior changes to memory - * (Documentation/RCU/rculist_nulls.rst for details) - */ - smp_wmb(); - refcount_set(&newsk->sk_refcnt, 2); + /* Clear sk_user_data if parent had the pointer tagged + * as not suitable for copying when cloning. + */ + if (sk_user_data_is_nocopy(newsk)) + newsk->sk_user_data = NULL; - /* - * Increment the counter in the same struct proto as the master - * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that - * is the same as sk->sk_prot->socks, as this field was copied - * with memcpy). - * - * This _changes_ the previous behaviour, where - * tcp_create_openreq_child always was incrementing the - * equivalent to tcp_prot->socks (inet_sock_nr), so this have - * to be taken into account in all callers. 
-acme - */ - sk_refcnt_debug_inc(newsk); - sk_set_socket(newsk, NULL); - sk_tx_queue_clear(newsk); - RCU_INIT_POINTER(newsk->sk_wq, NULL); + newsk->sk_err = 0; + newsk->sk_err_soft = 0; + newsk->sk_priority = 0; + newsk->sk_incoming_cpu = raw_smp_processor_id(); + if (likely(newsk->sk_net_refcnt)) + sock_inuse_add(sock_net(newsk), 1); - if (newsk->sk_prot->sockets_allocated) - sk_sockets_allocated_inc(newsk); + /* Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.rst for details) + */ + smp_wmb(); + refcount_set(&newsk->sk_refcnt, 2); - if (sock_needs_netstamp(sk) && - newsk->sk_flags & SK_FLAGS_TIMESTAMP) - net_enable_timestamp(); - } + /* Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that + * is the same as sk->sk_prot->socks, as this field was copied + * with memcpy). + * + * This _changes_ the previous behaviour, where + * tcp_create_openreq_child always was incrementing the + * equivalent to tcp_prot->socks (inet_sock_nr), so this have + * to be taken into account in all callers. -acme + */ + sk_refcnt_debug_inc(newsk); + sk_set_socket(newsk, NULL); + sk_tx_queue_clear(newsk); + RCU_INIT_POINTER(newsk->sk_wq, NULL); + + if (newsk->sk_prot->sockets_allocated) + sk_sockets_allocated_inc(newsk); + + if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) + net_enable_timestamp(); out: return newsk; } @@ -2505,7 +2519,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -static void __lock_sock(struct sock *sk) +void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { @@ -2827,14 +2841,8 @@ EXPORT_SYMBOL(sock_no_mmap); void __receive_sock(struct file *file) { struct socket *sock; - int error; - /* - * The resulting value of "error" is ignored here since we only - * need to take action when the file is a socket and testing - * "sock" for NULL is sufficient. 
- */ - sock = sock_from_file(file, &error); + sock = sock_from_file(file); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); @@ -2961,6 +2969,13 @@ void sk_stop_timer(struct sock *sk, struct timer_list* timer) } EXPORT_SYMBOL(sk_stop_timer); +void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) +{ + if (del_timer_sync(timer)) + __sock_put(sk); +} +EXPORT_SYMBOL(sk_stop_timer_sync); + void sock_init_data(struct socket *sock, struct sock *sk) { sk_init_common(sk); @@ -3090,7 +3105,7 @@ EXPORT_SYMBOL(release_sock); * * sk_lock.slock unlocked, owned = 1, BH enabled */ -bool lock_sock_fast(struct sock *sk) +bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); @@ -3108,6 +3123,7 @@ bool lock_sock_fast(struct sock *sk) * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + __acquire(&sk->sk_lock.slock); local_bh_enable(); return true; } diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index c13ffbd33d8d..c9c45b935f99 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -11,7 +11,7 @@ #include <linux/tcp.h> #include <linux/workqueue.h> #include <linux/nospec.h> - +#include <linux/cookie.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> @@ -19,16 +19,17 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); static struct workqueue_struct *broadcast_wq; -static atomic64_t cookie_gen; -u64 sock_gen_cookie(struct sock *sk) +DEFINE_COOKIE(sock_cookie); + +u64 __sock_gen_cookie(struct sock *sk) { while (1) { u64 res = atomic64_read(&sk->sk_cookie); if (res) return res; - res = atomic64_inc_return(&cookie_gen); + res = gen_cookie_next(&sock_cookie); atomic64_cmpxchg(&sk->sk_cookie, 0, res); } } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 119f52a99dc1..d758fb83c884 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -2,6 +2,7 @@ /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/bpf.h> +#include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/errno.h> #include <linux/file.h> @@ -26,8 +27,6 @@ struct bpf_stab { static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; - u64 cost; - int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -38,29 +37,22 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - stab = kzalloc(sizeof(*stab), GFP_USER); + stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT); if (!stab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); raw_spin_lock_init(&stab->lock); - /* Make sure page count doesn't overflow. 
*/ - cost = (u64) stab->map.max_entries * sizeof(struct sock *); - err = bpf_map_charge_init(&stab->map.memory, cost); - if (err) - goto free_stab; - stab->sks = bpf_map_area_alloc(stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); - if (stab->sks) - return &stab->map; - err = -ENOMEM; - bpf_map_charge_finish(&stab->map.memory); -free_stab: - kfree(stab); - return ERR_PTR(err); + if (!stab->sks) { + kfree(stab); + return ERR_PTR(-ENOMEM); + } + + return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) @@ -147,8 +139,8 @@ static void sock_map_add_link(struct sk_psock *psock, static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { + bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; - bool strp_stop = false; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { @@ -158,14 +150,19 @@ static void sock_map_del_link(struct sock *sk, map); if (psock->parser.enabled && stab->progs.skb_parser) strp_stop = true; + if (psock->parser.enabled && stab->progs.skb_verdict) + verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } } spin_unlock_bh(&psock->link_lock); - if (strp_stop) { + if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); - sk_psock_stop_strp(sk, psock); + if (strp_stop) + sk_psock_stop_strp(sk, psock); + else + sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); } } @@ -184,8 +181,6 @@ static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { struct proto *prot; - sock_owned_by_me(sk); - switch (sk->sk_type) { case SOCK_STREAM: prot = tcp_bpf_get_proto(sk, psock); @@ -231,20 +226,21 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, { struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; struct sk_psock *psock; - bool skb_progs; int ret; skb_verdict = READ_ONCE(progs->skb_verdict); - skb_parser = READ_ONCE(progs->skb_parser); - skb_progs = skb_parser && skb_verdict; - if (skb_progs) { + if (skb_verdict) { skb_verdict = bpf_prog_inc_not_zero(skb_verdict); if (IS_ERR(skb_verdict)) return PTR_ERR(skb_verdict); + } + + skb_parser = READ_ONCE(progs->skb_parser); + if (skb_parser) { skb_parser = bpf_prog_inc_not_zero(skb_parser); if (IS_ERR(skb_parser)) { - bpf_prog_put(skb_verdict); - return PTR_ERR(skb_parser); + ret = PTR_ERR(skb_parser); + goto out_put_skb_verdict; } } @@ -253,7 +249,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); - goto out; + goto out_put_skb_parser; } } @@ -265,15 +261,16 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_progs && READ_ONCE(psock->progs.skb_parser))) { + (skb_parser && READ_ONCE(psock->progs.skb_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; } } else { psock = sk_psock_init(sk, map->numa_node); - if (!psock) { - ret = -ENOMEM; + if (IS_ERR(psock)) { + ret = PTR_ERR(psock); goto out_progs; } } @@ -286,28 +283,32 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_progs && !psock->parser.enabled) { + if (skb_parser && skb_verdict && !psock->parser.enabled) { ret = sk_psock_init_strp(sk, psock); - if 
(ret) { - write_unlock_bh(&sk->sk_callback_lock); - goto out_drop; - } + if (ret) + goto out_unlock_drop; psock_set_prog(&psock->progs.skb_verdict, skb_verdict); psock_set_prog(&psock->progs.skb_parser, skb_parser); sk_psock_start_strp(sk, psock); + } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk,psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; +out_unlock_drop: + write_unlock_bh(&sk->sk_callback_lock); out_drop: sk_psock_put(sk, psock); out_progs: if (msg_parser) bpf_prog_put(msg_parser); -out: - if (skb_progs) { - bpf_prog_put(skb_verdict); +out_put_skb_parser: + if (skb_parser) bpf_prog_put(skb_parser); - } +out_put_skb_verdict: + if (skb_verdict) + bpf_prog_put(skb_verdict); return ret; } @@ -322,8 +323,8 @@ static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk) if (!psock) { psock = sk_psock_init(sk, map->numa_node); - if (!psock) - return -ENOMEM; + if (IS_ERR(psock)) + return PTR_ERR(psock); } ret = sock_map_init_proto(sk, psock); @@ -384,7 +385,7 @@ static void *sock_map_lookup(struct bpf_map *map, void *key) struct sock *sk; sk = __sock_map_lookup_elem(map, *(u32 *)key); - if (!sk || !sk_fullsock(sk)) + if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; @@ -402,7 +403,7 @@ static void *sock_map_lookup_sys(struct bpf_map *map, void *key) if (!sk) return ERR_PTR(-ENOENT); - sock_gen_cookie(sk); + __sock_gen_cookie(sk); return &sk->sk_cookie; } @@ -478,8 +479,6 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; - if (inet_csk_has_ulp(sk)) - return -EINVAL; link = sk_psock_init_link(); if (!link) @@ -563,10 +562,12 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) return false; } -static int sock_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static int sock_hash_update_common(struct bpf_map *map, void *key, + struct sock *sk, u64 flags); + +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, + u64 flags) { - u32 idx = *(u32 *)key; struct socket *sock; struct sock *sk; int ret; @@ -595,11 +596,38 @@ static int sock_map_update_elem(struct bpf_map *map, void *key, sock_map_sk_acquire(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; + else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) + ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else - ret = sock_map_update_common(map, idx, sk, flags); + ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: - fput(sock->file); + sockfd_put(sock); + return ret; +} + +static int sock_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + struct sock *sk = (struct sock *)value; + int ret; + + if (unlikely(!sk || !sk_fullsock(sk))) + return -EINVAL; + + if (!sock_map_sk_is_suitable(sk)) + return -EOPNOTSUPP; + + local_bh_disable(); + bh_lock_sock(sk); + if (!sock_map_sk_state_allowed(sk)) + ret = -EOPNOTSUPP; + else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) + ret = sock_map_update_common(map, *(u32 *)key, sk, flags); + else + ret = sock_hash_update_common(map, key, sk, flags); + bh_unlock_sock(sk); + local_bh_enable(); return ret; } @@ -681,8 +709,116 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +struct sock_map_seq_info { + struct bpf_map *map; + struct sock *sk; + u32 index; +}; + +struct bpf_iter__sockmap 
{ + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(void *, key); + __bpf_md_ptr(struct sock *, sk); +}; + +DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, + struct sock *sk) + +static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) +{ + if (unlikely(info->index >= info->map->max_entries)) + return NULL; + + info->sk = __sock_map_lookup_elem(info->map, info->index); + + /* can't return sk directly, since that might be NULL */ + return info; +} + +static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + struct sock_map_seq_info *info = seq->private; + + if (*pos == 0) + ++*pos; + + /* pairs with sock_map_seq_stop */ + rcu_read_lock(); + return sock_map_seq_lookup_elem(info); +} + +static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) + __must_hold(rcu) +{ + struct sock_map_seq_info *info = seq->private; + + ++*pos; + ++info->index; + + return sock_map_seq_lookup_elem(info); +} + +static int sock_map_seq_show(struct seq_file *seq, void *v) + __must_hold(rcu) +{ + struct sock_map_seq_info *info = seq->private; + struct bpf_iter__sockmap ctx = {}; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, !v); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (v) { + ctx.key = &info->index; + ctx.sk = info->sk; + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static void sock_map_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + if (!v) + (void)sock_map_seq_show(seq, NULL); + + /* pairs with sock_map_seq_start */ + rcu_read_unlock(); +} + +static const struct seq_operations sock_map_seq_ops = { + .start = sock_map_seq_start, + .next = sock_map_seq_next, + .stop = sock_map_seq_stop, + .show = sock_map_seq_show, +}; + +static int sock_map_init_seq_private(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct sock_map_seq_info *info = priv_data; + + info->map = aux->map; + return 0; +} + +static const struct bpf_iter_seq_info sock_map_iter_seq_info = { + .seq_ops = &sock_map_seq_ops, + .init_seq_private = sock_map_init_seq_private, + .seq_priv_size = sizeof(struct sock_map_seq_info), +}; + static int sock_map_btf_id; const struct bpf_map_ops sock_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_map_alloc, .map_free = sock_map_free, .map_get_next_key = sock_map_get_next_key, @@ -694,6 +830,7 @@ const struct bpf_map_ops sock_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_stab", .map_btf_id = &sock_map_btf_id, + .iter_seq_info = &sock_map_iter_seq_info, }; struct bpf_shtab_elem { @@ -829,8 +966,9 @@ static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, } } - new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); + new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); if (!new) { atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); @@ -855,8 +993,6 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; - if (inet_csk_has_ulp(sk)) - return -EINVAL; link = sk_psock_init_link(); if (!link) @@ -915,45 +1051,6 @@ out_free: return ret; } -static int sock_hash_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) -{ - struct socket *sock; - struct sock *sk; - int ret; - u64 ufd; - - if 
(map->value_size == sizeof(u64)) - ufd = *(u64 *)value; - else - ufd = *(u32 *)value; - if (ufd > S32_MAX) - return -EINVAL; - - sock = sockfd_lookup(ufd, &ret); - if (!sock) - return ret; - sk = sock->sk; - if (!sk) { - ret = -EINVAL; - goto out; - } - if (!sock_map_sk_is_suitable(sk)) { - ret = -EOPNOTSUPP; - goto out; - } - - sock_map_sk_acquire(sk); - if (!sock_map_sk_state_allowed(sk)) - ret = -EOPNOTSUPP; - else - ret = sock_hash_update_common(map, key, sk, flags); - sock_map_sk_release(sk); -out: - fput(sock->file); - return ret; -} - static int sock_hash_get_next_key(struct bpf_map *map, void *key, void *key_next) { @@ -971,7 +1068,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key, if (!elem) goto find_first_elem; - elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)), + elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); @@ -983,7 +1080,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key, find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; - elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); @@ -998,7 +1095,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { struct bpf_shtab *htab; int i, err; - u64 cost; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -1011,7 +1107,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); - htab = kzalloc(sizeof(*htab), GFP_USER); + htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); if (!htab) return ERR_PTR(-ENOMEM); @@ -1026,21 +1122,10 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) goto free_htab; } - cost = (u64) htab->buckets_num * sizeof(struct bpf_shtab_bucket) + - (u64) htab->elem_size * htab->map.max_entries; - if (cost >= U32_MAX - PAGE_SIZE) { - err = -EINVAL; - goto free_htab; - } - err = bpf_map_charge_init(&htab->map.memory, cost); - if (err) - goto free_htab; - htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { - bpf_map_charge_finish(&htab->map.memory); err = -ENOMEM; goto free_htab; } @@ -1119,7 +1204,7 @@ static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) if (!sk) return ERR_PTR(-ENOENT); - sock_gen_cookie(sk); + __sock_gen_cookie(sk); return &sk->sk_cookie; } @@ -1128,7 +1213,7 @@ static void *sock_hash_lookup(struct bpf_map *map, void *key) struct sock *sk; sk = __sock_hash_lookup_elem(map, key); - if (!sk || !sk_fullsock(sk)) + if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; @@ -1217,12 +1302,128 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .arg4_type = ARG_ANYTHING, }; +struct sock_hash_seq_info { + struct bpf_map *map; + struct bpf_shtab *htab; + u32 bucket_id; +}; + +static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, + struct bpf_shtab_elem *prev_elem) +{ + const struct bpf_shtab *htab = info->htab; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; + struct hlist_node *node; + + /* try to find next elem in the same bucket */ + if (prev_elem) { + node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); + 
elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); + if (elem) + return elem; + + /* no more elements, continue in the next bucket */ + info->bucket_id++; + } + + for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { + bucket = &htab->buckets[info->bucket_id]; + node = rcu_dereference(hlist_first_rcu(&bucket->head)); + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); + if (elem) + return elem; + } + + return NULL; +} + +static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + struct sock_hash_seq_info *info = seq->private; + + if (*pos == 0) + ++*pos; + + /* pairs with sock_hash_seq_stop */ + rcu_read_lock(); + return sock_hash_seq_find_next(info, NULL); +} + +static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) + __must_hold(rcu) +{ + struct sock_hash_seq_info *info = seq->private; + + ++*pos; + return sock_hash_seq_find_next(info, v); +} + +static int sock_hash_seq_show(struct seq_file *seq, void *v) + __must_hold(rcu) +{ + struct sock_hash_seq_info *info = seq->private; + struct bpf_iter__sockmap ctx = {}; + struct bpf_shtab_elem *elem = v; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, !elem); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + ctx.key = elem->key; + ctx.sk = elem->sk; + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static void sock_hash_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + if (!v) + (void)sock_hash_seq_show(seq, NULL); + + /* pairs with sock_hash_seq_start */ + rcu_read_unlock(); +} + +static const struct seq_operations sock_hash_seq_ops = { + .start = sock_hash_seq_start, + .next = sock_hash_seq_next, + .stop = sock_hash_seq_stop, + .show = sock_hash_seq_show, +}; + +static int sock_hash_init_seq_private(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct sock_hash_seq_info *info = priv_data; + + info->map = aux->map; + info->htab = container_of(aux->map, struct bpf_shtab, map); + return 0; +} + +static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { + .seq_ops = &sock_hash_seq_ops, + .init_seq_private = sock_hash_init_seq_private, + .seq_priv_size = sizeof(struct sock_hash_seq_info), +}; + static int sock_hash_map_btf_id; const struct bpf_map_ops sock_hash_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, .map_get_next_key = sock_hash_get_next_key, - .map_update_elem = sock_hash_update_elem, + .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_lookup_elem = sock_hash_lookup, .map_lookup_elem_sys_only = sock_hash_lookup_sys, @@ -1230,6 +1431,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_shtab", .map_btf_id = &sock_hash_map_btf_id, + .iter_seq_info = &sock_hash_iter_seq_info, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) @@ -1340,3 +1542,62 @@ void sock_map_close(struct sock *sk, long timeout) release_sock(sk); saved_close(sk, timeout); } + +static int sock_map_iter_attach_target(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && + map->map_type != BPF_MAP_TYPE_SOCKHASH) + goto put_map; + + if 
(prog->aux->max_rdonly_access > map->key_size) { + err = -EACCES; + goto put_map; + } + + aux->map = map; + return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); +} + +static struct bpf_iter_reg sock_map_iter_reg = { + .target = "sockmap", + .attach_target = sock_map_iter_attach_target, + .detach_target = sock_map_iter_detach_target, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__sockmap, key), + PTR_TO_RDONLY_BUF_OR_NULL }, + { offsetof(struct bpf_iter__sockmap, sk), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init bpf_sockmap_iter_init(void) +{ + sock_map_iter_reg.ctx_arg_info[1].btf_id = + btf_sock_ids[BTF_SOCK_TYPE_SOCK]; + return bpf_iter_reg_target(&sock_map_iter_reg); +} +late_initcall(bpf_sockmap_iter_init); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index bbdd3c7b6cb5..b065f0a103ed 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -293,7 +293,7 @@ select_by_hash: i = j = reciprocal_scale(hash, socks); while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { i++; - if (i >= reuse->num_socks) + if (i >= socks) i = 0; if (i == j) goto out; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 6ada114bbcca..4567de519603 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -22,7 +22,7 @@ #include <net/busy_poll.h> #include <net/pkt_sched.h> -static int two __maybe_unused = 2; +static int two = 2; static int three = 3; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; @@ -309,7 +309,6 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, #endif static struct ctl_table net_core_table[] = { -#ifdef CONFIG_NET { .procname = "wmem_max", .data = &sysctl_wmem_max, @@ -507,7 +506,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = set_default_qdisc }, #endif -#endif /* CONFIG_NET */ { .procname = "netdev_budget", .data = &netdev_budget, @@ -546,7 +544,7 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = &two, }, { .procname = "devconf_inherit_init_net", @@ -587,6 +585,19 @@ static struct ctl_table netns_core_table[] = { { } }; +static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) +{ + /* fallback tunnels for initns only */ + if (!strncmp(str, "initns", 6)) + sysctl_fb_tunnels_only_for_init_net = 1; + /* no fallback tunnels anywhere */ + else if (!strncmp(str, "none", 4)) + sysctl_fb_tunnels_only_for_init_net = 2; + + return 1; +} +__setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); + static __net_init int sysctl_core_net_init(struct net *net) { struct ctl_table *tbl; diff --git a/net/core/xdp.c b/net/core/xdp.c index 48aba933a5a8..05354976c1fc 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -158,7 +158,7 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) /* Returns 0 on success, negative on failure */ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index) + struct net_device *dev, u32 queue_index, unsigned int napi_id) { if (xdp_rxq->reg_state == REG_STATE_UNUSED) { WARN(1, "Driver promised not to register this"); @@ -179,6 +179,7 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, 
xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; + xdp_rxq->napi_id = napi_id; xdp_rxq->reg_state = REG_STATE_REGISTERED; return 0; @@ -335,11 +336,10 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling - * of xdp_frames/pages in those cases. This path is never used by the - * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of - * the switch-statement. + * of xdp_frames/pages in those cases. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, + struct xdp_buff *xdp) { struct xdp_mem_allocator *xa; struct page *page; @@ -361,6 +361,10 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; + case MEM_TYPE_XSK_BUFF_POOL: + /* NB! Only valid from an xdp_buff! */ + xsk_buff_free(xdp); + break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); @@ -370,19 +374,73 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false); + __xdp_return(xdpf->data, &xdpf->mem, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true); + __xdp_return(xdpf->data, &xdpf->mem, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); +/* XDP bulk APIs introduce a defer/flush mechanism to return + * pages belonging to the same xdp_mem_allocator object + * (identified via the mem.id field) in bulk to optimize + * I-cache and D-cache. + * The bulk queue size is set to 16 to be aligned to how + * XDP_REDIRECT bulking works. The bulk is flushed when + * it is full or when mem.id changes. + * xdp_frame_bulk is usually stored/allocated on the function + * call-stack to avoid locking penalties. 
+ */ +void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) +{ + struct xdp_mem_allocator *xa = bq->xa; + + if (unlikely(!xa || !bq->count)) + return; + + page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count); + /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ + bq->count = 0; +} +EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk); + +/* Must be called with rcu_read_lock held */ +void xdp_return_frame_bulk(struct xdp_frame *xdpf, + struct xdp_frame_bulk *bq) +{ + struct xdp_mem_info *mem = &xdpf->mem; + struct xdp_mem_allocator *xa; + + if (mem->type != MEM_TYPE_PAGE_POOL) { + __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + return; + } + + xa = bq->xa; + if (unlikely(!xa)) { + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + bq->count = 0; + bq->xa = xa; + } + + if (bq->count == XDP_BULK_QUEUE_SIZE) + xdp_flush_frame_bulk(bq); + + if (unlikely(mem->id != xa->mem.id)) { + xdp_flush_frame_bulk(bq); + bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + } + + bq->q[bq->count++] = xdpf->data; +} +EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); + void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); } /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ @@ -400,18 +458,6 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) } EXPORT_SYMBOL_GPL(__xdp_release_frame); -bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, - struct netdev_bpf *bpf) -{ - if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) { - NL_SET_ERR_MSG(bpf->extack, - "program loaded with different flags"); - return false; - } - return true; -} -EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok); - void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { @@ -467,3 +513,73 @@ void xdp_warn(const char *msg, const char *func, const int line) WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); }; EXPORT_SYMBOL_GPL(xdp_warn); + +int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) +{ + n_skb = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, + n_skb, skbs); + if (unlikely(!n_skb)) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); + +struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct sk_buff *skb, + struct net_device *dev) +{ + unsigned int headroom, frame_size; + void *hard_start; + + /* Part of headroom was reserved to xdpf */ + headroom = sizeof(*xdpf) + xdpf->headroom; + + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. 
+ */
+ frame_size = xdpf->frame_sz;
+
+ hard_start = xdpf->data - headroom;
+ skb = build_skb_around(skb, hard_start, frame_size);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, headroom);
+ __skb_put(skb, xdpf->len);
+ if (xdpf->metasize)
+ skb_metadata_set(skb, xdpf->metasize);
+
+ /* Essential SKB info: protocol and skb->dev */
+ skb->protocol = eth_type_trans(skb, dev);
+
+ /* Optional SKB info, currently missing:
+ * - HW checksum info (skb->ip_summed)
+ * - HW RX hash (skb_set_hash)
+ * - RX ring dev queue index (skb_record_rx_queue)
+ */
+
+ /* Until page_pool get SKB return path, release DMA here */
+ xdp_release_frame(xdpf);
+
+ /* Allow SKB to reuse area used by xdp_frame */
+ xdp_scrub_frame(xdpf);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame);
+
+struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
+ struct net_device *dev)
+{
+ struct sk_buff *skb;
+
+ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return NULL;
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+
+ return __xdp_build_skb_from_frame(xdpf, skb, dev);
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
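To make the new SO_PREFER_BUSY_POLL / SO_BUSY_POLL_BUDGET handling in net/core/sock.c above concrete, here is a minimal userspace sketch using plain setsockopt(). The numeric fallbacks are my assumption of the matching uapi values (the header change is outside this net/core-only diff), the budget of 64 is an arbitrary example, and both calls are subject to the CAP_NET_ADMIN checks shown above:

#include <sys/socket.h>

#ifndef SO_PREFER_BUSY_POLL
#define SO_PREFER_BUSY_POLL	69	/* assumed to match the uapi addition */
#endif
#ifndef SO_BUSY_POLL_BUDGET
#define SO_BUSY_POLL_BUDGET	70	/* assumed to match the uapi addition */
#endif

static int enable_preferred_busy_poll(int fd)
{
	int one = 1;
	int budget = 64;	/* must be within 0..U16_MAX per the range check above */

	/* Enabling preferred busy poll needs CAP_NET_ADMIN. */
	if (setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &one, sizeof(one)))
		return -1;

	/* Raising the budget above its current value also needs CAP_NET_ADMIN. */
	return setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &budget,
			  sizeof(budget));
}

The napi_id argument added to xdp_rxq_info_reg() in net/core/xdp.c above appears to belong to the same busy-polling rework, letting drivers associate an RX queue with the NAPI instance that these socket options steer.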
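The sockmap/sockhash bpf_iter target registered above (sock_map_iter_reg, with ctx members key and sk) is consumed by an iterator program roughly along the lines of the following sketch. This is BPF C against vmlinux.h, not the selftest that accompanies the series; it assumes a 4-byte key (always true for a sockmap, while a sockhash may use a larger one, bounded by the max_rdonly_access check above), the output format is arbitrary, and skc_family is read through __sk_common because the kernel-internal sk_family macro is not visible here:

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

/* Print one line per map entry. Both ctx->key and ctx->sk can be NULL
 * (PTR_TO_RDONLY_BUF_OR_NULL / PTR_TO_BTF_ID_OR_NULL above), notably on
 * the final post-last-element invocation, so bail out in that case.
 */
SEC("iter/sockmap")
int dump_sockmap(struct bpf_iter__sockmap *ctx)
{
	static const char fmt[] = "key %u family %d\n";
	struct seq_file *seq = ctx->meta->seq;
	struct sock *sk = ctx->sk;
	__u32 *key = ctx->key;
	__u64 args[2];

	if (!key || !sk)
		return 0;

	args[0] = *key;
	args[1] = sk->__sk_common.skc_family;
	bpf_seq_printf(seq, fmt, sizeof(fmt), args, sizeof(args));
	return 0;
}

Attaching hands the map fd over in union bpf_iter_link_info, which sock_map_iter_attach_target() above validates; with libbpf that would typically go through bpf_program__attach_iter() with bpf_iter_attach_opts.link_info pointing at the target map's fd.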
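Finally, the xdp_return_frame_bulk()/xdp_flush_frame_bulk() pair added to net/core/xdp.c above is intended to be driven from a driver's XDP TX completion loop with the xdp_frame_bulk kept on the call stack and the whole loop under rcu_read_lock(), as the comments above require. A minimal sketch; the ring type and the helper that pops completed frames are hypothetical, and the zero-initialisation stands in for whatever init helper the (not included) header side provides, which is all the lookup path above needs:

#include <linux/rcupdate.h>
#include <net/xdp.h>

/* Hypothetical driver-side bits, for illustration only. */
struct mydrv_tx_ring;
struct xdp_frame *mydrv_next_completed_frame(struct mydrv_tx_ring *ring);

static void mydrv_clean_xdp_tx(struct mydrv_tx_ring *ring, int budget)
{
	struct xdp_frame_bulk bq = {};	/* bq.xa == NULL: first frame triggers the rhashtable lookup */
	struct xdp_frame *xdpf;

	rcu_read_lock();	/* xdp_return_frame_bulk() must run under RCU, see above */
	while (budget-- && (xdpf = mydrv_next_completed_frame(ring)))
		xdp_return_frame_bulk(xdpf, &bq);
	xdp_flush_frame_bulk(&bq);	/* hand back anything still queued in bq */
	rcu_read_unlock();
}

Frames whose memory type is not MEM_TYPE_PAGE_POOL simply fall through to the existing __xdp_return() path inside xdp_return_frame_bulk(), so the same completion loop works regardless of how the frames were allocated.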