Diffstat (limited to 'net/core')
43 files changed, 4322 insertions, 1269 deletions
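Many of the dev.c hunks below retire dev_base_lock: device-list traversal becomes pure RCU, and reads of dev->name are guarded by a new seqlock (netdev_rename_lock) so netdev_get_name() no longer needs devnet_rename_sem. As a minimal sketch of that reader pattern, assuming only the generic seqlock_t API — the lock and function names here are illustrative, mirroring the netdev_copy_name() hunk rather than reproducing it:

	#include <linux/seqlock.h>
	#include <linux/string.h>

	/* Illustrative lock; stands in for the netdev_rename_lock added in dev.c. */
	static DEFINE_SEQLOCK(demo_rename_lock);

	/* Reader: retry the copy until the sequence count is stable, i.e. no
	 * writer ran write_seqlock()/write_sequnlock() while we were copying.
	 */
	static void demo_copy_name(char *dst, const char *src, size_t len)
	{
		unsigned int seq;

		do {
			seq = read_seqbegin(&demo_rename_lock);
			strscpy(dst, src, len);
		} while (read_seqretry(&demo_rename_lock, seq));
	}

A writer (a rename, in this series) wraps its update of the shared name in write_seqlock()/write_sequnlock(), which is what the dev_change_name() and __dev_change_net_namespace() hunks do around dev_get_valid_name() and the strscpy() into dev->name.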
diff --git a/net/core/Makefile b/net/core/Makefile index 0cb734cbc24b..62be9aef2528 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -18,13 +18,15 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o -obj-$(CONFIG_PAGE_POOL) += page_pool.o +obj-y += hotdata.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o @@ -40,4 +42,4 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o -obj-$(CONFIG_NET_TEST) += gso_test.o +obj-$(CONFIG_NET_TEST) += net_test.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index cca7594be92e..bc01b3aa6b0f 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -275,9 +275,10 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { - int optmem_max = READ_ONCE(sysctl_optmem_max); struct sock *sk = (struct sock *)owner; + int optmem_max; + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); /* same check as in sock_kmalloc() */ if (size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { @@ -495,27 +496,22 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) if (!bpf_capable()) return ERR_PTR(-EPERM); - nla_for_each_nested(nla, nla_stgs, rem) { - if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) { - if (nla_len(nla) != sizeof(u32)) - return ERR_PTR(-EINVAL); - nr_maps++; - } + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + if (nla_len(nla) != sizeof(u32)) + return ERR_PTR(-EINVAL); + nr_maps++; } diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); if (!diag) return ERR_PTR(-ENOMEM); - nla_for_each_nested(nla, nla_stgs, rem) { - struct bpf_map *map; - int map_fd; - - if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) - continue; + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + int map_fd = nla_get_u32(nla); + struct bpf_map *map = bpf_map_get(map_fd); - map_fd = nla_get_u32(nla); - map = bpf_map_get(map_fd); if (IS_ERR(map)) { err = PTR_ERR(map); goto err_free; diff --git a/net/core/datagram.c b/net/core/datagram.c index 103d46fa0eeb..e614cfd8e14a 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -324,25 +324,6 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(skb_free_datagram); -void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) -{ - bool slow; - - if (!skb_unref(skb)) { - sk_peek_offset_bwd(sk, len); - return; - } - - slow = lock_sock_fast(sk); - sk_peek_offset_bwd(sk, len); - skb_orphan(skb); - unlock_sock_fast(sk, slow); - - /* skb is now orphaned, can be freed outside of locked section */ - __kfree_skb(skb); -} -EXPORT_SYMBOL(__skb_free_datagram_locked); - int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void 
(*destructor)(struct sock *sk, @@ -751,7 +732,7 @@ size_t memcpy_to_iter_csum(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { __wsum *csum = priv2; - __wsum next = csum_partial_copy_nocheck(from, iter_to, len); + __wsum next = csum_partial_copy_nocheck(from + progress, iter_to, len); *csum = csum_block_add(*csum, next, progress); return 0; diff --git a/net/core/dev.c b/net/core/dev.c index 0d548431f3fa..e1bb6d7856d9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -77,7 +77,9 @@ #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/isolation.h> #include <linux/sched/mm.h> +#include <linux/smpboot.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/string.h> @@ -153,41 +155,20 @@ #include <linux/prandom.h> #include <linux/once_lite.h> #include <net/netdev_rx_queue.h> +#include <net/page_pool/types.h> +#include <net/page_pool/helpers.h> +#include <net/rps.h> #include "dev.h" #include "net-sysfs.h" static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; -struct list_head ptype_all __read_mostly; /* Taps */ static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); -static struct napi_struct *napi_by_id(unsigned int napi_id); - -/* - * The @dev_base_head list is protected by @dev_base_lock and the rtnl - * semaphore. - * - * Pure readers hold dev_base_lock for reading, or rcu_read_lock() - * - * Writers must hold the rtnl semaphore while they loop through the - * dev_base_head list, and hold dev_base_lock for writing when they do the - * actual updates. This allows pure readers to access the list even - * while a writer is preparing to update it. - * - * To put it another way, dev_base_lock is held for writing only to - * protect against pure readers; the rtnl semaphore provides the - * protection against other writers. - * - * See, for example usages, register_netdevice() and - * unregister_netdevice(), which must be called with the rtnl - * semaphore held. 
- */ -DEFINE_RWLOCK(dev_base_lock); -EXPORT_SYMBOL(dev_base_lock); static DEFINE_MUTEX(ifalias_mutex); @@ -201,8 +182,9 @@ static DECLARE_RWSEM(devnet_rename_sem); static inline void dev_base_seq_inc(struct net *net) { - while (++net->dev_base_seq == 0) - ; + unsigned int val = net->dev_base_seq + 1; + + WRITE_ONCE(net->dev_base_seq, val ?: 1); } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) @@ -217,35 +199,60 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock_irqsave(struct softnet_data *sd, - unsigned long *flags) +#ifndef CONFIG_PREEMPT_RT + +static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); + +static int __init setup_backlog_napi_threads(char *arg) +{ + static_branch_enable(&use_backlog_threads_key); + return 0; +} +early_param("thread_backlog_napi", setup_backlog_napi_threads); + +static bool use_backlog_threads(void) +{ + return static_branch_unlikely(&use_backlog_threads_key); +} + +#else + +static bool use_backlog_threads(void) +{ + return true; +} + +#endif + +static inline void backlog_lock_irq_save(struct softnet_data *sd, + unsigned long *flags) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_save(*flags); } -static inline void rps_lock_irq_disable(struct softnet_data *sd) +static inline void backlog_lock_irq_disable(struct softnet_data *sd) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irq(&sd->input_pkt_queue.lock); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_disable(); } -static inline void rps_unlock_irq_restore(struct softnet_data *sd, - unsigned long *flags) +static inline void backlog_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_restore(*flags); } -static inline void rps_unlock_irq_enable(struct softnet_data *sd) +static inline void backlog_unlock_irq_enable(struct softnet_data *sd) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irq(&sd->input_pkt_queue.lock); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_enable(); @@ -337,18 +344,27 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name) return -ENOMEM; netdev_name_node_add(net, name_node); /* The node that holds dev->name acts as a head of per-device list. 
*/ - list_add_tail(&name_node->list, &dev->name_node->list); + list_add_tail_rcu(&name_node->list, &dev->name_node->list); return 0; } -static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +static void netdev_name_node_alt_free(struct rcu_head *head) { - list_del(&name_node->list); + struct netdev_name_node *name_node = + container_of(head, struct netdev_name_node, rcu); + kfree(name_node->name); netdev_name_node_free(name_node); } +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + netdev_name_node_del(name_node); + list_del(&name_node->list); + call_rcu(&name_node->rcu, netdev_name_node_alt_free); +} + int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; @@ -363,10 +379,7 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) if (name_node == dev->name_node || name_node->dev != dev) return -EINVAL; - netdev_name_node_del(name_node); - synchronize_rcu(); __netdev_name_node_alt_destroy(name_node); - return 0; } @@ -374,8 +387,10 @@ static void netdev_name_node_alt_flush(struct net_device *dev) { struct netdev_name_node *name_node, *tmp; - list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) - __netdev_name_node_alt_destroy(name_node); + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) { + list_del(&name_node->list); + netdev_name_node_alt_free(&name_node->rcu); + } } /* Device list insertion */ @@ -386,12 +401,10 @@ static void list_netdevice(struct net_device *dev) ASSERT_RTNL(); - write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock(&dev_base_lock); netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); @@ -405,7 +418,7 @@ static void list_netdevice(struct net_device *dev) /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ -static void unlist_netdevice(struct net_device *dev, bool lock) +static void unlist_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); @@ -418,13 +431,9 @@ static void unlist_netdevice(struct net_device *dev, bool lock) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ - if (lock) - write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - if (lock) - write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -443,6 +452,12 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); +/* Page_pool has a lockless array/stack to alloc/recycle pages. + * PP consumers must pay attention to run APIs in the appropriate context + * (e.g. NAPI context). + */ +static DEFINE_PER_CPU(struct page_pool *, system_page_pool); + #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class @@ -552,7 +567,7 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) - return pt->dev ? &pt->dev->ptype_all : &ptype_all; + return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; else return pt->dev ? 
&pt->dev->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; @@ -654,7 +669,7 @@ int dev_get_iflink(const struct net_device *dev) if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev); - return dev->ifindex; + return READ_ONCE(dev->ifindex); } EXPORT_SYMBOL(dev_get_iflink); @@ -739,9 +754,9 @@ EXPORT_SYMBOL_GPL(dev_fill_forward_path); * @net: the applicable net namespace * @name: name to find * - * Find an interface by name. Must be called under RTNL semaphore - * or @dev_base_lock. If the name is found a pointer to the device - * is returned. If the name is not found then %NULL is returned. The + * Find an interface by name. Must be called under RTNL semaphore. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ @@ -822,8 +837,7 @@ EXPORT_SYMBOL(netdev_get_by_name); * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful - * about locking. The caller must hold either the RTNL semaphore - * or @dev_base_lock. + * about locking. The caller must hold the RTNL semaphore. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) @@ -925,6 +939,18 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) } EXPORT_SYMBOL(dev_get_by_napi_id); +static DEFINE_SEQLOCK(netdev_rename_lock); + +void netdev_copy_name(struct net_device *dev, char *name) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&netdev_rename_lock); + strscpy(name, dev->name, IFNAMSIZ); + } while (read_seqretry(&netdev_rename_lock, seq)); +} + /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace @@ -936,7 +962,6 @@ int netdev_get_name(struct net *net, char *name, int ifindex) struct net_device *dev; int ret; - down_read(&devnet_rename_sem); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); @@ -945,12 +970,11 @@ int netdev_get_name(struct net *net, char *name, int ifindex) goto out; } - strcpy(name, dev->name); + netdev_copy_name(dev, name); ret = 0; out: rcu_read_unlock(); - up_read(&devnet_rename_sem); return ret; } @@ -1119,7 +1143,9 @@ static int __dev_alloc_name(struct net *net, const char *name, char *res) if (i == max_netdevices) return -ENFILE; - snprintf(res, IFNAMSIZ, name, i); + /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ + strscpy(buf, name, IFNAMSIZ); + snprintf(res, IFNAMSIZ, buf, i); return i; } @@ -1200,7 +1226,10 @@ int dev_change_name(struct net_device *dev, const char *newname) memcpy(oldname, dev->name, IFNAMSIZ); + write_seqlock(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); + write_sequnlock(&netdev_rename_lock); + if (err < 0) { up_write(&devnet_rename_sem); return err; @@ -1211,13 +1240,13 @@ int dev_change_name(struct net_device *dev, const char *newname) dev->flags & IFF_UP ? 
" (while UP)" : ""); old_assign_type = dev->name_assign_type; - dev->name_assign_type = NET_NAME_RENAMED; + WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); - dev->name_assign_type = old_assign_type; + WRITE_ONCE(dev->name_assign_type, old_assign_type); up_write(&devnet_rename_sem); return ret; } @@ -1226,15 +1255,11 @@ rollback: netdev_adjacent_rename_links(dev, oldname); - write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); - write_unlock(&dev_base_lock); - synchronize_rcu(); + synchronize_net(); - write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); - write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); @@ -1244,9 +1269,11 @@ rollback: if (err >= 0) { err = ret; down_write(&devnet_rename_sem); + write_seqlock(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); + write_sequnlock(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); - dev->name_assign_type = old_assign_type; + WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; goto rollback; } else { @@ -2072,6 +2099,11 @@ void net_dec_egress_queue(void) EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif +#ifdef CONFIG_NET_CLS_ACT +DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); +EXPORT_SYMBOL(tcf_bypass_check_needed_key); +#endif + DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL @@ -2241,7 +2273,8 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) */ bool dev_nit_active(struct net_device *dev) { - return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); + return !list_empty(&net_hotdata.ptype_all) || + !list_empty(&dev->ptype_all); } EXPORT_SYMBOL_GPL(dev_nit_active); @@ -2252,15 +2285,14 @@ EXPORT_SYMBOL_GPL(dev_nit_active); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { - struct packet_type *ptype; + struct list_head *ptype_list = &net_hotdata.ptype_all; + struct packet_type *ptype, *pt_prev = NULL; struct sk_buff *skb2 = NULL; - struct packet_type *pt_prev = NULL; - struct list_head *ptype_list = &ptype_all; rcu_read_lock(); again: list_for_each_entry_rcu(ptype, ptype_list, list) { - if (ptype->ignore_outgoing) + if (READ_ONCE(ptype->ignore_outgoing)) continue; /* Never send packets back to the socket @@ -2301,7 +2333,7 @@ again: pt_prev = ptype; } - if (ptype_list == &ptype_all) { + if (ptype_list == &net_hotdata.ptype_all) { ptype_list = &dev->ptype_all; goto again; } @@ -3470,6 +3502,9 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; + if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size))) + return features & ~NETIF_F_GSO_MASK; + if (!skb_shinfo(skb)->gso_type) { skb_warn_bad_offload(skb); return features & ~NETIF_F_GSO_MASK; @@ -3752,6 +3787,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP); + if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && qdisc_run_begin(q)) { @@ -3781,10 +3818,14 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, no_lock_out: if (unlikely(to_free)) kfree_skb_list_reason(to_free, - SKB_DROP_REASON_QDISC_DROP); + tcf_get_drop_reason(to_free)); return rc; } + if 
(unlikely(READ_ONCE(q->owner) == smp_processor_id())) { + kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); + return NET_XMIT_DROP; + } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -3824,7 +3865,9 @@ no_lock_out: qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { + WRITE_ONCE(q->owner, smp_processor_id()); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); @@ -3836,7 +3879,8 @@ no_lock_out: } spin_unlock(root_lock); if (unlikely(to_free)) - kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); + kfree_skb_list_reason(to_free, + tcf_get_drop_reason(to_free)); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; @@ -3920,16 +3964,21 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, if (!miniq) return ret; + if (static_branch_unlikely(&tcf_bypass_check_needed_key)) { + if (tcf_block_bypass_sw(miniq->block)) + return ret; + } + tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; - res.drop_reason = *drop_reason; + tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); /* Only tcf related quirks below. */ switch (ret) { case TC_ACT_SHOT: - *drop_reason = res.drop_reason; + *drop_reason = tcf_get_drop_reason(skb); mini_qdisc_qstats_cpu_drop(miniq); break; case TC_ACT_OK: @@ -4413,20 +4462,11 @@ EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ +static DEFINE_PER_CPU(struct task_struct *, backlog_napi); -int netdev_max_backlog __read_mostly = 1000; -EXPORT_SYMBOL(netdev_max_backlog); - -int netdev_tstamp_prequeue __read_mostly = 1; -unsigned int sysctl_skb_defer_max __read_mostly = 64; -int netdev_budget __read_mostly = 300; -/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ -unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ -int dev_rx_weight __read_mostly = 64; -int dev_tx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -4445,18 +4485,16 @@ static inline void ____napi_schedule(struct softnet_data *sd, */ thread = READ_ONCE(napi->thread); if (thread) { - /* Avoid doing set_bit() if the thread is in - * INTERRUPTIBLE state, cause napi_thread_wait() - * makes sure to proceed with napi polling - * if the thread is explicitly woken from here. - */ - if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) - set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); + if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) + goto use_local_napi; + + set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } } +use_local_napi: list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() @@ -4468,12 +4506,6 @@ static inline void ____napi_schedule(struct softnet_data *sd, #ifdef CONFIG_RPS -/* One global table that all flow-based protocols share. 
*/ -struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; -EXPORT_SYMBOL(rps_sock_flow_table); -u32 rps_cpu_mask __read_mostly; -EXPORT_SYMBOL(rps_cpu_mask); - struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; @@ -4488,7 +4520,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; - u32 flow_id; + u32 flow_id, head; u16 rxq_index; int rc; @@ -4511,16 +4543,16 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, goto out; old_rflow = rflow; rflow = &flow_table->flows[flow_id]; - rflow->filter = rc; - if (old_rflow->filter == rflow->filter) - old_rflow->filter = RPS_NO_FILTER; + WRITE_ONCE(rflow->filter, rc); + if (old_rflow->filter == rc) + WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: #endif - rflow->last_qtail = - per_cpu(softnet_data, next_cpu).input_queue_head; + head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); + rps_input_queue_tail_save(&rflow->last_qtail, head); } - rflow->cpu = next_cpu; + WRITE_ONCE(rflow->cpu, next_cpu); return rflow; } @@ -4565,7 +4597,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (!hash) goto done; - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (flow_table && sock_flow_table) { struct rps_dev_flow *rflow; u32 next_cpu; @@ -4575,10 +4607,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). */ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); - if ((ident ^ hash) & ~rps_cpu_mask) + if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps; - next_cpu = ident & rps_cpu_mask; + next_cpu = ident & net_hotdata.rps_cpu_mask; /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table @@ -4599,7 +4631,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || - ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); @@ -4653,9 +4685,9 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu < nr_cpu_ids && - ((int)(per_cpu(softnet_data, cpu).input_queue_head - - rflow->last_qtail) < + if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && + ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - + READ_ONCE(rflow->last_qtail)) < (int)(10 * flow_table->mask))) expire = false; } @@ -4702,6 +4734,11 @@ static void napi_schedule_rps(struct softnet_data *sd) #ifdef CONFIG_RPS if (sd != mysd) { + if (use_backlog_threads()) { + __napi_schedule_irqoff(&sd->backlog); + return; + } + sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; @@ -4716,6 +4753,23 @@ static void napi_schedule_rps(struct softnet_data *sd) __napi_schedule_irqoff(&mysd->backlog); } +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +{ + unsigned long flags; + + if (use_backlog_threads()) { + backlog_lock_irq_save(sd, &flags); + + if 
(!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + __napi_schedule_irqoff(&sd->backlog); + + backlog_unlock_irq_restore(sd, &flags); + + } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { + smp_call_function_single_async(cpu, &sd->defer_csd); + } +} + #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif @@ -4727,7 +4781,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) struct softnet_data *sd; unsigned int old_flow, new_flow; - if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) + if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); @@ -4767,36 +4821,45 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, struct softnet_data *sd; unsigned long flags; unsigned int qlen; + int max_backlog; + u32 tail; - reason = SKB_DROP_REASON_NOT_SPECIFIED; + reason = SKB_DROP_REASON_DEV_READY; + if (!netif_running(skb->dev)) + goto bad_dev; + + reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); - rps_lock_irqsave(sd, &flags); - if (!netif_running(skb->dev)) - goto drop; + qlen = skb_queue_len_lockless(&sd->input_pkt_queue); + max_backlog = READ_ONCE(net_hotdata.max_backlog); + if (unlikely(qlen > max_backlog)) + goto cpu_backlog_drop; + backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { - if (qlen) { -enqueue: - __skb_queue_tail(&sd->input_pkt_queue, skb); - input_queue_tail_incr_save(sd, qtail); - rps_unlock_irq_restore(sd, &flags); - return NET_RX_SUCCESS; + if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { + if (!qlen) { + /* Schedule NAPI for backlog device. We can use + * non atomic operation as we own the queue lock. + */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, + &sd->backlog.state)) + napi_schedule_rps(sd); } + __skb_queue_tail(&sd->input_pkt_queue, skb); + tail = rps_input_queue_tail_incr(sd); + backlog_unlock_irq_restore(sd, &flags); - /* Schedule NAPI for backlog device - * We can use non atomic operation since we own the queue lock - */ - if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) - napi_schedule_rps(sd); - goto enqueue; + /* save the tail outside of the critical section */ + rps_input_queue_tail_save(qtail, tail); + return NET_RX_SUCCESS; } - reason = SKB_DROP_REASON_CPU_BACKLOG; -drop: - sd->dropped++; - rps_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, &flags); +cpu_backlog_drop: + atomic_inc(&sd->dropped); +bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; @@ -4851,6 +4914,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); + if (skb_is_nonlinear(skb)) { + skb_shinfo(skb)->xdp_frags_size = skb->data_len; + xdp_buff_set_frags_flag(xdp); + } else { + xdp_buff_clear_frags_flag(xdp); + } orig_data_end = xdp->data_end; orig_data = xdp->data; @@ -4880,6 +4949,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, skb->len += off; /* positive on grow, negative on shrink */ } + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. 
+ */ + if (xdp_buff_has_frags(xdp)) + skb->data_len = skb_shinfo(skb)->xdp_frags_size; + else + skb->data_len = 0; + /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || @@ -4913,11 +4990,35 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, return act; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, +static int +netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +{ + struct sk_buff *skb = *pskb; + int err, hroom, troom; + + if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) + return 0; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + troom = skb->tail + skb->data_len - skb->end; + err = pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC); + if (err) + return err; + + return skb_linearize(skb); +} + +static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - u32 act = XDP_DROP; + struct sk_buff *skb = *pskb; + u32 mac_len, act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. @@ -4925,41 +5026,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_is_redirected(skb)) return XDP_PASS; - /* XDP packets must be linear and must have sufficient headroom - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also - * native XDP provides, thus we need to do it here as well. + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM + * bytes. This is the guarantee that also native XDP provides, + * thus we need to do it here as well. */ + mac_len = skb->data - skb_mac_header(skb); + __skb_push(skb, mac_len); + if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { - int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); - int troom = skb->tail + skb->data_len - skb->end; - - /* In case we have to go down the path and also linearize, - * then lets do the pskb_expand_head() work just once here. - */ - if (pskb_expand_head(skb, - hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, - troom > 0 ? 
troom + 128 : 0, GFP_ATOMIC)) - goto do_drop; - if (skb_linearize(skb)) + if (netif_skb_check_for_xdp(pskb, xdp_prog)) goto do_drop; } - act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); + __skb_pull(*pskb, mac_len); + + act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); switch (act) { case XDP_REDIRECT: case XDP_TX: case XDP_PASS: break; default: - bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); + bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: - trace_xdp_exception(skb->dev, xdp_prog, act); + trace_xdp_exception((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_DROP: do_drop: - kfree_skb(skb); + kfree_skb(*pskb); break; } @@ -4997,24 +5093,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) { if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; - act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); + act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: - err = xdp_do_generic_redirect(skb->dev, skb, + err = xdp_do_generic_redirect((*pskb)->dev, *pskb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: - generic_xdp_tx(skb, xdp_prog); + generic_xdp_tx(*pskb, xdp_prog); break; } return XDP_DROP; @@ -5022,7 +5118,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) } return XDP_PASS; out_redir: - kfree_skb_reason(skb, SKB_DROP_REASON_XDP); + kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); @@ -5031,7 +5127,7 @@ static int netif_rx_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_rx(skb); @@ -5323,7 +5419,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, int ret = NET_RX_DROP; __be16 type; - net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_receive_skb(skb); @@ -5345,7 +5441,8 @@ another_round: int ret2; migrate_disable(); - ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + &skb); migrate_enable(); if (ret2 != XDP_PASS) { @@ -5366,7 +5463,7 @@ another_round: if (pfmemalloc) goto skip_taps; - list_for_each_entry_rcu(ptype, &ptype_all, list) { + list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -5706,7 +5803,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; @@ -5736,7 +5833,8 @@ void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), + skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); @@ -5826,21 +5924,21 @@ static void flush_backlog(struct work_struct *work) local_bh_disable(); sd = this_cpu_ptr(&softnet_data); 
- rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); dev_kfree_skb_irq(skb); - input_queue_head_incr(sd); + rps_input_queue_head_incr(sd); } } - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); - input_queue_head_incr(sd); + rps_input_queue_head_incr(sd); } } local_bh_enable(); @@ -5852,14 +5950,14 @@ static bool flush_required(int cpu) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); return do_flush; #endif @@ -5925,7 +6023,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; - if (remsd) { + if (!use_backlog_threads() && remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); @@ -5940,7 +6038,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS - return sd->rps_ipi_list != NULL; + return !use_backlog_threads() && sd->rps_ipi_list; #else return false; #endif @@ -5960,7 +6058,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = READ_ONCE(dev_rx_weight); + napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); while (again) { struct sk_buff *skb; @@ -5968,13 +6066,14 @@ static int process_backlog(struct napi_struct *napi, int quota) rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); - input_queue_head_incr(sd); - if (++work >= quota) + if (++work >= quota) { + rps_input_queue_head_add(sd, work); return work; + } } - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). @@ -5984,15 +6083,17 @@ static int process_backlog(struct napi_struct *napi, int quota) * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. 
*/ - napi->state = 0; + napi->state &= NAPIF_STATE_THREADED; again = false; } else { skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); } - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); } + if (work) + rps_input_queue_head_add(sd, work); return work; } @@ -6137,7 +6238,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ -static struct napi_struct *napi_by_id(unsigned int napi_id) +struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; @@ -6149,6 +6250,27 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) return NULL; } +static void skb_defer_free_flush(struct softnet_data *sd) +{ + struct sk_buff *skb, *next; + + /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ + if (!READ_ONCE(sd->defer_list)) + return; + + spin_lock(&sd->defer_lock); + skb = sd->defer_list; + sd->defer_list = NULL; + sd->defer_count = 0; + spin_unlock(&sd->defer_lock); + + while (skb != NULL) { + next = skb->next; + napi_consume_skb(skb, 1); + skb = next; + } +} + #if defined(CONFIG_NET_RX_BUSY_POLL) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) @@ -6170,8 +6292,13 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) clear_bit(NAPI_STATE_SCHED, &napi->state); } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, - u16 budget) +enum { + NAPI_F_PREFER_BUSY_POLL = 1, + NAPI_F_END_ON_RESCHED = 2, +}; + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, + unsigned flags, u16 budget) { bool skip_schedule = false; unsigned long timeout; @@ -6191,7 +6318,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool local_bh_disable(); - if (prefer_busy_poll) { + if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); timeout = READ_ONCE(napi->dev->gro_flush_timeout); if (napi->defer_hard_irqs_count && timeout) { @@ -6215,23 +6342,23 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool local_bh_enable(); } -void napi_busy_loop(unsigned int napi_id, - bool (*loop_end)(void *, unsigned long), - void *loop_end_arg, bool prefer_busy_poll, u16 budget) +static void __napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, unsigned flags, u16 budget) { unsigned long start_time = loop_end ? 
busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; + WARN_ON_ONCE(!rcu_read_lock_held()); + restart: napi_poll = NULL; - rcu_read_lock(); - napi = napi_by_id(napi_id); if (!napi) - goto out; + return; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); @@ -6247,14 +6374,14 @@ restart: */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) { - if (prefer_busy_poll) + if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) { - if (prefer_busy_poll) + if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } @@ -6268,18 +6395,22 @@ count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); + skb_defer_free_flush(this_cpu_ptr(&softnet_data)); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { + if (flags & NAPI_F_END_ON_RESCHED) + break; if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); + busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); rcu_read_unlock(); cond_resched(); + rcu_read_lock(); if (loop_end(loop_end_arg, start_time)) return; goto restart; @@ -6287,10 +6418,31 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); + busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); -out: +} + +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = NAPI_F_END_ON_RESCHED; + + if (prefer_busy_poll) + flags |= NAPI_F_PREFER_BUSY_POLL; + + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); +} + +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0; + + rcu_read_lock(); + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); @@ -6378,7 +6530,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } } - dev->threaded = threaded; + WRITE_ONCE(dev->threaded, threaded); /* Make sure kthread is created before THREADED bit * is set. @@ -6398,6 +6550,43 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } EXPORT_SYMBOL(dev_set_threaded); +/** + * netif_queue_set_napi - Associate queue with the napi + * @dev: device to which NAPI and queue belong + * @queue_index: Index of queue + * @type: queue type as RX or TX + * @napi: NAPI context, pass NULL to clear previously set NAPI + * + * Set queue with its corresponding napi context. This should be done after + * registering the NAPI handler for the queue-vector and the queues have been + * mapped to the corresponding interrupt vector. 
+ */ +void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, + enum netdev_queue_type type, struct napi_struct *napi) +{ + struct netdev_rx_queue *rxq; + struct netdev_queue *txq; + + if (WARN_ON_ONCE(napi && !napi->dev)) + return; + if (dev->reg_state >= NETREG_REGISTERED) + ASSERT_RTNL(); + + switch (type) { + case NETDEV_QUEUE_TYPE_RX: + rxq = __netif_get_rx_queue(dev, queue_index); + rxq->napi = napi; + return; + case NETDEV_QUEUE_TYPE_TX: + txq = netdev_get_tx_queue(dev, queue_index); + txq->napi = napi; + return; + default: + return; + } +} +EXPORT_SYMBOL(netif_queue_set_napi); + void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { @@ -6432,7 +6621,8 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, * threaded mode will not be enabled in napi_enable(). */ if (dev->threaded && napi_kthread_create(napi)) - dev->threaded = 0; + dev->threaded = false; + netif_napi_set_irq(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight); @@ -6609,8 +6799,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) static int napi_thread_wait(struct napi_struct *napi) { - bool woken = false; - set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { @@ -6619,15 +6807,13 @@ static int napi_thread_wait(struct napi_struct *napi) * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable(). */ - if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { + if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); - /* woken being true indicates this thread owns this napi. 
*/ - woken = true; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); @@ -6635,61 +6821,48 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static void skb_defer_free_flush(struct softnet_data *sd) +static void napi_threaded_poll_loop(struct napi_struct *napi) { - struct sk_buff *skb, *next; + struct softnet_data *sd; + unsigned long last_qs = jiffies; - /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ - if (!READ_ONCE(sd->defer_list)) - return; + for (;;) { + bool repoll = false; + void *have; - spin_lock(&sd->defer_lock); - skb = sd->defer_list; - sd->defer_list = NULL; - sd->defer_count = 0; - spin_unlock(&sd->defer_lock); + local_bh_disable(); + sd = this_cpu_ptr(&softnet_data); + sd->in_napi_threaded_poll = true; - while (skb != NULL) { - next = skb->next; - napi_consume_skb(skb, 1); - skb = next; + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); + + sd->in_napi_threaded_poll = false; + barrier(); + + if (sd_has_rps_ipi_waiting(sd)) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } + skb_defer_free_flush(sd); + local_bh_enable(); + + if (!repoll) + break; + + rcu_softirq_qs_periodic(last_qs); + cond_resched(); } } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; - struct softnet_data *sd; - void *have; - - while (!napi_thread_wait(napi)) { - for (;;) { - bool repoll = false; - local_bh_disable(); - sd = this_cpu_ptr(&softnet_data); - sd->in_napi_threaded_poll = true; + while (!napi_thread_wait(napi)) + napi_threaded_poll_loop(napi); - have = netpoll_poll_lock(napi); - __napi_poll(napi, &repoll); - netpoll_poll_unlock(have); - - sd->in_napi_threaded_poll = false; - barrier(); - - if (sd_has_rps_ipi_waiting(sd)) { - local_irq_disable(); - net_rps_action_and_irq_enable(sd); - } - skb_defer_free_flush(sd); - local_bh_enable(); - - if (!repoll) - break; - - cond_resched(); - } - } return 0; } @@ -6697,8 +6870,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + - usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); - int budget = READ_ONCE(netdev_budget); + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); + int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); @@ -8370,27 +8543,29 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; + unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); - dev->flags |= IFF_PROMISC; - dev->promiscuity += inc; - if (dev->promiscuity == 0) { + promiscuity = dev->promiscuity + inc; + if (promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. */ - if (inc < 0) - dev->flags &= ~IFF_PROMISC; - else { - dev->promiscuity -= inc; + if (unlikely(inc > 0)) { netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_PROMISC; + } else { + flags = old_flags | IFF_PROMISC; } - if (dev->flags != old_flags) { + WRITE_ONCE(dev->promiscuity, promiscuity); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s promiscuous mode\n", dev->flags & IFF_PROMISC ? 
"entered" : "left"); if (audit_enabled) { @@ -8441,25 +8616,27 @@ EXPORT_SYMBOL(dev_set_promiscuity); static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; + unsigned int allmulti, flags; ASSERT_RTNL(); - dev->flags |= IFF_ALLMULTI; - dev->allmulti += inc; - if (dev->allmulti == 0) { + allmulti = dev->allmulti + inc; + if (allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ - if (inc < 0) - dev->flags &= ~IFF_ALLMULTI; - else { - dev->allmulti -= inc; + if (unlikely(inc > 0)) { netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_ALLMULTI; + } else { + flags = old_flags | IFF_ALLMULTI; } - if (dev->flags ^ old_flags) { + WRITE_ONCE(dev->allmulti, allmulti); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s allmulticast mode\n", dev->flags & IFF_ALLMULTI ? "entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); @@ -8541,12 +8718,12 @@ unsigned int dev_get_flags(const struct net_device *dev) { unsigned int flags; - flags = (dev->flags & ~(IFF_PROMISC | + flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | - (dev->gflags & (IFF_PROMISC | + (READ_ONCE(dev->gflags) & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { @@ -8785,7 +8962,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) return -ERANGE; if (new_len != orig_len) { - dev->tx_queue_len = new_len; + WRITE_ONCE(dev->tx_queue_len, new_len); res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); res = notifier_to_errno(res); if (res) @@ -8799,7 +8976,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) err_rollback: netdev_err(dev, "refused to change device tx_queue_len\n"); - dev->tx_queue_len = orig_len; + WRITE_ONCE(dev->tx_queue_len, orig_len); return res; } @@ -8869,7 +9046,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, } EXPORT_SYMBOL(dev_set_mac_address); -static DECLARE_RWSEM(dev_addr_sem); +DECLARE_RWSEM(dev_addr_sem); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) @@ -9029,28 +9206,6 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) } EXPORT_SYMBOL(netdev_port_same_parent_id); -static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin) -{ -#if IS_ENABLED(CONFIG_DPLL) - rtnl_lock(); - dev->dpll_pin = dpll_pin; - rtnl_unlock(); -#endif -} - -void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin) -{ - WARN_ON(!dpll_pin); - netdev_dpll_pin_assign(dev, dpll_pin); -} -EXPORT_SYMBOL(netdev_dpll_pin_set); - -void netdev_dpll_pin_clear(struct net_device *dev) -{ - netdev_dpll_pin_assign(dev, NULL); -} -EXPORT_SYMBOL(netdev_dpll_pin_clear); - /** * dev_change_proto_down - set carrier according to proto_down. 
* @@ -9067,7 +9222,7 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) netif_carrier_off(dev); else netif_carrier_on(dev); - dev->proto_down = proto_down; + WRITE_ONCE(dev->proto_down, proto_down); return 0; } @@ -9081,18 +9236,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value) { + u32 proto_down_reason; int b; if (!mask) { - dev->proto_down_reason = value; + proto_down_reason = value; } else { + proto_down_reason = dev->proto_down_reason; for_each_set_bit(b, &mask, 32) { if (value & (1 << b)) - dev->proto_down_reason |= BIT(b); + proto_down_reason |= BIT(b); else - dev->proto_down_reason &= ~BIT(b); + proto_down_reason &= ~BIT(b); } } + WRITE_ONCE(dev->proto_down_reason, proto_down_reason); } struct bpf_xdp_link { @@ -9645,11 +9803,11 @@ static void dev_index_release(struct net *net, int ifindex) /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); +atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); - atomic_inc(&dev_net(dev)->dev_unreg_count); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -10049,6 +10207,54 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); +static int netdev_do_alloc_pcpu_stats(struct net_device *dev) +{ + void __percpu *v; + + /* Drivers implementing ndo_get_peer_dev must support tstat + * accounting, so that skb_do_redirect() can bump the dev's + * RX stats upon network namespace switch. + */ + if (dev->netdev_ops->ndo_get_peer_dev && + dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) + return -EOPNOTSUPP; + + switch (dev->pcpu_stat_type) { + case NETDEV_PCPU_STAT_NONE: + return 0; + case NETDEV_PCPU_STAT_LSTATS: + v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); + break; + case NETDEV_PCPU_STAT_TSTATS: + v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + break; + case NETDEV_PCPU_STAT_DSTATS: + v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); + break; + default: + return -EINVAL; + } + + return v ? 0 : -ENOMEM; +} + +static void netdev_do_free_pcpu_stats(struct net_device *dev) +{ + switch (dev->pcpu_stat_type) { + case NETDEV_PCPU_STAT_NONE: + return; + case NETDEV_PCPU_STAT_LSTATS: + free_percpu(dev->lstats); + break; + case NETDEV_PCPU_STAT_TSTATS: + free_percpu(dev->tstats); + break; + case NETDEV_PCPU_STAT_DSTATS: + free_percpu(dev->dstats); + break; + } +} + /** * register_netdevice() - register a network device * @dev: device to register @@ -10109,9 +10315,13 @@ int register_netdevice(struct net_device *dev) goto err_uninit; } + ret = netdev_do_alloc_pcpu_stats(dev); + if (ret) + goto err_uninit; + ret = dev_index_reserve(net, dev->ifindex); if (ret < 0) - goto err_uninit; + goto err_free_pcpu; dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable @@ -10162,9 +10372,9 @@ int register_netdevice(struct net_device *dev) goto err_ifindex_release; ret = netdev_register_kobject(dev); - write_lock(&dev_base_lock); - dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; - write_unlock(&dev_base_lock); + + WRITE_ONCE(dev->reg_state, ret ? 
NETREG_UNREGISTERED : NETREG_REGISTERED); + if (ret) goto err_uninit_notify; @@ -10217,6 +10427,8 @@ err_uninit_notify: call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); err_ifindex_release: dev_index_release(net, dev->ifindex); +err_free_pcpu: + netdev_do_free_pcpu_stats(dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -10228,25 +10440,12 @@ err_free_name: } EXPORT_SYMBOL(register_netdevice); -/** - * init_dummy_netdev - init a dummy network device for NAPI - * @dev: device to init - * - * This takes a network device structure and initialize the minimum - * amount of fields so it can be used to schedule NAPI polls without - * registering a full blown interface. This is to be used by drivers - * that need to tie several hardware interfaces to a single NAPI - * poll scheduler due to HW limitations. +/* Initialize the core of a dummy net device. + * This is useful if you are calling this function after alloc_netdev(), + * since it does not memset the net_device fields. */ -int init_dummy_netdev(struct net_device *dev) +static void init_dummy_netdev_core(struct net_device *dev) { - /* Clear everything. Note we don't initialize spinlocks - * are they aren't supposed to be taken by any of the - * NAPI code and this dummy netdev is supposed to be - * only ever used for NAPI polls - */ - memset(dev, 0, sizeof(struct net_device)); - /* make sure we BUG if trying to hit standard * register/unregister code path */ @@ -10266,12 +10465,30 @@ int init_dummy_netdev(struct net_device *dev) * because users of this 'device' dont need to change * its refcount. */ +} - return 0; +/** + * init_dummy_netdev - init a dummy network device for NAPI + * @dev: device to init + * + * This takes a network device structure and initializes the minimum + * amount of fields so it can be used to schedule NAPI polls without + * registering a full blown interface. This is to be used by drivers + * that need to tie several hardware interfaces to a single NAPI + * poll scheduler due to HW limitations. + */ +void init_dummy_netdev(struct net_device *dev) +{ + /* Clear everything. 
Note we don't initialize spinlocks + * as they aren't supposed to be taken by any of the + * NAPI code and this dummy netdev is supposed to be + * only ever used for NAPI polls + */ + memset(dev, 0, sizeof(struct net_device)); + init_dummy_netdev_core(dev); } EXPORT_SYMBOL_GPL(init_dummy_netdev); - /** * register_netdev - register a network device * @dev: device to register @@ -10369,8 +10586,9 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list) rebroadcast_time = jiffies; } + rcu_barrier(); + if (!wait) { - rcu_barrier(); wait = WAIT_REFS_MIN_MSECS; } else { msleep(wait); @@ -10422,6 +10640,7 @@ void netdev_run_todo(void) { struct net_device *dev, *tmp; struct list_head list; + int cnt; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; @@ -10452,12 +10671,11 @@ void netdev_run_todo(void) continue; } - write_lock(&dev_base_lock); - dev->reg_state = NETREG_UNREGISTERED; - write_unlock(&dev_base_lock); - linkwatch_forget_dev(dev); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); + linkwatch_sync_dev(dev); } + cnt = 0; while (!list_empty(&list)) { dev = netdev_wait_allrefs_any(&list); list_del(&dev->todo_list); @@ -10469,17 +10687,19 @@ void netdev_run_todo(void) WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); + netdev_do_free_pcpu_stats(dev); if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) free_netdev(dev); - if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) - wake_up(&netdev_unregistering_wq); + cnt++; /* Free network device */ kobject_put(&dev->dev.kobj); } + if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count)) + wake_up(&netdev_unregistering_wq); } /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has @@ -10556,6 +10776,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, ops->ndo_get_stats64(dev, storage); } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); + } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { + dev_get_tstats64(dev, storage); } else { netdev_stats_to_stats64(storage, &dev->stats); } @@ -10864,13 +11086,14 @@ void free_netdev(struct net_device *dev) dev->xdp_bulkq = NULL; /* Compatibility with error handling in drivers */ - if (dev->reg_state == NETREG_UNINITIALIZED) { + if (dev->reg_state == NETREG_UNINITIALIZED || + dev->reg_state == NETREG_DUMMY) { netdev_freemem(dev); return; } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); - dev->reg_state = NETREG_RELEASED; + WRITE_ONCE(dev->reg_state, NETREG_RELEASED); /* will free via device release */ put_device(&dev->dev); @@ -10878,6 +11101,19 @@ void free_netdev(struct net_device *dev) EXPORT_SYMBOL(free_netdev); /** + * alloc_netdev_dummy - Allocate and initialize a dummy net device. + * @sizeof_priv: size of private data to allocate space for + * + * Return: the allocated net_device on success, NULL otherwise + */ +struct net_device *alloc_netdev_dummy(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, + init_dummy_netdev_core); +} +EXPORT_SYMBOL_GPL(alloc_netdev_dummy); + +/** * synchronize_net - Synchronize with packet receive processing * * Wait for packets currently being received to be done. 
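The new alloc_netdev_dummy() helper (added in the hunk above) replaces the pattern of embedding a struct net_device and calling init_dummy_netdev() on it; free_netdev() now also recognises NETREG_DUMMY and releases the allocation. A minimal usage sketch follows — editor's illustration, not taken from the patch; the driver structure, probe/remove functions and poll callback are hypothetical:

#include <linux/netdevice.h>

/* Hypothetical driver that only needs a NAPI anchor, not a real interface. */
struct my_priv {
	struct napi_struct napi;
};

static struct net_device *my_dummy;

static int my_poll(struct napi_struct *napi, int budget)
{
	/* hypothetical: service hardware here, then complete */
	napi_complete_done(napi, 0);
	return 0;
}

static int my_probe(void)
{
	struct my_priv *priv;

	/* was: static struct net_device + init_dummy_netdev() */
	my_dummy = alloc_netdev_dummy(sizeof(*priv));
	if (!my_dummy)
		return -ENOMEM;

	priv = netdev_priv(my_dummy);
	netif_napi_add(my_dummy, &priv->napi, my_poll);
	return 0;
}

static void my_remove(void)
{
	/* free_netdev() handles reg_state == NETREG_DUMMY since this change */
	free_netdev(my_dummy);
}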
@@ -10926,6 +11162,7 @@ void unregister_netdevice_many_notify(struct list_head *head, { struct net_device *dev, *tmp; LIST_HEAD(close_head); + int cnt = 0; BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -10957,10 +11194,8 @@ void unregister_netdevice_many_notify(struct list_head *head, list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ - write_lock(&dev_base_lock); - unlist_netdevice(dev, false); - dev->reg_state = NETREG_UNREGISTERING; - write_unlock(&dev_base_lock); + unlist_netdevice(dev); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); } flush_all_backlogs(); @@ -11022,7 +11257,9 @@ void unregister_netdevice_many_notify(struct list_head *head, list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); net_set_todo(dev); + cnt++; } + atomic_add(cnt, &dev_unreg_count); list_del(head); } @@ -11140,7 +11377,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_close(dev); /* And unlink it from device chain */ - unlist_netdevice(dev, true); + unlist_netdevice(dev); synchronize_net(); @@ -11179,17 +11416,23 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_net_set(dev, net); dev->ifindex = new_ifindex; - /* Send a netdev-add uevent to the new namespace */ - kobject_uevent(&dev->dev.kobj, KOBJ_ADD); - netdev_adjacent_add_links(dev); - - if (new_name[0]) /* Rename the netdev to prepared name */ + if (new_name[0]) { + /* Rename the netdev to prepared name */ + write_seqlock(&netdev_rename_lock); strscpy(dev->name, new_name, IFNAMSIZ); + write_sequnlock(&netdev_rename_lock); + } /* Fixup kobjects */ + dev_set_uevent_suppress(&dev->dev, 1); err = device_rename(&dev->dev, dev->name); + dev_set_uevent_suppress(&dev->dev, 0); WARN_ON(err); + /* Send a netdev-add uevent to the new namespace */ + kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + netdev_adjacent_add_links(dev); + /* Adapt owner in case owning user namespace of target network * namespace is different from the original one. 
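With dev->name now rewritten under netdev_rename_lock (a seqlock, per the write_seqlock/write_sequnlock pair above), lockless readers can obtain a tear-free copy by retrying the read section. A rough reader-side sketch — editor's illustration only; the function name is hypothetical, the lock is the one introduced by this series:

static void my_copy_netdev_name(const struct net_device *dev, char *buf)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&netdev_rename_lock);
		strscpy(buf, dev->name, IFNAMSIZ);
	} while (read_seqretry(&netdev_rename_lock, seq));
}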
*/ @@ -11253,7 +11496,7 @@ static int dev_cpu_dead(unsigned int oldcpu) list_del_init(&napi->poll_list); if (napi->poll == process_backlog) - napi->state = 0; + napi->state &= NAPIF_STATE_THREADED; else ____napi_schedule(sd, napi); } @@ -11261,21 +11504,23 @@ static int dev_cpu_dead(unsigned int oldcpu) raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + if (!use_backlog_threads()) { #ifdef CONFIG_RPS - remsd = oldsd->rps_ipi_list; - oldsd->rps_ipi_list = NULL; + remsd = oldsd->rps_ipi_list; + oldsd->rps_ipi_list = NULL; #endif - /* send out pending IPI's on offline CPU */ - net_rps_send_ipi(remsd); + /* send out pending IPI's on offline CPU */ + net_rps_send_ipi(remsd); + } /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } return 0; @@ -11449,6 +11694,7 @@ static struct pernet_operations __net_initdata netdev_net_ops = { static void __net_exit default_device_exit_net(struct net *net) { + struct netdev_name_node *name_node, *tmp; struct net_device *dev, *aux; /* * Push all migratable network devices back to the @@ -11471,6 +11717,11 @@ static void __net_exit default_device_exit_net(struct net *net) snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (netdev_name_in_use(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); + + netdev_for_each_altname_safe(dev, name_node, tmp) + if (netdev_name_in_use(&init_net, name_node->name)) + __netdev_name_node_alt_destroy(name_node); + err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", @@ -11513,6 +11764,63 @@ static struct pernet_operations __net_initdata default_device_ops = { .exit_batch = default_device_exit_batch, }; +static void __init net_dev_struct_check(void) +{ + /* TX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq); +#ifdef CONFIG_XPS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps); +#endif +#ifdef CONFIG_NETFILTER_EGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress); +#endif +#ifdef CONFIG_NET_XGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress); +#endif + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, 
net_device_read_tx, 160); + + /* TXRX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46); + + /* RX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net); +#ifdef CONFIG_NETPOLL + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo); +#endif +#ifdef CONFIG_NET_XGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); +#endif + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104); +} + /* * Initialize the DEV module. 
At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not @@ -11520,6 +11828,60 @@ static struct pernet_operations __net_initdata default_device_ops = { * */ +/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */ +#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE) + +static int net_page_pool_create(int cpuid) +{ +#if IS_ENABLED(CONFIG_PAGE_POOL) + struct page_pool_params page_pool_params = { + .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, + .flags = PP_FLAG_SYSTEM_POOL, + .nid = cpu_to_mem(cpuid), + }; + struct page_pool *pp_ptr; + + pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); + if (IS_ERR(pp_ptr)) + return -ENOMEM; + + per_cpu(system_page_pool, cpuid) = pp_ptr; +#endif + return 0; +} + +static int backlog_napi_should_run(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); +} + +static void run_backlog_napi(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + + napi_threaded_poll_loop(&sd->backlog); +} + +static void backlog_napi_setup(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + napi->thread = this_cpu_read(backlog_napi); + set_bit(NAPI_STATE_THREADED, &napi->state); +} + +static struct smp_hotplug_thread backlog_threads = { + .store = &backlog_napi, + .thread_should_run = backlog_napi_should_run, + .thread_fn = run_backlog_napi, + .thread_comm = "backlog_napi/%u", + .setup = backlog_napi_setup, +}; + /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. @@ -11530,13 +11892,14 @@ static int __init net_dev_init(void) BUG_ON(!dev_boot_phase); + net_dev_struct_check(); + if (dev_proc_init()) goto out; if (netdev_kobject_init()) goto out; - INIT_LIST_HEAD(&ptype_all); for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); @@ -11570,7 +11933,13 @@ static int __init net_dev_init(void) init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; + INIT_LIST_HEAD(&sd->backlog.poll_list); + + if (net_page_pool_create(i)) + goto out; } + if (use_backlog_threads()) + smpboot_register_percpu_thread(&backlog_threads); dev_boot_phase = 0; @@ -11596,7 +11965,24 @@ static int __init net_dev_init(void) NULL, dev_cpu_dead); WARN_ON(rc < 0); rc = 0; + + /* avoid static key IPIs to isolated CPUs */ + if (housekeeping_enabled(HK_TYPE_MISC)) + net_enable_timestamp(); out: + if (rc < 0) { + for_each_possible_cpu(i) { + struct page_pool *pp_ptr; + + pp_ptr = per_cpu(system_page_pool, i); + if (!pp_ptr) + continue; + + page_pool_destroy(pp_ptr); + per_cpu(system_page_pool, i) = NULL; + } + } + return rc; } diff --git a/net/core/dev.h b/net/core/dev.h index 5aa45f0fd4ae..b7b518bc2be5 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -3,11 +3,10 @@ #define _NET_CORE_DEV_H #include <linux/types.h> +#include <linux/rwsem.h> +#include <linux/netdevice.h> struct net; -struct net_device; -struct netdev_bpf; -struct netdev_phys_item_id; struct netlink_ext_ack; struct cpumask; @@ -30,7 +29,6 @@ int __init dev_proc_init(void); #endif void linkwatch_init_dev(struct net_device *dev); -void linkwatch_forget_dev(struct net_device *dev); void linkwatch_run_queue(void); void dev_addr_flush(struct net_device *dev); @@ -38,15 +36,13 @@ int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device 
*dev); /* sysctls not referred to from outside net/core/ */ -extern int netdev_budget; -extern unsigned int netdev_budget_usecs; -extern unsigned int sysctl_skb_defer_max; -extern int netdev_tstamp_prequeue; extern int netdev_unregister_timeout_secs; extern int weight_p; extern int dev_weight_rx_bias; extern int dev_weight_tx_bias; +extern struct rw_semaphore dev_addr_sem; + /* rtnl helpers */ extern struct list_head net_todo_list; void netdev_run_todo(void); @@ -57,6 +53,7 @@ struct netdev_name_node { struct list_head list; struct net_device *dev; const char *name; + struct rcu_head rcu; }; int netdev_get_name(struct net *net, char *name, int ifindex); @@ -64,6 +61,9 @@ int dev_change_name(struct net_device *dev, const char *newname); #define netdev_for_each_altname(dev, namenode) \ list_for_each_entry((namenode), &(dev)->name_node->list, list) +#define netdev_for_each_altname_safe(dev, namenode, next) \ + list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \ + list) int netdev_name_node_alt_create(struct net_device *dev, const char *name); int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); @@ -145,4 +145,25 @@ void xdp_do_check_flushed(struct napi_struct *napi); #else static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif + +struct napi_struct *napi_by_id(unsigned int napi_id); +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); + +#define XMIT_RECURSION_LIMIT 8 +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} + #endif diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c index 4dbd0dc6aea2..8e1dba825e94 100644 --- a/net/core/dev_addr_lists_test.c +++ b/net/core/dev_addr_lists_test.c @@ -49,7 +49,6 @@ static int dev_addr_test_init(struct kunit *test) KUNIT_FAIL(test, "Can't register netdev %d", err); } - rtnl_lock(); return 0; } @@ -57,7 +56,6 @@ static void dev_addr_test_exit(struct kunit *test) { struct net_device *netdev = test->priv; - rtnl_unlock(); unregister_netdev(netdev); free_netdev(netdev); } @@ -67,6 +65,7 @@ static void dev_addr_test_basic(struct kunit *test) struct net_device *netdev = test->priv; u8 addr[ETH_ALEN]; + rtnl_lock(); KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr); memset(addr, 2, sizeof(addr)); @@ -76,6 +75,7 @@ static void dev_addr_test_basic(struct kunit *test) memset(addr, 3, sizeof(addr)); dev_addr_set(netdev, addr); KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr)); + rtnl_unlock(); } static void dev_addr_test_sync_one(struct kunit *test) @@ -86,6 +86,7 @@ static void dev_addr_test_sync_one(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); memset(addr, 1, sizeof(addr)); eth_hw_addr_set(netdev, addr); @@ -103,6 +104,7 @@ static void dev_addr_test_sync_one(struct kunit *test) * considered synced and we overwrite in place. 
*/ KUNIT_EXPECT_EQ(test, 0, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_add_del(struct kunit *test) @@ -114,6 +116,7 @@ static void dev_addr_test_add_del(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); for (i = 1; i < 4; i++) { memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, @@ -143,6 +146,7 @@ static void dev_addr_test_add_del(struct kunit *test) __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, dev_addr_test_unsync); KUNIT_EXPECT_EQ(test, 1, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_del_main(struct kunit *test) @@ -150,6 +154,7 @@ static void dev_addr_test_del_main(struct kunit *test) struct net_device *netdev = test->priv; u8 addr[ETH_ALEN]; + rtnl_lock(); memset(addr, 1, sizeof(addr)); eth_hw_addr_set(netdev, addr); @@ -161,6 +166,7 @@ static void dev_addr_test_del_main(struct kunit *test) NETDEV_HW_ADDR_T_LAN)); KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, NETDEV_HW_ADDR_T_LAN)); + rtnl_unlock(); } static void dev_addr_test_add_set(struct kunit *test) @@ -172,6 +178,7 @@ static void dev_addr_test_add_set(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); /* There is no external API like dev_addr_add_excl(), * so shuffle the tree a little bit and exploit aliasing. */ @@ -191,6 +198,7 @@ static void dev_addr_test_add_set(struct kunit *test) __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, dev_addr_test_unsync); KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_add_excl(struct kunit *test) @@ -199,6 +207,7 @@ static void dev_addr_test_add_excl(struct kunit *test) u8 addr[ETH_ALEN]; int i; + rtnl_lock(); for (i = 0; i < 10; i++) { memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr)); @@ -213,6 +222,7 @@ static void dev_addr_test_add_excl(struct kunit *test) memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); } + rtnl_unlock(); } static struct kunit_case dev_addr_test_cases[] = { diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index feeddf95f450..9a66cf5015f2 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -322,9 +322,9 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in * dev->priv_flags. 
*/ -static int dev_set_hwtstamp_phylib(struct net_device *dev, - struct kernel_hwtstamp_config *cfg, - struct netlink_ext_ack *extack) +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; bool phy_ts = phy_has_hwtstamp(dev->phydev); @@ -363,6 +363,7 @@ static int dev_set_hwtstamp_phylib(struct net_device *dev, return 0; } +EXPORT_SYMBOL_GPL(dev_set_hwtstamp_phylib); static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index aff31cd944c2..430ed18f8584 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -74,7 +74,7 @@ struct net_dm_hw_entries { }; struct per_cpu_dm_data { - spinlock_t lock; /* Protects 'skb', 'hw_entries' and + raw_spinlock_t lock; /* Protects 'skb', 'hw_entries' and * 'send_timer' */ union { @@ -168,9 +168,9 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) err: mod_timer(&data->send_timer, jiffies + HZ / 10); out: - spin_lock_irqsave(&data->lock, flags); + raw_spin_lock_irqsave(&data->lock, flags); swap(data->skb, skb); - spin_unlock_irqrestore(&data->lock, flags); + raw_spin_unlock_irqrestore(&data->lock, flags); if (skb) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; @@ -183,7 +183,7 @@ out: } static const struct genl_multicast_group dropmon_mcgrps[] = { - { .name = "events", }, + { .name = "events", .flags = GENL_MCAST_CAP_SYS_ADMIN, }, }; static void send_dm_alert(struct work_struct *work) @@ -225,7 +225,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) local_irq_save(flags); data = this_cpu_ptr(&dm_cpu_data); - spin_lock(&data->lock); + raw_spin_lock(&data->lock); dskb = data->skb; if (!dskb) @@ -259,7 +259,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) } out: - spin_unlock_irqrestore(&data->lock, flags); + raw_spin_unlock_irqrestore(&data->lock, flags); } static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, @@ -314,9 +314,9 @@ net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) mod_timer(&hw_data->send_timer, jiffies + HZ / 10); } - spin_lock_irqsave(&hw_data->lock, flags); + raw_spin_lock_irqsave(&hw_data->lock, flags); swap(hw_data->hw_entries, hw_entries); - spin_unlock_irqrestore(&hw_data->lock, flags); + raw_spin_unlock_irqrestore(&hw_data->lock, flags); return hw_entries; } @@ -448,7 +448,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, return; hw_data = this_cpu_ptr(&dm_hw_cpu_data); - spin_lock_irqsave(&hw_data->lock, flags); + raw_spin_lock_irqsave(&hw_data->lock, flags); hw_entries = hw_data->hw_entries; if (!hw_entries) @@ -477,7 +477,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, } out: - spin_unlock_irqrestore(&hw_data->lock, flags); + raw_spin_unlock_irqrestore(&hw_data->lock, flags); } static const struct net_dm_alert_ops net_dm_alert_summary_ops = { @@ -1619,11 +1619,13 @@ static const struct genl_small_ops dropmon_ops[] = { .cmd = NET_DM_CMD_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, + .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_STOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, + .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_CONFIG_GET, @@ -1671,7 +1673,7 @@ static struct notifier_block dropmon_net_notifier = { static void __net_dm_cpu_data_init(struct 
per_cpu_dm_data *data) { - spin_lock_init(&data->lock); + raw_spin_lock_init(&data->lock); skb_queue_head_init(&data->drop_queue); u64_stats_init(&data->stats.syncp); } diff --git a/net/core/dst.c b/net/core/dst.c index 6838d3212c37..95f533844f17 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -96,7 +96,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, } EXPORT_SYMBOL(dst_alloc); -struct dst_entry *dst_destroy(struct dst_entry * dst) +static void dst_destroy(struct dst_entry *dst) { struct dst_entry *child = NULL; @@ -126,15 +126,13 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) dst = child; if (dst) dst_release_immediate(dst); - return NULL; } -EXPORT_SYMBOL(dst_destroy); static void dst_destroy_rcu(struct rcu_head *head) { struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst = dst_destroy(dst); + dst_destroy(dst); } /* Operations to mark dst as DEAD and clean up the net device referenced diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 0ccfd5fa5cb9..6a0482e676d3 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -47,7 +47,8 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, /* the cache already hold a dst reference; it can't go away */ dst_hold(dst); - if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || + if (unlikely(!time_after(idst->refresh_ts, + READ_ONCE(dst_cache->reset_ts)) || (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { dst_cache_per_cpu_dst_set(idst, NULL, 0); dst_release(dst); @@ -83,7 +84,7 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) return NULL; *saddr = idst->in_saddr.s_addr; - return container_of(dst, struct rtable, dst); + return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); @@ -111,8 +112,8 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, return; idst = this_cpu_ptr(dst_cache->cache); - dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, - rt6_get_cookie((struct rt6_info *)dst)); + dst_cache_per_cpu_dst_set(idst, dst, + rt6_get_cookie(dst_rt6_info(dst))); idst->in6_saddr = *saddr; } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); @@ -170,7 +171,7 @@ void dst_cache_reset_now(struct dst_cache *dst_cache) if (!dst_cache->cache) return; - dst_cache->reset_ts = jiffies; + dst_cache_reset(dst_cache); for_each_possible_cpu(i) { struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i); struct dst_entry *dst = idst->dst; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 75282222e0b4..6ebffbc63236 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -53,7 +53,7 @@ bool fib_rule_matchall(const struct fib_rule *rule) EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, - u32 pref, u32 table, u32 flags) + u32 pref, u32 table) { struct fib_rule *r; @@ -65,7 +65,6 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; - r->flags = flags; r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; @@ -594,7 +593,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[FRA_TUN_ID]) nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); - err = -EINVAL; if (tb[FRA_L3MDEV] && fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) goto errout_free; @@ -1144,10 +1142,10 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) const struct nlmsghdr *nlh = cb->nlh; struct net *net = 
sock_net(skb->sk); struct fib_rules_ops *ops; - int idx = 0, family; + int err, idx = 0, family; if (cb->strict_check) { - int err = fib_valid_dumprule_req(nlh, cb->extack); + err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; @@ -1160,17 +1158,17 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) if (ops == NULL) return -EAFNOSUPPORT; - dump_rules(skb, cb, ops); - - return skb->len; + return dump_rules(skb, cb, ops); } + err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; - if (dump_rules(skb, cb, ops) < 0) + err = dump_rules(skb, cb, ops); + if (err < 0) break; cb->args[1] = 0; @@ -1180,7 +1178,7 @@ skip: rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static void notify_rule_change(int event, struct fib_rule *rule, @@ -1295,7 +1293,8 @@ static int __init fib_rules_init(void) int err; rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); + rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, + RTNL_FLAG_DUMP_UNLOCKED); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) diff --git a/net/core/filter.c b/net/core/filter.c index 21d75108c2e9..2510464692af 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -81,12 +81,17 @@ #include <net/xdp.h> #include <net/mptcp.h> #include <net/netfilter/nf_conntrack_bpf.h> +#include <net/netkit.h> #include <linux/un.h> +#include <net/xdp_sock_drv.h> #include "dev.h" +/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ +static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); + static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id); +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { @@ -202,7 +207,7 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; nla = (struct nlattr *) &skb->data[a]; - if (nla->nla_len > skb->len - a) + if (!nla_ok(nla, skb->len - a)) return 0; nla = nla_find_nested(nla, x); @@ -776,7 +781,7 @@ jmp_rest: BPF_EMIT_JMP; break; - /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ + /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: { struct sock_filter tmp = { .code = BPF_LD | BPF_ABS | BPF_B, @@ -802,7 +807,7 @@ jmp_rest: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; } - /* RET_K is remaped into 2 insns. RET_A case doesn't need an + /* RET_K is remapped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. 
*/ case BPF_RET | BPF_A: @@ -1218,8 +1223,8 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) */ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { + int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); u32 filter_size = bpf_prog_size(fp->prog->len); - int optmem_max = READ_ONCE(sysctl_optmem_max); /* same check as in sock_kmalloc() */ if (filter_size <= optmem_max && @@ -1549,12 +1554,13 @@ EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); - int err; + int err, optmem_max; if (IS_ERR(prog)) return PTR_ERR(prog); - if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + if (bpf_prog_size(prog->len) > optmem_max) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); @@ -1593,7 +1599,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog; - int err; + int err, optmem_max; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; @@ -1621,7 +1627,8 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ - if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) { + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + if (bpf_prog_size(prog->len) > optmem_max) { err = -ENOMEM; goto err_prog_put; } @@ -2211,7 +2218,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, rcu_read_lock(); if (!nh) { dst = skb_dst(skb); - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); } else { nexthop = &nh->ipv6_nh; @@ -2310,8 +2317,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, rcu_read_lock(); if (!nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); + struct rtable *rt = skb_rtable(skb); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { @@ -2468,6 +2474,16 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); +static struct net_device *skb_get_peer_dev(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (likely(ops->ndo_get_peer_dev)) + return INDIRECT_CALL_1(ops->ndo_get_peer_dev, + netkit_peer_dev, dev); + return NULL; +} + int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -2481,17 +2497,15 @@ int skb_do_redirect(struct sk_buff *skb) if (unlikely(!dev)) goto out_drop; if (flags & BPF_F_PEER) { - const struct net_device_ops *ops = dev->netdev_ops; - - if (unlikely(!ops->ndo_get_peer_dev || - !skb_at_tc_ingress(skb))) + if (unlikely(!skb_at_tc_ingress(skb))) goto out_drop; - dev = ops->ndo_get_peer_dev(dev); + dev = skb_get_peer_dev(dev); if (unlikely(!dev || !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; + dev_sw_netstats_rx_add(dev, skb->len); return -EAGAIN; } return flags & BPF_F_NEIGH ? 
@@ -2593,6 +2607,22 @@ BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) return 0; } +static void sk_msg_reset_curr(struct sk_msg *msg) +{ + u32 i = msg->sg.start; + u32 len = 0; + + do { + len += sk_msg_elem(msg, i)->length; + sk_msg_iter_var_next(i); + if (len >= msg->sg.size) + break; + } while (i != msg->sg.end); + + msg->sg.curr = i; + msg->sg.copybreak = 0; +} + static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .func = bpf_msg_cork_bytes, .gpl_only = false, @@ -2712,6 +2742,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; out: + sk_msg_reset_curr(msg); msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; msg->data_end = msg->data + bytes; return 0; @@ -2848,6 +2879,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, msg->sg.data[new] = rsge; } + sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } @@ -2938,7 +2970,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, * * Then if B is non-zero AND there is no space allocate space and * compact A, B regions into page. If there is space shift ring to - * the rigth free'ing the next element in ring to place B, leaving + * the right free'ing the next element in ring to place B, leaving * A untouched except to reduce length. */ if (start != offset) { @@ -3016,6 +3048,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, sk_mem_uncharge(msg->sk, len - pop); msg->sg.size -= (len - pop); + sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } @@ -4062,10 +4095,46 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); skb_frag_size_add(frag, offset); sinfo->xdp_frags_size += offset; + if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) + xsk_buff_get_tail(xdp)->data_end += offset; return 0; } +static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, + struct xdp_mem_info *mem_info, bool release) +{ + struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); + + if (release) { + xsk_buff_del_tail(zc_frag); + __xdp_return(NULL, mem_info, false, zc_frag); + } else { + zc_frag->data_end -= shrink; + } +} + +static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, + int shrink) +{ + struct xdp_mem_info *mem_info = &xdp->rxq->mem; + bool release = skb_frag_size(frag) == shrink; + + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { + bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release); + goto out; + } + + if (release) { + struct page *page = skb_frag_page(frag); + + __xdp_return(page_address(page), mem_info, false, NULL); + } + +out: + return release; +} + static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); @@ -4080,12 +4149,7 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) len_free += shrink; offset -= shrink; - - if (skb_frag_size(frag) == shrink) { - struct page *page = skb_frag_page(frag); - - __xdp_return(page_address(page), &xdp->rxq->mem, - false, NULL); + if (bpf_xdp_shrink_data(xdp, frag, shrink)) { n_frags_free++; } else { skb_frag_size_sub(frag, shrink); @@ -4298,10 +4362,12 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + u32 flags = ri->flags; struct bpf_map *map; int err; ri->map_id = 0; /* Valid 
map id idr range: [1,INT_MAX[ */ + ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (unlikely(!xdpf)) { @@ -4313,11 +4379,20 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - map = READ_ONCE(ri->map); - if (unlikely(map)) { + if (unlikely(flags & BPF_F_BROADCAST)) { + map = READ_ONCE(ri->map); + + /* The map pointer is cleared when the map is being torn + * down by bpf_clear_redirect_map() + */ + if (unlikely(!map)) { + err = -ENOENT; + break; + } + WRITE_ONCE(ri->map, NULL); err = dev_map_enqueue_multi(xdpf, dev, map, - ri->flags & BPF_F_EXCLUDE_INGRESS); + flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_enqueue(fwd, xdpf, dev); } @@ -4380,9 +4455,9 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, - void *fwd, - enum bpf_map_type map_type, u32 map_id) + struct bpf_prog *xdp_prog, void *fwd, + enum bpf_map_type map_type, u32 map_id, + u32 flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map; @@ -4392,11 +4467,20 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - map = READ_ONCE(ri->map); - if (unlikely(map)) { + if (unlikely(flags & BPF_F_BROADCAST)) { + map = READ_ONCE(ri->map); + + /* The map pointer is cleared when the map is being torn + * down by bpf_clear_redirect_map() + */ + if (unlikely(!map)) { + err = -ENOENT; + break; + } + WRITE_ONCE(ri->map, NULL); err = dev_map_redirect_multi(dev, skb, xdp_prog, map, - ri->flags & BPF_F_EXCLUDE_INGRESS); + flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_generic_redirect(fwd, skb, xdp_prog); } @@ -4433,9 +4517,11 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + u32 flags = ri->flags; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { @@ -4455,7 +4541,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, return 0; } - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id); + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags); err: _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; @@ -4600,7 +4686,7 @@ set_compat: to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; if (flags & BPF_F_TUNINFO_FLAGS) - to->tunnel_flags = info->key.tun_flags; + to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags); else to->tunnel_ext = 0; @@ -4643,7 +4729,7 @@ BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) int err; if (unlikely(!info || - !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { + !ip_tunnel_is_options_present(info->key.tun_flags))) { err = -ENOENT; goto err_clear; } @@ -4713,15 +4799,15 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, memset(info, 0, sizeof(*info)); info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; - if (flags & BPF_F_DONT_FRAGMENT) - info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; - if (flags & BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; - if (flags & BPF_F_SEQ_NUMBER) - 
info->key.tun_flags |= TUNNEL_SEQ; - if (flags & BPF_F_NO_TUNNEL_KEY) - info->key.tun_flags &= ~TUNNEL_KEY; + __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); + __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, + flags & BPF_F_DONT_FRAGMENT); + __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, + !(flags & BPF_F_ZERO_CSUM_TX)); + __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, + flags & BPF_F_SEQ_NUMBER); + __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, + !(flags & BPF_F_NO_TUNNEL_KEY)); info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -4759,13 +4845,15 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); + IP_TUNNEL_DECLARE_FLAGS(present) = { }; if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) return -EINVAL; if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; - ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); + ip_tunnel_set_options_present(present); + ip_tunnel_info_opts_set(info, from, size, present); return 0; } @@ -5822,7 +5910,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { - fl4.flowi4_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl4.flowi4_mark = params->mark; + else + fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); @@ -5926,7 +6017,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, return -ENODEV; idev = __in6_dev_get_safely(dev); - if (unlikely(!idev || !idev->cnf.forwarding)) + if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { @@ -5965,7 +6056,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { - fl6.flowi6_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl6.flowi6_mark = params->mark; + else + fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); @@ -6043,7 +6137,7 @@ set_fwd_params: #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ - BPF_FIB_LOOKUP_SRC) + BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) @@ -7229,7 +7323,6 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES - u32 cookie; int ret; if (unlikely(!sk || th_len < sizeof(*th))) @@ -7251,8 +7344,6 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; - cookie = ntohl(th->ack_seq) - 1; - /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). 
*/ @@ -7261,7 +7352,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; - ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); + ret = __cookie_v4_check((struct iphdr *)iph, th); break; #if IS_BUILTIN(CONFIG_IPV6) @@ -7272,7 +7363,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (sk->sk_family != AF_INET6) return -EINVAL; - ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); + ret = __cookie_v6_check((struct ipv6hdr *)iph, th); break; #endif /* CONFIG_IPV6 */ @@ -7725,9 +7816,7 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th) { - u32 cookie = ntohl(th->ack_seq) - 1; - - if (__cookie_v4_check(iph, th, cookie) > 0) + if (__cookie_v4_check(iph, th) > 0) return 0; return -EACCES; @@ -7748,9 +7837,7 @@ BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th) { #if IS_BUILTIN(CONFIG_IPV6) - u32 cookie = ntohl(th->ack_seq) - 1; - - if (__cookie_v6_check(iph, th, cookie) > 0) + if (__cookie_v6_check(iph, th) > 0) return 0; return -EACCES; @@ -7839,7 +7926,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -7932,7 +8019,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -7951,7 +8038,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8138,7 +8225,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8197,13 +8284,13 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The * kfuncs are defined in two different modules, and we want to be able - * to use them interchangably with the same BTF type ID. Because modules + * to use them interchangeably with the same BTF type ID. Because modules * can't de-duplicate BTF IDs between each other, we need the type to be * referenced in the vmlinux BTF or the verifier will get confused about * the different types. 
So we add this dummy type reference which will @@ -8258,7 +8345,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8287,8 +8374,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_event_output_data_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: @@ -8300,7 +8385,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cgroup_classid_curr_proto; #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8344,7 +8429,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_lookup_tcp_proto; #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8355,7 +8440,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8382,7 +8467,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8557,7 +8642,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; } @@ -8569,7 +8654,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; default: @@ -11213,7 +11298,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -11395,7 +11480,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -11729,7 +11814,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = { }; static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id) +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func; @@ -11758,18 +11843,16 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } - if (!perfmon_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; return func; } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux 
BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags, struct bpf_dynptr_kern *ptr__uninit) { @@ -11816,7 +11899,104 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, return 0; } -__diag_pop(); + +__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk, + struct bpf_tcp_req_attrs *attrs, int attrs__sz) +{ +#if IS_ENABLED(CONFIG_SYN_COOKIES) + const struct request_sock_ops *ops; + struct inet_request_sock *ireq; + struct tcp_request_sock *treq; + struct request_sock *req; + struct net *net; + __u16 min_mss; + u32 tsoff = 0; + + if (attrs__sz != sizeof(*attrs) || + attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2]) + return -EINVAL; + + if (!skb_at_tc_ingress(skb)) + return -EINVAL; + + net = dev_net(skb->dev); + if (net != sock_net(sk)) + return -ENETUNREACH; + + switch (skb->protocol) { + case htons(ETH_P_IP): + ops = &tcp_request_sock_ops; + min_mss = 536; + break; +#if IS_BUILTIN(CONFIG_IPV6) + case htons(ETH_P_IPV6): + ops = &tcp6_request_sock_ops; + min_mss = IPV6_MIN_MTU - 60; + break; +#endif + default: + return -EINVAL; + } + + if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN || + sk_is_mptcp(sk)) + return -EINVAL; + + if (attrs->mss < min_mss) + return -EINVAL; + + if (attrs->wscale_ok) { + if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) + return -EINVAL; + + if (attrs->snd_wscale > TCP_MAX_WSCALE || + attrs->rcv_wscale > TCP_MAX_WSCALE) + return -EINVAL; + } + + if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) + return -EINVAL; + + if (attrs->tstamp_ok) { + if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) + return -EINVAL; + + tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns()); + } + + req = inet_reqsk_alloc(ops, sk, false); + if (!req) + return -ENOMEM; + + ireq = inet_rsk(req); + treq = tcp_rsk(req); + + req->rsk_listener = sk; + req->syncookie = 1; + req->mss = attrs->mss; + req->ts_recent = attrs->rcv_tsval; + + ireq->snd_wscale = attrs->snd_wscale; + ireq->rcv_wscale = attrs->rcv_wscale; + ireq->tstamp_ok = !!attrs->tstamp_ok; + ireq->sack_ok = !!attrs->sack_ok; + ireq->wscale_ok = !!attrs->wscale_ok; + ireq->ecn_ok = !!attrs->ecn_ok; + + treq->req_usec_ts = !!attrs->usec_ts_ok; + treq->ts_off = tsoff; + + skb_orphan(skb); + skb->sk = req_to_sk(req); + skb->destructor = sock_pfree; + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +__bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, struct bpf_dynptr_kern *ptr__uninit) @@ -11832,17 +12012,21 @@ int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, return 0; } -BTF_SET8_START(bpf_kfunc_check_set_skb) +BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb) -BTF_SET8_END(bpf_kfunc_check_set_skb) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb) -BTF_SET8_START(bpf_kfunc_check_set_xdp) +BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) -BTF_SET8_END(bpf_kfunc_check_set_xdp) +BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) -BTF_SET8_START(bpf_kfunc_check_set_sock_addr) +BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) -BTF_SET8_END(bpf_kfunc_check_set_sock_addr) +BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) + +BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) +BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = 
THIS_MODULE, @@ -11859,6 +12043,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { .set = &bpf_kfunc_check_set_sock_addr, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_tcp_reqsk, +}; + static int __init bpf_kfunc_init(void) { int ret; @@ -11874,15 +12063,13 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); - return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, - &bpf_kfunc_set_sock_addr); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + &bpf_kfunc_set_sock_addr); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); } late_initcall(bpf_kfunc_init); -/* Disables missing prototype warnings */ -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. * @@ -11916,11 +12103,11 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) return sk->sk_prot->diag_destroy(sk, ECONNABORTED); } -__diag_pop() +__bpf_kfunc_end_defs(); -BTF_SET8_START(bpf_sk_iter_kfunc_ids) +BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) -BTF_SET8_END(bpf_sk_iter_kfunc_ids) +BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 272f09251343..f82e9a7d3b37 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -455,17 +455,25 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { struct flow_dissector_key_enc_opts *enc_opt; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + u32 val; enc_opt = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, target_container); - if (info->options_len) { - enc_opt->len = info->options_len; - ip_tunnel_info_opts_get(enc_opt->data, info); - enc_opt->dst_opt_type = info->key.tun_flags & - TUNNEL_OPTIONS_PRESENT; - } + if (!info->options_len) + return; + + enc_opt->len = info->options_len; + ip_tunnel_info_opts_get(enc_opt->data, info); + + ip_tunnel_set_options_present(flags); + ip_tunnel_flags_and(flags, info->key.tun_flags, flags); + + val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, + IP_TUNNEL_GENEVE_OPT_BIT); + enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? 
val : 0; } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); diff --git a/net/core/gro.c b/net/core/gro.c index 0759277dc14e..b3b43de1a650 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -3,6 +3,7 @@ #include <net/dst_metadata.h> #include <net/busy_poll.h> #include <trace/events/net.h> +#include <linux/skbuff_ref.h> #define MAX_GRO_SKBS 8 @@ -10,9 +11,6 @@ #define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(offload_lock); -struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); -/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ -int gro_normal_batch __read_mostly = 8; /** * dev_add_offload - register offload handlers @@ -31,7 +29,7 @@ void dev_add_offload(struct packet_offload *po) struct packet_offload *elem; spin_lock(&offload_lock); - list_for_each_entry(elem, &offload_base, list) { + list_for_each_entry(elem, &net_hotdata.offload_base, list) { if (po->priority < elem->priority) break; } @@ -55,7 +53,7 @@ EXPORT_SYMBOL(dev_add_offload); */ static void __dev_remove_offload(struct packet_offload *po) { - struct list_head *head = &offload_base; + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *po1; spin_lock(&offload_lock); @@ -195,8 +193,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) } merge: - /* sk owenrship - if any - completely transferred to the aggregated packet */ + /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; + skb->sk = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; @@ -232,12 +231,39 @@ done: return 0; } +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) +{ + if (unlikely(p->len + skb->len >= 65536)) + return -E2BIG; + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + + skb_pull(skb, skb_gro_offset(skb)); + + NAPI_GRO_CB(p)->last = skb; + NAPI_GRO_CB(p)->count++; + p->data_len += skb->len; + + /* sk ownership - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; + skb->sk = NULL; + p->truesize += skb->truesize; + p->len += skb->len; + + NAPI_GRO_CB(skb)->same_flow = 1; + + return 0; +} + static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &offload_base; int err = -ENOENT; BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); @@ -332,8 +358,6 @@ static void gro_list_prepare(const struct list_head *head, list_for_each_entry(p, head, list) { unsigned long diffs; - NAPI_GRO_CB(p)->flush = 0; - if (hash != skb_get_hash_raw(p)) { NAPI_GRO_CB(p)->same_flow = 0; continue; @@ -369,15 +393,22 @@ static void gro_list_prepare(const struct list_head *head, static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) { - const struct skb_shared_info *pinfo = skb_shinfo(skb); - const skb_frag_t *frag0 = &pinfo->frags[0]; + const struct skb_shared_info *pinfo; + const skb_frag_t *frag0; + unsigned int headlen; + NAPI_GRO_CB(skb)->network_offset = 0; NAPI_GRO_CB(skb)->data_offset = 0; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; + headlen = skb_headlen(skb); + NAPI_GRO_CB(skb)->frag0 = skb->data; + NAPI_GRO_CB(skb)->frag0_len = headlen; + if (headlen) + return; + + pinfo = skb_shinfo(skb); + frag0 = &pinfo->frags[0]; - if (!skb_headlen(skb) && pinfo->nr_frags && - 
!PageHighMem(skb_frag_page(frag0)) && + if (pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, @@ -438,7 +469,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); struct gro_list *gro_list = &napi->gro_hash[bucket]; - struct list_head *head = &offload_base; + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; struct sk_buff *pp = NULL; @@ -466,7 +497,6 @@ found_ptype: sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); - NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->count = 1; if (unlikely(skb_is_gso(skb))) { NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; @@ -544,7 +574,7 @@ normal: struct packet_offload *gro_find_receive_by_type(__be16 type) { - struct list_head *offload_head = &offload_base; + struct list_head *offload_head = &net_hotdata.offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { @@ -558,7 +588,7 @@ EXPORT_SYMBOL(gro_find_receive_by_type); struct packet_offload *gro_find_complete_by_type(__be16 type) { - struct list_head *offload_head = &offload_base; + struct list_head *offload_head = &net_hotdata.offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { @@ -700,7 +730,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) skb_reset_mac_header(skb); skb_gro_reset_offset(skb, hlen); - if (unlikely(skb_gro_header_hard(skb, hlen))) { + if (unlikely(!skb_gro_may_pull(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { net_warn_ratelimited("%s: dropping impossible skb from %s\n", @@ -710,7 +740,10 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) } } else { eth = (const struct ethhdr *)skb->data; - gro_pull_from_frag0(skb, hlen); + + if (NAPI_GRO_CB(skb)->frag0 != skb->data) + gro_pull_from_frag0(skb, hlen); + NAPI_GRO_CB(skb)->frag0 += hlen; NAPI_GRO_CB(skb)->frag0_len -= hlen; } diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index ed5ec5de47f6..ff8e5b64bf6b 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -3,6 +3,7 @@ #include <linux/slab.h> #include <linux/netdevice.h> #include <net/gro_cells.h> +#include <net/hotdata.h> struct gro_cell { struct sk_buff_head napi_skbs; @@ -26,7 +27,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) cell = this_cpu_ptr(gcells->cells); - if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { + if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) { drop: dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); diff --git a/net/core/gso.c b/net/core/gso.c index 9e1803bfc9c6..bcd156372f4d 100644 --- a/net/core/gso.c +++ b/net/core/gso.c @@ -17,7 +17,7 @@ struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, struct packet_offload *ptype; rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { + list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; @@ -48,7 +48,7 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, __skb_pull(skb, vlan_depth); rcu_read_lock(); - 
list_for_each_entry_rcu(ptype, &offload_base, list) { + list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; diff --git a/net/core/hotdata.c b/net/core/hotdata.c new file mode 100644 index 000000000000..d0aaaaa556f2 --- /dev/null +++ b/net/core/hotdata.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/cache.h> +#include <linux/jiffies.h> +#include <linux/list.h> +#include <net/hotdata.h> +#include <net/proto_memory.h> + +struct net_hotdata net_hotdata __cacheline_aligned = { + .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), + .ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all), + .gro_normal_batch = 8, + + .netdev_budget = 300, + /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ + .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ, + + .tstamp_prequeue = 1, + .max_backlog = 1000, + .dev_tx_weight = 64, + .dev_rx_weight = 64, + .sysctl_max_skb_frags = MAX_SKB_FRAGS, + .sysctl_skb_defer_max = 64, + .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE +}; +EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c new file mode 100644 index 000000000000..759a9b9f3f89 --- /dev/null +++ b/net/core/ieee8021q_helpers.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> + +#include <linux/array_size.h> +#include <linux/printk.h> +#include <linux/types.h> +#include <net/dscp.h> +#include <net/ieee8021q.h> + +/* The following arrays map Traffic Types (TT) to traffic classes (TC) for + * different number of queues as shown in the example provided by + * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and + * Table I-1 "Traffic type to traffic class mapping". 
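[Illustrative aside, not part of the patch] The new net/core/hotdata.c above consolidates several frequently read globals (offload_base, gro_normal_batch, max_backlog, dev_tx_weight, ...) into one cache-line-aligned structure. A minimal userspace sketch of the same idea follows; the struct and field names are placeholders, not the kernel's struct net_hotdata layout.

#include <stdio.h>

/* Illustrative only: group read-mostly tunables into a single aligned object
 * so the hot paths that read them touch as few cache lines as possible. */
struct hot_config {
	int gro_normal_batch;
	int netdev_budget;
	int max_backlog;
	int dev_tx_weight;
} __attribute__((aligned(64)));

static struct hot_config hot = {
	.gro_normal_batch = 8,
	.netdev_budget = 300,
	.max_backlog = 1000,
	.dev_tx_weight = 64,
};

int main(void)
{
	printf("batch=%d budget=%d backlog=%d tx_weight=%d\n",
	       hot.gro_normal_batch, hot.netdev_budget,
	       hot.max_backlog, hot.dev_tx_weight);
	return 0;
}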
+ */ +static const u8 ieee8021q_8queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, + [IEEE8021Q_TT_CA] = 3, + [IEEE8021Q_TT_VI] = 4, + [IEEE8021Q_TT_VO] = 5, + [IEEE8021Q_TT_IC] = 6, + [IEEE8021Q_TT_NC] = 7, +}; + +static const u8 ieee8021q_7queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, + [IEEE8021Q_TT_CA] = 3, + [IEEE8021Q_TT_VI] = 4, [IEEE8021Q_TT_VO] = 4, + [IEEE8021Q_TT_IC] = 5, + [IEEE8021Q_TT_NC] = 6, +}; + +static const u8 ieee8021q_6queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, [IEEE8021Q_TT_CA] = 2, + [IEEE8021Q_TT_VI] = 3, [IEEE8021Q_TT_VO] = 3, + [IEEE8021Q_TT_IC] = 4, + [IEEE8021Q_TT_NC] = 5, +}; + +static const u8 ieee8021q_5queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, + [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, + [IEEE8021Q_TT_IC] = 3, + [IEEE8021Q_TT_NC] = 4, +}; + +static const u8 ieee8021q_4queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, + [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, + [IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3, +}; + +static const u8 ieee8021q_3queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, + [IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2, +}; + +static const u8 ieee8021q_2queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, + [IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1, +}; + +static const u8 ieee8021q_1queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0, + [IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0, +}; + +/** + * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class + * @tt: IEEE 802.1Q Traffic Type + * @num_queues: Number of queues + * + * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based + * on the number of queues configured on the NIC. The mapping is based on the + * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic + * class mapping" and Table I-1 "Traffic type to traffic class mapping". + * + * Return: Traffic Class corresponding to the given Traffic Type or negative + * value in case of error. 
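[Illustrative aside, not part of the patch] To make the queue-count collapse concrete, here is a small self-contained C sketch reproducing the four-queue table from the hunk above: with only four TX queues, pairs of traffic types share a traffic class. The enum and array are local to the sketch; only the numeric mapping mirrors ieee8021q_4queue_tt_tc_map.

#include <stdio.h>

enum tt { TT_BK, TT_BE, TT_EE, TT_CA, TT_VI, TT_VO, TT_IC, TT_NC, TT_MAX };

static const unsigned char tt_to_tc_4q[TT_MAX] = {
	[TT_BK] = 0, [TT_BE] = 0,	/* background + best effort share class 0 */
	[TT_EE] = 1, [TT_CA] = 1,	/* excellent effort + critical apps        */
	[TT_VI] = 2, [TT_VO] = 2,	/* video + voice                           */
	[TT_IC] = 3, [TT_NC] = 3,	/* internetwork + network control          */
};

int main(void)
{
	for (int tt = 0; tt < TT_MAX; tt++)
		printf("traffic type %d -> traffic class %u\n", tt, tt_to_tc_4q[tt]);
	return 0;
}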
+ */ +int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues) +{ + if (tt < 0 || tt >= IEEE8021Q_TT_MAX) { + pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt, + IEEE8021Q_TT_MAX); + return -EINVAL; + } + + switch (num_queues) { + case 8: + compiletime_assert(ARRAY_SIZE(ieee8021q_8queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_8queue_tt_tc_map != max - 1"); + return ieee8021q_8queue_tt_tc_map[tt]; + case 7: + compiletime_assert(ARRAY_SIZE(ieee8021q_7queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_7queue_tt_tc_map != max - 1"); + + return ieee8021q_7queue_tt_tc_map[tt]; + case 6: + compiletime_assert(ARRAY_SIZE(ieee8021q_6queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_6queue_tt_tc_map != max - 1"); + + return ieee8021q_6queue_tt_tc_map[tt]; + case 5: + compiletime_assert(ARRAY_SIZE(ieee8021q_5queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_5queue_tt_tc_map != max - 1"); + + return ieee8021q_5queue_tt_tc_map[tt]; + case 4: + compiletime_assert(ARRAY_SIZE(ieee8021q_4queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_4queue_tt_tc_map != max - 1"); + + return ieee8021q_4queue_tt_tc_map[tt]; + case 3: + compiletime_assert(ARRAY_SIZE(ieee8021q_3queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_3queue_tt_tc_map != max - 1"); + + return ieee8021q_3queue_tt_tc_map[tt]; + case 2: + compiletime_assert(ARRAY_SIZE(ieee8021q_2queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_2queue_tt_tc_map != max - 1"); + + return ieee8021q_2queue_tt_tc_map[tt]; + case 1: + compiletime_assert(ARRAY_SIZE(ieee8021q_1queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_1queue_tt_tc_map != max - 1"); + + return ieee8021q_1queue_tt_tc_map[tt]; + } + + pr_err("Invalid number of queues %d\n", num_queues); + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc); + +/** + * ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type + * @dscp: IETF DSCP value + * + * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT). + * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic + * Type, this function is inspired by the RFC8325 documentation which describe + * the mapping between DSCP and 802.11 User Priority (UP) values. + * + * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value + */ +int ietf_dscp_to_ieee8021q_tt(u8 dscp) +{ + switch (dscp) { + case DSCP_CS0: + /* Comment from RFC8325: + * [RFC4594], Section 4.8, recommends High-Throughput Data be marked + * AF1x (that is, AF11, AF12, and AF13, according to the rules defined + * in [RFC2475]). + * + * By default (as described in Section 2.3), High-Throughput Data will + * map to UP 1 and, thus, to the Background Access Category (AC_BK), + * which is contrary to the intent expressed in [RFC4594]. + + * Unfortunately, there really is no corresponding fit for the High- + * Throughput Data service class within the constrained 4 Access + * Category [IEEE.802.11-2016] model. If the High-Throughput Data + * service class is assigned to the Best Effort Access Category (AC_BE), + * then it would contend with Low-Latency Data (while [RFC4594] + * recommends a distinction in servicing between these service classes) + * as well as with the default service class; alternatively, if it is + * assigned to the Background Access Category (AC_BK), then it would + * receive a less-then-best-effort service and contend with Low-Priority + * Data (as discussed in Section 4.2.10). 
+ * + * As such, since there is no directly corresponding fit for the High- + * Throughout Data service class within the [IEEE.802.11-2016] model, it + * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby + * admitting it to the Best Effort Access Category (AC_BE). + * + * Note: The above text is from RFC8325 which is describing the mapping + * between DSCP and 802.11 User Priority (UP) values. The mapping + * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but + * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q + * Traffic Types BE and BK. + */ + case DSCP_AF11: + case DSCP_AF12: + case DSCP_AF13: + return IEEE8021Q_TT_BE; + /* Comment from RFC8325: + * RFC3662 and RFC4594 both recommend Low-Priority Data be marked + * with DSCP CS1. The Low-Priority Data service class loosely + * corresponds to the [IEEE.802.11-2016] Background Access Category + */ + case DSCP_CS1: + return IEEE8021Q_TT_BK; + case DSCP_CS2: + case DSCP_AF21: + case DSCP_AF22: + case DSCP_AF23: + return IEEE8021Q_TT_EE; + case DSCP_CS3: + case DSCP_AF31: + case DSCP_AF32: + case DSCP_AF33: + return IEEE8021Q_TT_CA; + case DSCP_CS4: + case DSCP_AF41: + case DSCP_AF42: + case DSCP_AF43: + return IEEE8021Q_TT_VI; + case DSCP_CS5: + case DSCP_EF: + case DSCP_VOICE_ADMIT: + return IEEE8021Q_TT_VO; + case DSCP_CS6: + return IEEE8021Q_TT_IC; + case DSCP_CS7: + return IEEE8021Q_TT_NC; + } + + return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp); +} +EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index c469d1c4db5d..8ec35194bfcb 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -33,7 +33,7 @@ static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); -static unsigned char default_operstate(const struct net_device *dev) +static unsigned int default_operstate(const struct net_device *dev) { if (netif_testing(dev)) return IF_OPER_TESTING; @@ -62,16 +62,13 @@ static unsigned char default_operstate(const struct net_device *dev) return IF_OPER_UP; } - static void rfc2863_policy(struct net_device *dev) { - unsigned char operstate = default_operstate(dev); + unsigned int operstate = default_operstate(dev); - if (operstate == dev->operstate) + if (operstate == READ_ONCE(dev->operstate)) return; - write_lock(&dev_base_lock); - switch(dev->link_mode) { case IF_LINK_MODE_TESTING: if (operstate == IF_OPER_UP) @@ -87,9 +84,7 @@ static void rfc2863_policy(struct net_device *dev) break; } - dev->operstate = operstate; - - write_unlock(&dev_base_lock); + WRITE_ONCE(dev->operstate, operstate); } @@ -192,7 +187,10 @@ static void __linkwatch_run_queue(int urgent_only) #define MAX_DO_DEV_PER_LOOP 100 int do_dev = MAX_DO_DEV_PER_LOOP; - struct net_device *dev; + /* Use a local list here since we add non-urgent + * events back to the global one when called with + * urgent_only=1. 
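[Illustrative aside, not part of the patch] A hedged, standalone illustration of the DSCP to traffic-type switch added above. The codepoint values are the standard IETF assignments (CS0=0, CS1=8, CS2=16, ..., EF=46, VOICE-ADMIT=44); the fallback simply returns -1 where the kernel helper would instead use SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT().

#include <stdio.h>

enum tt { TT_BK, TT_BE, TT_EE, TT_CA, TT_VI, TT_VO, TT_IC, TT_NC };

static int dscp_to_tt(unsigned int dscp)
{
	switch (dscp) {
	case 0:  case 10: case 12: case 14: return TT_BE; /* CS0, AF11-AF13 */
	case 8:                             return TT_BK; /* CS1            */
	case 16: case 18: case 20: case 22: return TT_EE; /* CS2, AF21-AF23 */
	case 24: case 26: case 28: case 30: return TT_CA; /* CS3, AF31-AF33 */
	case 32: case 34: case 36: case 38: return TT_VI; /* CS4, AF41-AF43 */
	case 40: case 44: case 46:          return TT_VO; /* CS5, VOICE-ADMIT, EF */
	case 48:                            return TT_IC; /* CS6            */
	case 56:                            return TT_NC; /* CS7            */
	}
	return -1;	/* kernel helper falls back to a default mapping here */
}

int main(void)
{
	printf("DSCP EF (46)  -> traffic type %d (VO)\n", dscp_to_tt(46));
	printf("DSCP CS1 (8)  -> traffic type %d (BK)\n", dscp_to_tt(8));
	return 0;
}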
+ */ LIST_HEAD(wrk); /* Give urgent case more budget */ @@ -218,6 +216,7 @@ static void __linkwatch_run_queue(int urgent_only) list_splice_init(&lweventlist, &wrk); while (!list_empty(&wrk) && do_dev > 0) { + struct net_device *dev; dev = list_first_entry(&wrk, struct net_device, link_watch_list); list_del_init(&dev->link_watch_list); @@ -245,7 +244,7 @@ static void __linkwatch_run_queue(int urgent_only) spin_unlock_irq(&lweventlist_lock); } -void linkwatch_forget_dev(struct net_device *dev) +void linkwatch_sync_dev(struct net_device *dev) { unsigned long flags; int clean = 0; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index df81c1f0a570..45fd88405b6b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -253,9 +253,11 @@ static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - READ_ONCE(tbl->gc_thresh2); + u64 tmax = ktime_get_ns() + NSEC_PER_MSEC; unsigned long tref = jiffies - 5 * HZ; struct neighbour *n, *tmp; int shrunk = 0; + int loop = 0; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); @@ -278,11 +280,16 @@ static int neigh_forced_gc(struct neigh_table *tbl) shrunk++; if (shrunk >= max_clean) break; + if (++loop == 16) { + if (ktime_get_ns() > tmax) + goto unlock; + loop = 0; + } } } WRITE_ONCE(tbl->last_flush, jiffies); - +unlock: write_unlock_bh(&tbl->lock); return shrunk; @@ -727,7 +734,9 @@ out_neigh_release: struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { - return ___neigh_create(tbl, pkey, dev, 0, false, want_ref); + bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); + + return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); } EXPORT_SYMBOL(__neigh_create); @@ -1762,7 +1771,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms) static struct lock_class_key neigh_table_proxy_queue_class; -static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; +static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { @@ -1819,13 +1828,19 @@ void neigh_table_init(int index, struct neigh_table *tbl) tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; - neigh_tables[index] = tbl; + rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); +/* + * Only called from ndisc_cleanup(), which means this is dead code + * because we no longer can unload IPv6 module. + */ int neigh_table_clear(int index, struct neigh_table *tbl) { - neigh_tables[index] = NULL; + RCU_INIT_POINTER(neigh_tables[index], NULL); + synchronize_rcu(); + /* It is not clean... 
Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); @@ -1857,10 +1872,10 @@ static struct neigh_table *neigh_find_table(int family) switch (family) { case AF_INET: - tbl = neigh_tables[NEIGH_ARP_TABLE]; + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); break; case AF_INET6: - tbl = neigh_tables[NEIGH_ND_TABLE]; + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); break; } @@ -2324,7 +2339,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, ndtmsg = nlmsg_data(nlh); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { - tbl = neigh_tables[tidx]; + tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) @@ -2512,7 +2527,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; - tbl = neigh_tables[tidx]; + tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; @@ -2667,7 +2682,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) if (!master_idx) return false; - master = dev ? netdev_master_upper_dev_get(dev) : NULL; + master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another * invalid value for ifindex to denote "no master". @@ -2700,7 +2715,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, { struct net *net = sock_net(skb->sk); struct neighbour *n; - int rc, h, s_h = cb->args[1]; + int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; unsigned int flags = NLM_F_MULTI; @@ -2708,7 +2723,6 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; - rcu_read_lock(); nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { @@ -2722,23 +2736,19 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; - if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - flags) < 0) { - rc = -1; + err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags); + if (err < 0) goto out; - } next: idx++; } } - rc = skb->len; out: - rcu_read_unlock(); cb->args[1] = h; cb->args[2] = idx; - return rc; + return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, @@ -2747,7 +2757,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); - int rc, h, s_h = cb->args[3]; + int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; unsigned int flags = NLM_F_MULTI; @@ -2765,11 +2775,11 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; - if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, flags, tbl) < 0) { + err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags, tbl); + if (err < 0) { read_unlock_bh(&tbl->lock); - rc = -1; goto out; } next: @@ -2778,12 +2788,10 @@ 
static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, } read_unlock_bh(&tbl->lock); - rc = skb->len; out: cb->args[3] = h; cb->args[4] = idx; - return rc; - + return err; } static int neigh_valid_dump_req(const struct nlmsghdr *nlh, @@ -2871,8 +2879,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; + rcu_read_lock(); for (t = 0; t < NEIGH_NR_TABLES; t++) { - tbl = neigh_tables[t]; + tbl = rcu_dereference(neigh_tables[t]); if (!tbl) continue; @@ -2888,9 +2897,10 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) break; } + rcu_read_unlock(); cb->args[0] = t; - return skb->len; + return err; } static int neigh_valid_get_req(const struct nlmsghdr *nlh, @@ -3136,14 +3146,15 @@ int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; + if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; - tbl = neigh_tables[index]; - if (!tbl) - goto out; rcu_read_lock(); + tbl = rcu_dereference(neigh_tables[index]); + if (!tbl) + goto out_unlock; if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); @@ -3159,6 +3170,7 @@ int neigh_xmit(int index, struct net_device *dev, goto out_kfree_skb; } err = READ_ONCE(neigh->output)(neigh, skb); +out_unlock: rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { @@ -3721,7 +3733,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; + struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), @@ -3772,7 +3784,6 @@ static struct neigh_sysctl_table { .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, - {}, }, }; @@ -3800,8 +3811,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, if (dev) { dev_name_source = dev->name; /* Terminate the table early */ - memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, - sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; @@ -3882,7 +3891,8 @@ static int __init neigh_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, + RTNL_FLAG_DUMP_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, 0); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 09f7ed1a04e8..fa6d3969734a 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -3,52 +3,22 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/wext.h> +#include <net/hotdata.h> #include "dev.h" -#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) - -#define get_bucket(x) ((x) >> BUCKET_SPACE) -#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) -#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) - -static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) +static void *dev_seq_from_index(struct seq_file *seq, loff_t *pos) { - struct net *net = seq_file_net(seq); + unsigned long ifindex = *pos; struct net_device *dev; - struct hlist_head *h; - unsigned int count = 0, offset 
= get_offset(*pos); - h = &net->dev_index_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, h, index_hlist) { - if (++count == offset) - return dev; + for_each_netdev_dump(seq_file_net(seq), dev, ifindex) { + *pos = dev->ifindex; + return dev; } - - return NULL; -} - -static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) -{ - struct net_device *dev; - unsigned int bucket; - - do { - dev = dev_from_same_bucket(seq, pos); - if (dev) - return dev; - - bucket = get_bucket(*pos) + 1; - *pos = set_bucket_offset(bucket, 1); - } while (bucket < NETDEV_HASHENTRIES); - return NULL; } -/* - * This is invoked by the /proc filesystem handler to display a device - * in detail. - */ static void *dev_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { @@ -56,16 +26,13 @@ static void *dev_seq_start(struct seq_file *seq, loff_t *pos) if (!*pos) return SEQ_START_TOKEN; - if (get_bucket(*pos) >= NETDEV_HASHENTRIES) - return NULL; - - return dev_from_bucket(seq, pos); + return dev_seq_from_index(seq, pos); } static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return dev_from_bucket(seq, pos); + return dev_seq_from_index(seq, pos); } static void dev_seq_stop(struct seq_file *seq, void *v) @@ -177,7 +144,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - sd->processed, sd->dropped, sd->time_squeeze, 0, + sd->processed, atomic_read(&sd->dropped), + sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ sd->received_rps, flow_limit_count, @@ -217,7 +185,7 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos) } } - list_for_each_entry_rcu(pt, &ptype_all, list) { + list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) { if (i == pos) return pt; ++i; @@ -265,13 +233,13 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) } } - nxt = ptype_all.next; + nxt = net_hotdata.ptype_all.next; goto ptype_all; } if (pt->type == htons(ETH_P_ALL)) { ptype_all: - if (nxt != &ptype_all) + if (nxt != &net_hotdata.ptype_all) goto found; hash = 0; nxt = ptype_base[0].next; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fccaa5bac0ed..4c27a360c294 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -24,6 +24,7 @@ #include <linux/of_net.h> #include <linux/cpu.h> #include <net/netdev_rx_queue.h> +#include <net/rps.h> #include "dev.h" #include "net-sysfs.h" @@ -34,10 +35,10 @@ static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; -/* Caller holds RTNL or dev_base_lock */ +/* Caller holds RTNL or RCU */ static inline int dev_isalive(const struct net_device *dev) { - return dev->reg_state <= NETREG_REGISTERED; + return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; } /* use same locking rules as GIF* ioctl's */ @@ -48,10 +49,10 @@ static ssize_t netdev_show(const struct device *dev, struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); + rcu_read_lock(); if (dev_isalive(ndev)) ret = (*format)(ndev, buf); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } @@ -60,7 +61,7 @@ static ssize_t netdev_show(const struct device *dev, #define NETDEVICE_SHOW(field, format_string) \ static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sysfs_emit(buf, format_string, dev->field); \ + return sysfs_emit(buf, format_string, 
READ_ONCE(dev->field)); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -125,7 +126,7 @@ static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { - return sysfs_emit(buf, fmt_dec, dev->name_assign_type); + return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type)); } static ssize_t name_assign_type_show(struct device *dev, @@ -135,24 +136,28 @@ static ssize_t name_assign_type_show(struct device *dev, struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (ndev->name_assign_type != NET_NAME_UNKNOWN) + if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN) ret = netdev_show(dev, attr, buf, format_name_assign_type); return ret; } static DEVICE_ATTR_RO(name_assign_type); -/* use same locking rules as GIFHWADDR ioctl's */ +/* use same locking rules as GIFHWADDR ioctl's (dev_get_mac_address()) */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); + down_read(&dev_addr_sem); + + rcu_read_lock(); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); - read_unlock(&dev_base_lock); + rcu_read_unlock(); + + up_read(&dev_addr_sem); return ret; } static DEVICE_ATTR_RO(address); @@ -161,10 +166,13 @@ static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); + int ret = -EINVAL; + rcu_read_lock(); if (dev_isalive(ndev)) - return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); - return -EINVAL; + ret = sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); + rcu_read_unlock(); + return ret; } static DEVICE_ATTR_RO(broadcast); @@ -193,11 +201,22 @@ static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; - if (netif_running(netdev)) - return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); + if (!rtnl_trylock()) + return restart_syscall(); - return -EINVAL; + if (netif_running(netdev)) { + /* Synchronize carrier state with link watch, + * see also rtnl_getlink(). 
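[Illustrative aside, not part of the patch] Several of the sysfs readers above drop dev_base_lock in favour of RCU plus READ_ONCE()/WRITE_ONCE() annotated accesses. The minimal userspace sketch below shows only that publish/read pattern; the macros are simplified stand-ins for the kernel's <linux/compiler.h> versions, and the value 6 for IF_OPER_UP is taken from the uapi operstate enum.

#include <stdio.h>

#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)       (*(volatile __typeof__(x) *)&(x))

static int operstate;	/* stand-in for dev->operstate */

static void link_watch_update(int new_state)
{
	WRITE_ONCE(operstate, new_state);	/* single, non-torn store */
}

static int sysfs_show_operstate(void)
{
	return READ_ONCE(operstate);		/* lockless, non-torn load */
}

int main(void)
{
	link_watch_update(6);			/* 6 == IF_OPER_UP */
	printf("operstate=%d\n", sysfs_show_operstate());
	return 0;
}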
+ */ + linkwatch_sync_dev(netdev); + + ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); + } + rtnl_unlock(); + + return ret; } static DEVICE_ATTR_RW(carrier); @@ -307,11 +326,9 @@ static ssize_t operstate_show(struct device *dev, const struct net_device *netdev = to_net_dev(dev); unsigned char operstate; - read_lock(&dev_base_lock); - operstate = netdev->operstate; + operstate = READ_ONCE(netdev->operstate); if (!netif_running(netdev)) operstate = IF_OPER_DOWN; - read_unlock(&dev_base_lock); if (operstate >= ARRAY_SIZE(operstates)) return -EINVAL; /* should not happen */ @@ -588,13 +605,13 @@ static ssize_t threaded_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (!rtnl_trylock()) - return restart_syscall(); + rcu_read_lock(); if (dev_isalive(netdev)) - ret = sysfs_emit(buf, fmt_dec, netdev->threaded); + ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded)); + + rcu_read_unlock(); - rtnl_unlock(); return ret; } @@ -669,14 +686,14 @@ static ssize_t netstat_show(const struct device *d, WARN_ON(offset > sizeof(struct rtnl_link_stats64) || offset % sizeof(u64) != 0); - read_lock(&dev_base_lock); + rcu_read_lock(); if (dev_isalive(dev)) { struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } @@ -1398,6 +1415,65 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); +static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf) +{ + struct dql *dql = &queue->dql; + + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); +} + +static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, + const char *buf, size_t len) +{ + struct dql *dql = &queue->dql; + unsigned int value; + int err; + + err = kstrtouint(buf, 10, &value); + if (err < 0) + return err; + + value = msecs_to_jiffies(value); + if (value && (value < 4 || value > 4 / 2 * BITS_PER_LONG)) + return -ERANGE; + + if (!dql->stall_thrs && value) + dql->last_reap = jiffies; + /* Force last_reap to be live */ + smp_wmb(); + dql->stall_thrs = value; + + return len; +} + +static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = + __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs); + +static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf) +{ + return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); +} + +static ssize_t bql_set_stall_max(struct netdev_queue *queue, + const char *buf, size_t len) +{ + WRITE_ONCE(queue->dql.stall_max, 0); + return len; +} + +static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init = + __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max); + +static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf) +{ + struct dql *dql = &queue->dql; + + return sysfs_emit(buf, "%lu\n", dql->stall_cnt); +} + +static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = + __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL); + static ssize_t bql_show_inflight(struct netdev_queue *queue, char *buf) { @@ -1436,6 +1512,9 @@ static struct attribute *dql_attrs[] __ro_after_init = { &bql_limit_min_attribute.attr, &bql_hold_time_attribute.attr, &bql_inflight_attribute.attr, + &bql_stall_thrs_attribute.attr, + &bql_stall_cnt_attribute.attr, + 
&bql_stall_max_attribute.attr, NULL }; @@ -1443,6 +1522,9 @@ static const struct attribute_group dql_group = { .name = "byte_queue_limits", .attrs = dql_attrs, }; +#else +/* Fake declaration, all the code using it should be dead */ +extern const struct attribute_group dql_group; #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS @@ -1680,6 +1762,15 @@ static const struct kobj_type netdev_queue_ktype = { .get_ownership = netdev_queue_get_ownership, }; +static bool netdev_uses_bql(const struct net_device *dev) +{ + if (dev->features & NETIF_F_LLTX || + dev->priv_flags & IFF_NO_QUEUE) + return false; + + return IS_ENABLED(CONFIG_BQL); +} + static int netdev_queue_add_kobject(struct net_device *dev, int index) { struct netdev_queue *queue = dev->_tx + index; @@ -1697,11 +1788,11 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; -#ifdef CONFIG_BQL - error = sysfs_create_group(kobj, &dql_group); - if (error) - goto err; -#endif + if (netdev_uses_bql(dev)) { + error = sysfs_create_group(kobj, &dql_group); + if (error) + goto err; + } kobject_uevent(kobj, KOBJ_ADD); return 0; @@ -1722,9 +1813,9 @@ static int tx_queue_change_owner(struct net_device *ndev, int index, if (error) return error; -#ifdef CONFIG_BQL - error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid); -#endif + if (netdev_uses_bql(ndev)) + error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid); + return error; } #endif /* CONFIG_SYSFS */ @@ -1756,9 +1847,10 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) if (!refcount_read(&dev_net(dev)->ns.count)) queue->kobj.uevent_suppress = 1; -#ifdef CONFIG_BQL - sysfs_remove_group(&queue->kobj, &dql_group); -#endif + + if (netdev_uses_bql(dev)) + sysfs_remove_group(&queue->kobj, &dql_group); + kobject_put(&queue->kobj); } @@ -1954,7 +2046,7 @@ static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid) net_ns_get_ownership(net, uid, gid); } -static struct class net_class __ro_after_init = { +static const struct class net_class = { .name = "net", .dev_release = netdev_release, .dev_groups = net_class_groups, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index f4183c4c1ec8..4f7a61688d18 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -69,12 +69,15 @@ DEFINE_COOKIE(net_cookie); static struct net_generic *net_alloc_generic(void) { + unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs); + unsigned int generic_size; struct net_generic *ng; - unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + + generic_size = offsetof(struct net_generic, ptr[gen_ptrs]); ng = kzalloc(generic_size, GFP_KERNEL); if (ng) - ng->s.len = max_gen_ptrs; + ng->s.len = gen_ptrs; return ng; } @@ -318,8 +321,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) { /* Must be called with pernet_ops_rwsem held */ const struct pernet_operations *ops, *saved_ops; - int error = 0; LIST_HEAD(net_exit_list); + LIST_HEAD(dev_kill_list); + int error = 0; refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt"); @@ -358,6 +362,15 @@ out_undo: synchronize_rcu(); ops = saved_ops; + rtnl_lock(); + list_for_each_entry_continue_reverse(ops, &pernet_list, list) { + if (ops->exit_batch_rtnl) + ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + + ops = saved_ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) ops_exit_list(ops, 
&net_exit_list); @@ -372,6 +385,10 @@ out_undo: static int __net_init net_defaults_init_net(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; + /* Limits per socket sk_omem_alloc usage. + * TCP zerocopy regular usage needs 128 KB. + */ + net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; return 0; @@ -569,6 +586,7 @@ static void cleanup_net(struct work_struct *work) struct net *net, *tmp, *last; struct llist_node *net_kill_list; LIST_HEAD(net_exit_list); + LIST_HEAD(dev_kill_list); /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); @@ -607,7 +625,15 @@ static void cleanup_net(struct work_struct *work) * the rcu_barrier() below isn't sufficient alone. * Also the pre_exit() and exit() methods need this barrier. */ - synchronize_rcu(); + synchronize_rcu_expedited(); + + rtnl_lock(); + list_for_each_entry_reverse(ops, &pernet_list, list) { + if (ops->exit_batch_rtnl) + ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); /* Run all of the network namespace exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) @@ -1067,7 +1093,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) end: if (net_cb.fillargs.add_ref) put_net(net_cb.tgt_net); - return err < 0 ? err : skb->len; + return err; } static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, @@ -1099,11 +1125,56 @@ out: rtnl_set_sk_err(net, RTNLGRP_NSID, err); } +#ifdef CONFIG_NET_NS +static void __init netns_ipv4_struct_check(void) +{ + /* TX readonly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_early_retrans); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_tso_win_divisor); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_tso_rtt_log); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_autocorking); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_min_snd_mss); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_notsent_lowat); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_limit_output_bytes); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_min_rtt_wlen); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_wmem); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_ip_fwd_use_pmtu); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33); + + /* TXRX readonly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx, + sysctl_tcp_moderate_rcvbuf); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1); + + /* RX readonly hotpath cache line */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_ip_early_demux); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_early_demux); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_reordering); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_rmem); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18); +} +#endif + void __init net_ns_init(void) { struct net_generic *ng; #ifdef CONFIG_NET_NS 
+ netns_ipv4_struct_check(); net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, SLAB_PANIC|SLAB_ACCOUNT, NULL); @@ -1137,14 +1208,25 @@ void __init net_ns_init(void) rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, - RTNL_FLAG_DOIT_UNLOCKED); + RTNL_FLAG_DOIT_UNLOCKED | + RTNL_FLAG_DUMP_UNLOCKED); } static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) { ops_pre_exit_list(ops, net_exit_list); synchronize_rcu(); + + if (ops->exit_batch_rtnl) { + LIST_HEAD(dev_kill_list); + + rtnl_lock(); + ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + } ops_exit_list(ops, net_exit_list); + ops_free_list(ops, net_exit_list); } @@ -1229,7 +1311,11 @@ static int register_pernet_operations(struct list_head *list, if (error < 0) return error; *ops->id = error; - max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1); + /* This does not require READ_ONCE as writers already hold + * pernet_ops_rwsem. But WRITE_ONCE is needed to protect + * net_alloc_generic. + */ + WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1)); } error = __register_pernet_operations(list, ops); if (error) { diff --git a/net/core/gso_test.c b/net/core/net_test.c index ceb684be4cbf..9c3a590865d2 100644 --- a/net/core/gso_test.c +++ b/net/core/net_test.c @@ -1,6 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include <kunit/test.h> + +/* GSO */ + #include <linux/skbuff.h> static const char hdr[] = "abcdefgh"; @@ -180,18 +183,17 @@ static void gso_test_func(struct kunit *test) } if (tcase->frag_skbs) { - unsigned int total_size = 0, total_true_size = 0, alloc_size = 0; + unsigned int total_size = 0, total_true_size = 0; struct sk_buff *frag_skb, *prev = NULL; - page = alloc_page(GFP_KERNEL); - KUNIT_ASSERT_NOT_NULL(test, page); - page_ref_add(page, tcase->nr_frag_skbs - 1); - for (i = 0; i < tcase->nr_frag_skbs; i++) { unsigned int frag_size; + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + frag_size = tcase->frag_skbs[i]; - frag_skb = build_skb(page_address(page) + alloc_size, + frag_skb = build_skb(page_address(page), frag_size + shinfo_size); KUNIT_ASSERT_NOT_NULL(test, frag_skb); __skb_put(frag_skb, frag_size); @@ -204,11 +206,8 @@ static void gso_test_func(struct kunit *test) total_size += frag_size; total_true_size += frag_skb->truesize; - alloc_size += frag_size + shinfo_size; } - KUNIT_ASSERT_LE(test, alloc_size, PAGE_SIZE); - skb->len += total_size; skb->data_len += total_size; skb->truesize += total_true_size; @@ -229,7 +228,7 @@ static void gso_test_func(struct kunit *test) segs = skb_segment(skb, features); if (IS_ERR(segs)) { - KUNIT_FAIL(test, "segs error %lld", PTR_ERR(segs)); + KUNIT_FAIL(test, "segs error %pe", segs); goto free_gso_skb; } else if (!segs) { KUNIT_FAIL(test, "no segments"); @@ -262,17 +261,127 @@ free_gso_skb: consume_skb(skb); } -static struct kunit_case gso_test_cases[] = { - KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), - {} +/* IP tunnel flags */ + +#include <net/ip_tunnels.h> + +struct ip_tunnel_flags_test { + const char *name; + + const u16 *src_bits; + const u16 *exp_bits; + u8 src_num; + u8 exp_num; + + __be16 exp_val; + bool exp_comp; +}; + +#define IP_TUNNEL_FLAGS_TEST(n, src, comp, eval, exp) { \ + .name = (n), \ + .src_bits = (src), \ + .src_num = ARRAY_SIZE(src), \ + .exp_comp = (comp), \ + .exp_val = (eval), \ + 
.exp_bits = (exp), \ + .exp_num = ARRAY_SIZE(exp), \ +} + +/* These are __be16-compatible and can be compared as is */ +static const u16 ip_tunnel_flags_1[] = { + IP_TUNNEL_KEY_BIT, + IP_TUNNEL_STRICT_BIT, + IP_TUNNEL_ERSPAN_OPT_BIT, +}; + +/* Due to the previous flags design limitation, setting either + * ``IP_TUNNEL_CSUM_BIT`` (on Big Endian) or ``IP_TUNNEL_DONT_FRAGMENT_BIT`` + * (on Little) also sets VTI/ISATAP bit. In the bitmap implementation, they + * correspond to ``BIT(16)``, which is bigger than ``U16_MAX``, but still is + * backward-compatible. + */ +#ifdef __LITTLE_ENDIAN +#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_DONT_FRAGMENT_BIT +#else +#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_CSUM_BIT +#endif + +static const u16 ip_tunnel_flags_2_src[] = { + IP_TUNNEL_CONFLICT_BIT, +}; + +static const u16 ip_tunnel_flags_2_exp[] = { + IP_TUNNEL_CONFLICT_BIT, + IP_TUNNEL_SIT_ISATAP_BIT, +}; + +/* Bits 17 and higher are not compatible with __be16 flags */ +static const u16 ip_tunnel_flags_3_src[] = { + IP_TUNNEL_VXLAN_OPT_BIT, + 17, + 18, + 20, }; -static struct kunit_suite gso_test_suite = { - .name = "net_core_gso", - .test_cases = gso_test_cases, +static const u16 ip_tunnel_flags_3_exp[] = { + IP_TUNNEL_VXLAN_OPT_BIT, }; -kunit_test_suite(gso_test_suite); +static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = { + IP_TUNNEL_FLAGS_TEST("compat", ip_tunnel_flags_1, true, + cpu_to_be16(BIT(IP_TUNNEL_KEY_BIT) | + BIT(IP_TUNNEL_STRICT_BIT) | + BIT(IP_TUNNEL_ERSPAN_OPT_BIT)), + ip_tunnel_flags_1), + IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true, + VTI_ISVTI, ip_tunnel_flags_2_exp), + IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false, + cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)), + ip_tunnel_flags_3_exp), +}; + +static void +ip_tunnel_flags_test_case_to_desc(const struct ip_tunnel_flags_test *t, + char *desc) +{ + strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE); +} +KUNIT_ARRAY_PARAM(ip_tunnel_flags_test, ip_tunnel_flags_test, + ip_tunnel_flags_test_case_to_desc); + +static void ip_tunnel_flags_test_run(struct kunit *test) +{ + const struct ip_tunnel_flags_test *t = test->param_value; + IP_TUNNEL_DECLARE_FLAGS(src) = { }; + IP_TUNNEL_DECLARE_FLAGS(exp) = { }; + IP_TUNNEL_DECLARE_FLAGS(out); + + for (u32 j = 0; j < t->src_num; j++) + __set_bit(t->src_bits[j], src); + for (u32 j = 0; j < t->exp_num; j++) + __set_bit(t->exp_bits[j], exp); + + KUNIT_ASSERT_EQ(test, t->exp_comp, + ip_tunnel_flags_is_be16_compat(src)); + KUNIT_ASSERT_EQ(test, (__force u16)t->exp_val, + (__force u16)ip_tunnel_flags_to_be16(src)); + + ip_tunnel_flags_from_be16(out, t->exp_val); + KUNIT_ASSERT_TRUE(test, __ipt_flag_op(bitmap_equal, exp, out)); +} + +static struct kunit_case net_test_cases[] = { + KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), + KUNIT_CASE_PARAM(ip_tunnel_flags_test_run, + ip_tunnel_flags_test_gen_params), + { }, +}; + +static struct kunit_suite net_test_suite = { + .name = "net_core", + .test_cases = net_test_cases, +}; +kunit_test_suite(net_test_suite); +MODULE_DESCRIPTION("KUnit tests for networking core"); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("KUnit tests for segmentation offload"); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index ea9231378aa6..8350a0afa9ec 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -10,11 +10,70 @@ #include <uapi/linux/netdev.h> +/* Integer value ranges */ +static const struct netlink_range_validation netdev_a_page_pool_id_range = { + .min = 1ULL, + .max = 4294967295ULL, +}; + +static 
const struct netlink_range_validation netdev_a_page_pool_ifindex_range = { + .min = 1ULL, + .max = 2147483647ULL, +}; + +/* Common nested types */ +const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { + [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), + [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), +}; + /* NETDEV_CMD_DEV_GET - do */ static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = { [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; +/* NETDEV_CMD_PAGE_POOL_GET - do */ +#ifdef CONFIG_PAGE_POOL +static const struct nla_policy netdev_page_pool_get_nl_policy[NETDEV_A_PAGE_POOL_ID + 1] = { + [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), +}; +#endif /* CONFIG_PAGE_POOL */ + +/* NETDEV_CMD_PAGE_POOL_STATS_GET - do */ +#ifdef CONFIG_PAGE_POOL_STATS +static const struct nla_policy netdev_page_pool_stats_get_nl_policy[NETDEV_A_PAGE_POOL_STATS_INFO + 1] = { + [NETDEV_A_PAGE_POOL_STATS_INFO] = NLA_POLICY_NESTED(netdev_page_pool_info_nl_policy), +}; +#endif /* CONFIG_PAGE_POOL_STATS */ + +/* NETDEV_CMD_QUEUE_GET - do */ +static const struct nla_policy netdev_queue_get_do_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = { + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, }, +}; + +/* NETDEV_CMD_QUEUE_GET - dump */ +static const struct nla_policy netdev_queue_get_dump_nl_policy[NETDEV_A_QUEUE_IFINDEX + 1] = { + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* NETDEV_CMD_NAPI_GET - do */ +static const struct nla_policy netdev_napi_get_do_nl_policy[NETDEV_A_NAPI_ID + 1] = { + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, +}; + +/* NETDEV_CMD_NAPI_GET - dump */ +static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFINDEX + 1] = { + [NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* NETDEV_CMD_QSTATS_GET - dump */ +static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = { + [NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1), +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -29,10 +88,74 @@ static const struct genl_split_ops netdev_nl_ops[] = { .dumpit = netdev_nl_dev_get_dumpit, .flags = GENL_CMD_CAP_DUMP, }, +#ifdef CONFIG_PAGE_POOL + { + .cmd = NETDEV_CMD_PAGE_POOL_GET, + .doit = netdev_nl_page_pool_get_doit, + .policy = netdev_page_pool_get_nl_policy, + .maxattr = NETDEV_A_PAGE_POOL_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_PAGE_POOL_GET, + .dumpit = netdev_nl_page_pool_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +#endif /* CONFIG_PAGE_POOL */ +#ifdef CONFIG_PAGE_POOL_STATS + { + .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET, + .doit = netdev_nl_page_pool_stats_get_doit, + .policy = netdev_page_pool_stats_get_nl_policy, + .maxattr = NETDEV_A_PAGE_POOL_STATS_INFO, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET, + .dumpit = netdev_nl_page_pool_stats_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +#endif /* CONFIG_PAGE_POOL_STATS */ + { + .cmd = NETDEV_CMD_QUEUE_GET, + .doit = netdev_nl_queue_get_doit, + .policy = netdev_queue_get_do_nl_policy, + .maxattr = NETDEV_A_QUEUE_TYPE, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_QUEUE_GET, + .dumpit = 
netdev_nl_queue_get_dumpit, + .policy = netdev_queue_get_dump_nl_policy, + .maxattr = NETDEV_A_QUEUE_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = NETDEV_CMD_NAPI_GET, + .doit = netdev_nl_napi_get_doit, + .policy = netdev_napi_get_do_nl_policy, + .maxattr = NETDEV_A_NAPI_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_NAPI_GET, + .dumpit = netdev_nl_napi_get_dumpit, + .policy = netdev_napi_get_dump_nl_policy, + .maxattr = NETDEV_A_NAPI_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = NETDEV_CMD_QSTATS_GET, + .dumpit = netdev_nl_qstats_get_dumpit, + .policy = netdev_qstats_get_nl_policy, + .maxattr = NETDEV_A_QSTATS_SCOPE, + .flags = GENL_CMD_CAP_DUMP, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { [NETDEV_NLGRP_MGMT] = { "mgmt", }, + [NETDEV_NLGRP_PAGE_POOL] = { "page-pool", }, }; struct genl_family netdev_nl_family __ro_after_init = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 7b370c073e7d..4db40fd5b4a9 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -11,11 +11,29 @@ #include <uapi/linux/netdev.h> +/* Common nested types */ +extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; + int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, + struct genl_info *info); +int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_queue_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); enum { NETDEV_NLGRP_MGMT, + NETDEV_NLGRP_PAGE_POOL, }; extern struct genl_family netdev_nl_family; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index fe61f85bcf33..1f6ae6379e0f 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -6,13 +6,33 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <net/xdp.h> +#include <net/xdp_sock.h> +#include <net/netdev_rx_queue.h> +#include <net/netdev_queues.h> +#include <net/busy_poll.h> #include "netdev-genl-gen.h" +#include "dev.h" + +struct netdev_nl_dump_ctx { + unsigned long ifindex; + unsigned int rxq_idx; + unsigned int txq_idx; + unsigned int napi_id; +}; + +static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) +{ + NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx); + + return (struct netdev_nl_dump_ctx *)cb->ctx; +} static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { + u64 xsk_features = 0; u64 xdp_rx_meta = 0; void *hdr; @@ -26,11 +46,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC + if (netdev->xsk_tx_metadata_ops) { + if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) + xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; + if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) + xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; 
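[Illustrative aside, not part of the patch] The xsk_features accumulation above follows a common pattern: probe each optional driver hook and set one bit per capability in a u64 that is later emitted as a single netlink attribute. The sketch below mirrors that pattern with made-up flag values and a made-up ops struct; it is not the kernel's xsk_tx_metadata_ops definition.

#include <stdint.h>
#include <stdio.h>

#define XSK_FLAG_TX_TIMESTAMP	(1ULL << 0)	/* placeholder flag values */
#define XSK_FLAG_TX_CHECKSUM	(1ULL << 1)

struct tx_metadata_ops {
	void (*fill_timestamp)(void);
	void (*request_checksum)(void);
};

static uint64_t collect_xsk_features(const struct tx_metadata_ops *ops)
{
	uint64_t features = 0;

	if (ops) {
		if (ops->fill_timestamp)
			features |= XSK_FLAG_TX_TIMESTAMP;
		if (ops->request_checksum)
			features |= XSK_FLAG_TX_CHECKSUM;
	}
	return features;
}

static void dummy_ts(void) { }

int main(void)
{
	struct tx_metadata_ops ops = { .fill_timestamp = dummy_ts };

	printf("xsk features: 0x%llx\n",
	       (unsigned long long)collect_xsk_features(&ops));
	return 0;
}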
+ } + if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, netdev->xdp_features, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, - xdp_rx_meta, NETDEV_A_DEV_PAD)) { + xdp_rx_meta, NETDEV_A_DEV_PAD) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, + xsk_features, NETDEV_A_DEV_PAD)) { genlmsg_cancel(rsp, hdr); return -EINVAL; } @@ -111,22 +140,585 @@ err_free_msg: int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; int err = 0; rtnl_lock(); - for_each_netdev_dump(net, netdev, cb->args[0]) { + for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) break; } rtnl_unlock(); - if (err != -EMSGSIZE) + return err; +} + +static int +netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, + const struct genl_info *info) +{ + void *hdr; + pid_t pid; + + if (WARN_ON_ONCE(!napi->dev)) + return -EINVAL; + if (!(napi->dev->flags & IFF_UP)) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (napi->napi_id >= MIN_NAPI_ID && + nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) + goto nla_put_failure; + + if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) + goto nla_put_failure; + + if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) + goto nla_put_failure; + + if (napi->thread) { + pid = task_pid_nr(napi->thread); + if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) + goto nla_put_failure; + } + + genlmsg_end(rsp, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct napi_struct *napi; + struct sk_buff *rsp; + u32 napi_id; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) + return -EINVAL; + + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + rtnl_lock(); + + napi = napi_by_id(napi_id); + if (napi) + err = netdev_nl_napi_fill_one(rsp, napi, info); + else + err = -EINVAL; + + rtnl_unlock(); + + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + struct napi_struct *napi; + int err = 0; + + if (!(netdev->flags & IFF_UP)) + return err; + + list_for_each_entry(napi, &netdev->napi_list, dev_list) { + if (ctx->napi_id && napi->napi_id >= ctx->napi_id) + continue; + + err = netdev_nl_napi_fill_one(rsp, napi, info); + if (err) + return err; + ctx->napi_id = napi->napi_id; + } + return err; +} + +int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + u32 ifindex = 0; + int err = 0; + + if (info->attrs[NETDEV_A_NAPI_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); + + rtnl_lock(); + if (ifindex) { + netdev = __dev_get_by_index(net, ifindex); + if (netdev) + err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); + else + err = -ENODEV; + } else { + 
for_each_netdev_dump(net, netdev, ctx->ifindex) { + err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); + if (err < 0) + break; + ctx->napi_id = 0; + } + } + rtnl_unlock(); + + return err; +} + +static int +netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, + u32 q_idx, u32 q_type, const struct genl_info *info) +{ + struct netdev_rx_queue *rxq; + struct netdev_queue *txq; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) || + nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) || + nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex)) + goto nla_put_failure; + + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + rxq = __netif_get_rx_queue(netdev, q_idx); + if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, + rxq->napi->napi_id)) + goto nla_put_failure; + break; + case NETDEV_QUEUE_TYPE_TX: + txq = netdev_get_tx_queue(netdev, q_idx); + if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, + txq->napi->napi_id)) + goto nla_put_failure; + } + + genlmsg_end(rsp, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id, + u32 q_type) +{ + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + if (q_id >= netdev->real_num_rx_queues) + return -EINVAL; + return 0; + case NETDEV_QUEUE_TYPE_TX: + if (q_id >= netdev->real_num_tx_queues) + return -EINVAL; + } + return 0; +} + +static int +netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, + u32 q_type, const struct genl_info *info) +{ + int err = 0; + + if (!(netdev->flags & IFF_UP)) + return err; + + err = netdev_nl_queue_validate(netdev, q_idx, q_type); + if (err) return err; - return skb->len; + return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info); +} + +int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + u32 q_id, q_type, ifindex; + struct net_device *netdev; + struct sk_buff *rsp; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX)) + return -EINVAL; + + q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]); + q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]); + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + rtnl_lock(); + + netdev = __dev_get_by_index(genl_info_net(info), ifindex); + if (netdev) + err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); + else + err = -ENODEV; + + rtnl_unlock(); + + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + int err = 0; + int i; + + if (!(netdev->flags & IFF_UP)) + return err; + + for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) { + err = netdev_nl_queue_fill_one(rsp, netdev, i, + NETDEV_QUEUE_TYPE_RX, info); + if (err) + return err; + ctx->rxq_idx = i++; + } + for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) { + err = netdev_nl_queue_fill_one(rsp, netdev, i, + NETDEV_QUEUE_TYPE_TX, info); + if (err) + return err; + ctx->txq_idx = i++; + } + + return err; +} + +int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct 
netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + u32 ifindex = 0; + int err = 0; + + if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); + + rtnl_lock(); + if (ifindex) { + netdev = __dev_get_by_index(net, ifindex); + if (netdev) + err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); + else + err = -ENODEV; + } else { + for_each_netdev_dump(net, netdev, ctx->ifindex) { + err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); + if (err < 0) + break; + ctx->rxq_idx = 0; + ctx->txq_idx = 0; + } + } + rtnl_unlock(); + + return err; +} + +#define NETDEV_STAT_NOT_SET (~0ULL) + +static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size) +{ + const u64 *add = _add; + u64 *sum = _sum; + + while (size) { + if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET) + *sum += *add; + sum++; + add++; + size -= 8; + } +} + +static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value) +{ + if (value == NETDEV_STAT_NOT_SET) + return 0; + return nla_put_uint(rsp, attr_id, value); +} + +static int +netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) +{ + if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits)) + return -EMSGSIZE; + return 0; +} + +static int +netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) +{ + if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake)) + return -EMSGSIZE; + return 0; +} + +static int +netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp, + u32 q_type, int i, 
const struct genl_info *info) +{ + const struct netdev_stat_ops *ops = netdev->stat_ops; + struct netdev_queue_stats_rx rx; + struct netdev_queue_stats_tx tx; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) || + nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) || + nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i)) + goto nla_put_failure; + + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + memset(&rx, 0xff, sizeof(rx)); + ops->get_queue_stats_rx(netdev, i, &rx); + if (!memchr_inv(&rx, 0xff, sizeof(rx))) + goto nla_cancel; + if (netdev_nl_stats_write_rx(rsp, &rx)) + goto nla_put_failure; + break; + case NETDEV_QUEUE_TYPE_TX: + memset(&tx, 0xff, sizeof(tx)); + ops->get_queue_stats_tx(netdev, i, &tx); + if (!memchr_inv(&tx, 0xff, sizeof(tx))) + goto nla_cancel; + if (netdev_nl_stats_write_tx(rsp, &tx)) + goto nla_put_failure; + break; + } + + genlmsg_end(rsp, hdr); + return 0; + +nla_cancel: + genlmsg_cancel(rsp, hdr); + return 0; +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int +netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + const struct netdev_stat_ops *ops = netdev->stat_ops; + int i, err; + + if (!(netdev->flags & IFF_UP)) + return 0; + + i = ctx->rxq_idx; + while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) { + err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX, + i, info); + if (err) + return err; + ctx->rxq_idx = i++; + } + i = ctx->txq_idx; + while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { + err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX, + i, info); + if (err) + return err; + ctx->txq_idx = i++; + } + + ctx->rxq_idx = 0; + ctx->txq_idx = 0; + return 0; +} + +static int +netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info) +{ + struct netdev_queue_stats_rx rx_sum, rx; + struct netdev_queue_stats_tx tx_sum, tx; + const struct netdev_stat_ops *ops; + void *hdr; + int i; + + ops = netdev->stat_ops; + /* Netdev can't guarantee any complete counters */ + if (!ops->get_base_stats) + return 0; + + memset(&rx_sum, 0xff, sizeof(rx_sum)); + memset(&tx_sum, 0xff, sizeof(tx_sum)); + + ops->get_base_stats(netdev, &rx_sum, &tx_sum); + + /* The op was there, but nothing reported, don't bother */ + if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && + !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum))) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) + goto nla_put_failure; + + for (i = 0; i < netdev->real_num_rx_queues; i++) { + memset(&rx, 0xff, sizeof(rx)); + if (ops->get_queue_stats_rx) + ops->get_queue_stats_rx(netdev, i, &rx); + netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx)); + } + for (i = 0; i < netdev->real_num_tx_queues; i++) { + memset(&tx, 0xff, sizeof(tx)); + if (ops->get_queue_stats_tx) + ops->get_queue_stats_tx(netdev, i, &tx); + netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx)); + } + + if (netdev_nl_stats_write_rx(rsp, &rx_sum) || + netdev_nl_stats_write_tx(rsp, &tx_sum)) + goto nla_put_failure; + + genlmsg_end(rsp, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int +netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, + struct sk_buff *skb, const struct genl_info *info, + struct 
netdev_nl_dump_ctx *ctx) +{ + if (!netdev->stat_ops) + return 0; + + switch (scope) { + case 0: + return netdev_nl_stats_by_netdev(netdev, skb, info); + case NETDEV_QSTATS_SCOPE_QUEUE: + return netdev_nl_stats_by_queue(netdev, skb, info, ctx); + } + + return -EINVAL; /* Should not happen, per netlink policy */ +} + +int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + unsigned int ifindex; + unsigned int scope; + int err = 0; + + scope = 0; + if (info->attrs[NETDEV_A_QSTATS_SCOPE]) + scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); + + ifindex = 0; + if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); + + rtnl_lock(); + if (ifindex) { + netdev = __dev_get_by_index(net, ifindex); + if (netdev && netdev->stat_ops) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + } else { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + err = netdev ? -EOPNOTSUPP : -ENODEV; + } + } else { + for_each_netdev_dump(net, netdev, ctx->ifindex) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + if (err < 0) + break; + } + } + rtnl_unlock(); + + return err; } static int netdev_genl_netdevice_event(struct notifier_block *nb, diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 543007f159f9..55bcacf67df3 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -316,7 +316,7 @@ static int netpoll_owner_active(struct net_device *dev) struct napi_struct *napi; list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner == smp_processor_id()) + if (READ_ONCE(napi->poll_owner) == smp_processor_id()) return 1; } return 0; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 5e409b98aba0..8bcc7014a61a 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -5,6 +5,7 @@ * Copyright (C) 2016 Red Hat, Inc. */ +#include <linux/error-injection.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> @@ -23,12 +24,16 @@ #include <trace/events/page_pool.h> +#include "page_pool_priv.h" + #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) -#define BIAS_MAX LONG_MAX +#define BIAS_MAX (LONG_MAX >> 1) #ifdef CONFIG_PAGE_POOL_STATS +static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats); + /* alloc_stat_inc is intended to be used in softirq context */ #define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) /* recycle_stat_inc is safe to use when preemption is possible. */ @@ -69,7 +74,7 @@ static const char pp_stats[][ETH_GSTRING_LEN] = { * is passed to this API which is filled in. The caller can then report * those stats to the user (perhaps via ethtool, debugfs, etc.). 
*/ -bool page_pool_get_stats(struct page_pool *pool, +bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats) { int cpu = 0; @@ -119,9 +124,9 @@ int page_pool_ethtool_stats_get_count(void) } EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); -u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { - struct page_pool_stats *pool_stats = stats; + const struct page_pool_stats *pool_stats = stats; *data++ = pool_stats->alloc_stats.fast; *data++ = pool_stats->alloc_stats.slow; @@ -169,11 +174,15 @@ static void page_pool_producer_unlock(struct page_pool *pool, } static int page_pool_init(struct page_pool *pool, - const struct page_pool_params *params) + const struct page_pool_params *params, + int cpuid) { unsigned int ring_qsize = 1024; /* Default */ - memcpy(&pool->p, params, sizeof(pool->p)); + memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); + memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); + + pool->cpuid = cpuid; /* Validate only known flags were used */ if (pool->p.flags & ~(PP_FLAG_ALL)) @@ -211,14 +220,29 @@ static int page_pool_init(struct page_pool *pool, */ } + pool->has_init_callback = !!pool->slow.init_callback; + #ifdef CONFIG_PAGE_POOL_STATS - pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); - if (!pool->recycle_stats) - return -ENOMEM; + if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) { + pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); + if (!pool->recycle_stats) + return -ENOMEM; + } else { + /* For system page pool instance we use a singular stats object + * instead of allocating a separate percpu variable for each + * (also percpu) page pool instance. + */ + pool->recycle_stats = &pp_system_recycle_stats; + } #endif - if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { +#ifdef CONFIG_PAGE_POOL_STATS + if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) + free_percpu(pool->recycle_stats); +#endif return -ENOMEM; + } atomic_set(&pool->pages_state_release_cnt, 0); @@ -231,11 +255,26 @@ static int page_pool_init(struct page_pool *pool, return 0; } +static void page_pool_uninit(struct page_pool *pool) +{ + ptr_ring_cleanup(&pool->ring, NULL); + + if (pool->p.flags & PP_FLAG_DMA_MAP) + put_device(pool->p.dev); + +#ifdef CONFIG_PAGE_POOL_STATS + if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) + free_percpu(pool->recycle_stats); +#endif +} + /** - * page_pool_create() - create a page pool. + * page_pool_create_percpu() - create a page pool for a given cpu. 
* @params: parameters, see struct page_pool_params + * @cpuid: cpu identifier */ -struct page_pool *page_pool_create(const struct page_pool_params *params) +struct page_pool * +page_pool_create_percpu(const struct page_pool_params *params, int cpuid) { struct page_pool *pool; int err; @@ -244,14 +283,32 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) if (!pool) return ERR_PTR(-ENOMEM); - err = page_pool_init(pool, params); - if (err < 0) { - pr_warn("%s() gave up with errno %d\n", __func__, err); - kfree(pool); - return ERR_PTR(err); - } + err = page_pool_init(pool, params, cpuid); + if (err < 0) + goto err_free; + + err = page_pool_list(pool); + if (err) + goto err_uninit; return pool; + +err_uninit: + page_pool_uninit(pool); +err_free: + pr_warn("%s() gave up with errno %d\n", __func__, err); + kfree(pool); + return ERR_PTR(err); +} +EXPORT_SYMBOL(page_pool_create_percpu); + +/** + * page_pool_create() - create a page pool + * @params: parameters, see struct page_pool_params + */ +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + return page_pool_create_percpu(params, -1); } EXPORT_SYMBOL(page_pool_create); @@ -327,8 +384,8 @@ static struct page *__page_pool_get_cached(struct page_pool *pool) return page; } -static void page_pool_dma_sync_for_device(struct page_pool *pool, - struct page *page, +static void page_pool_dma_sync_for_device(const struct page_pool *pool, + const struct page *page, unsigned int dma_sync_size) { dma_addr_t dma_addr = page_pool_get_dma_addr(page); @@ -384,8 +441,8 @@ static void page_pool_set_pp_info(struct page_pool *pool, * the overhead is negligible. */ page_pool_fragment_page(page, 1); - if (pool->p.init_callback) - pool->p.init_callback(page, pool->p.init_arg); + if (pool->has_init_callback) + pool->slow.init_callback(page, pool->slow.init_arg); } static void page_pool_clear_pp_info(struct page *page) @@ -494,13 +551,14 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) return page; } EXPORT_SYMBOL(page_pool_alloc_pages); +ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL); /* Calculate distance between two u32 values, valid if distance is below 2^(31) * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution */ #define _distance(a, b) (s32)((a) - (b)) -static s32 page_pool_inflight(struct page_pool *pool) +s32 page_pool_inflight(const struct page_pool *pool, bool strict) { u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); @@ -508,27 +566,27 @@ static s32 page_pool_inflight(struct page_pool *pool) inflight = _distance(hold_cnt, release_cnt); - trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); - WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); + if (strict) { + trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); + WARN(inflight < 0, "Negative(%d) inflight packet-pages", + inflight); + } else { + inflight = max(0, inflight); + } return inflight; } -/* Disconnects a page (from a page_pool). API users can have a need - * to disconnect a page (from a page_pool), to allow it to be used as - * a regular page (that will eventually be returned to the normal - * page-allocator via put_page). 
- */ -static void page_pool_return_page(struct page_pool *pool, struct page *page) +static __always_inline +void __page_pool_release_page_dma(struct page_pool *pool, struct page *page) { dma_addr_t dma; - int count; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) /* Always account for inflight pages, even if we didn't * map them */ - goto skip_dma_unmap; + return; dma = page_pool_get_dma_addr(page); @@ -537,7 +595,19 @@ static void page_pool_return_page(struct page_pool *pool, struct page *page) PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr(page, 0); -skip_dma_unmap: +} + +/* Disconnects a page (from a page_pool). API users can have a need + * to disconnect a page (from a page_pool), to allow it to be used as + * a regular page (that will eventually be returned to the normal + * page-allocator via put_page). + */ +void page_pool_return_page(struct page_pool *pool, struct page *page) +{ + int count; + + __page_pool_release_page_dma(pool, page); + page_pool_clear_pp_info(page); /* This may be the last page returned, releasing the pool, so @@ -589,6 +659,11 @@ static bool page_pool_recycle_in_cache(struct page *page, return true; } +static bool __page_pool_page_can_be_recycled(const struct page *page) +{ + return page_ref_count(page) == 1 && !page_is_pfmemalloc(page); +} + /* If the page refcnt == 1, this will try to recycle the page. * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). @@ -610,15 +685,14 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * page is NOT reusable when allocated when system is under * some pressure. (page_is_pfmemalloc) */ - if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { + if (likely(__page_pool_page_can_be_recycled(page))) { /* Read barrier done in page_ref_count / READ_ONCE */ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_softirq() && - page_pool_recycle_in_cache(page, pool)) + if (allow_direct && page_pool_recycle_in_cache(page, pool)) return NULL; /* Page found as candidate for recycling */ @@ -643,9 +717,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, return NULL; } -void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +static bool page_pool_napi_local(const struct page_pool *pool) +{ + const struct napi_struct *napi; + u32 cpuid; + + if (unlikely(!in_softirq())) + return false; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. 
+ */ + cpuid = smp_processor_id(); + if (READ_ONCE(pool->cpuid) == cpuid) + return true; + + napi = READ_ONCE(pool->p.napi); + + return napi && READ_ONCE(napi->list_owner) == cpuid; +} + +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { + if (!allow_direct) + allow_direct = page_pool_napi_local(pool); + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); if (page && !page_pool_recycle_in_ring(pool, page)) { /* Cache full, fallback to free pages */ @@ -653,7 +753,7 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, page_pool_return_page(pool, page); } } -EXPORT_SYMBOL(page_pool_put_defragged_page); +EXPORT_SYMBOL(page_pool_put_unrefed_page); /** * page_pool_put_page_bulk() - release references on multiple pages @@ -674,22 +774,25 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count) { int i, bulk_len = 0; + bool allow_direct; bool in_softirq; + allow_direct = page_pool_napi_local(pool); + for (i = 0; i < count; i++) { struct page *page = virt_to_head_page(data[i]); /* It is not the last user for the page frag case */ - if (!page_pool_is_last_frag(page)) + if (!page_pool_is_last_ref(page)) continue; - page = __page_pool_put_page(pool, page, -1, false); + page = __page_pool_put_page(pool, page, -1, allow_direct); /* Approved for bulk recycling in ptr_ring cache */ if (page) data[bulk_len++] = page; } - if (unlikely(!bulk_len)) + if (!bulk_len) return; /* Bulk producer into ptr_ring page_pool cache */ @@ -722,10 +825,10 @@ static struct page *page_pool_drain_frag(struct page_pool *pool, long drain_count = BIAS_MAX - pool->frag_users; /* Some user is still using the page frag */ - if (likely(page_pool_defrag_page(page, drain_count))) + if (likely(page_pool_unref_page(page, drain_count))) return NULL; - if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { + if (__page_pool_page_can_be_recycled(page)) { if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) page_pool_dma_sync_for_device(pool, page, -1); @@ -743,7 +846,7 @@ static void page_pool_free_frag(struct page_pool *pool) pool->frag_page = NULL; - if (!page || page_pool_defrag_page(page, drain_count)) + if (!page || page_pool_unref_page(page, drain_count)) return; page_pool_return_page(pool, page); @@ -814,14 +917,8 @@ static void __page_pool_destroy(struct page_pool *pool) if (pool->disconnect) pool->disconnect(pool); - ptr_ring_cleanup(&pool->ring, NULL); - - if (pool->p.flags & PP_FLAG_DMA_MAP) - put_device(pool->p.dev); - -#ifdef CONFIG_PAGE_POOL_STATS - free_percpu(pool->recycle_stats); -#endif + page_pool_unlist(pool); + page_pool_uninit(pool); kfree(pool); } @@ -858,7 +955,7 @@ static int page_pool_release(struct page_pool *pool) int inflight; page_pool_scrub(pool); - inflight = page_pool_inflight(pool); + inflight = page_pool_inflight(pool, true); if (!inflight) __page_pool_destroy(pool); @@ -869,18 +966,21 @@ static void page_pool_release_retry(struct work_struct *wq) { struct delayed_work *dwq = to_delayed_work(wq); struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); + void *netdev; int inflight; inflight = page_pool_release(pool); if (!inflight) return; - /* Periodic warning */ - if (time_after_eq(jiffies, pool->defer_warn)) { + /* Periodic warning for page pools the user can't see */ + netdev = READ_ONCE(pool->slow.netdev); + if (time_after_eq(jiffies, pool->defer_warn) && + (!netdev || netdev == NET_PTR_POISON)) { int sec = (s32)((u32)jiffies - 
(u32)pool->defer_start) / HZ; - pr_warn("%s() stalled pool shutdown %d inflight %d sec\n", - __func__, inflight, sec); + pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", + __func__, pool->user.id, inflight, sec); pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; } @@ -889,15 +989,20 @@ static void page_pool_release_retry(struct work_struct *wq) } void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), - struct xdp_mem_info *mem) + const struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; pool->xdp_mem_id = mem->id; } -void page_pool_unlink_napi(struct page_pool *pool) +static void page_pool_disable_direct_recycling(struct page_pool *pool) { + /* Disable direct recycling based on pool->cpuid. + * Paired with READ_ONCE() in page_pool_napi_local(). + */ + WRITE_ONCE(pool->cpuid, -1); + if (!pool->p.napi) return; @@ -909,7 +1014,6 @@ void page_pool_unlink_napi(struct page_pool *pool) WRITE_ONCE(pool->p.napi, NULL); } -EXPORT_SYMBOL(page_pool_unlink_napi); void page_pool_destroy(struct page_pool *pool) { @@ -919,12 +1023,13 @@ void page_pool_destroy(struct page_pool *pool) if (!page_pool_put(pool)) return; - page_pool_unlink_napi(pool); + page_pool_disable_direct_recycling(pool); page_pool_free_frag(pool); if (!page_pool_release(pool)) return; + page_pool_detached(pool); pool->defer_start = jiffies; pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h new file mode 100644 index 000000000000..90665d40f1eb --- /dev/null +++ b/net/core/page_pool_priv.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __PAGE_POOL_PRIV_H +#define __PAGE_POOL_PRIV_H + +s32 page_pool_inflight(const struct page_pool *pool, bool strict); + +int page_pool_list(struct page_pool *pool); +void page_pool_detached(struct page_pool *pool); +void page_pool_unlist(struct page_pool *pool); + +#endif diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c new file mode 100644 index 000000000000..3a3277ba167b --- /dev/null +++ b/net/core/page_pool_user.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/xarray.h> +#include <net/net_debug.h> +#include <net/page_pool/types.h> +#include <net/page_pool/helpers.h> +#include <net/sock.h> + +#include "page_pool_priv.h" +#include "netdev-genl-gen.h" + +static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); +/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user. + * Ordering: inside rtnl_lock + */ +static DEFINE_MUTEX(page_pools_lock); + +/* Page pools are only reachable from user space (via netlink) if they are + * linked to a netdev at creation time. 
Following page pool "visibility" + * states are possible: + * - normal + * - user.list: linked to real netdev, netdev: real netdev + * - orphaned - real netdev has disappeared + * - user.list: linked to lo, netdev: lo + * - invisible - either (a) created without netdev linking, (b) unlisted due + * to error, or (c) the entire namespace which owned this pool disappeared + * - user.list: unhashed, netdev: unknown + */ + +typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info); + +static int +netdev_nl_page_pool_get_do(struct genl_info *info, u32 id, pp_nl_fill_cb fill) +{ + struct page_pool *pool; + struct sk_buff *rsp; + int err; + + mutex_lock(&page_pools_lock); + pool = xa_load(&page_pools, id); + if (!pool || hlist_unhashed(&pool->user.list) || + !net_eq(dev_net(pool->slow.netdev), genl_info_net(info))) { + err = -ENOENT; + goto err_unlock; + } + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) { + err = -ENOMEM; + goto err_unlock; + } + + err = fill(rsp, pool, info); + if (err) + goto err_free_msg; + + mutex_unlock(&page_pools_lock); + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); +err_unlock: + mutex_unlock(&page_pools_lock); + return err; +} + +struct page_pool_dump_cb { + unsigned long ifindex; + u32 pp_id; +}; + +static int +netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb, + pp_nl_fill_cb fill) +{ + struct page_pool_dump_cb *state = (void *)cb->ctx; + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + struct page_pool *pool; + int err = 0; + + rtnl_lock(); + mutex_lock(&page_pools_lock); + for_each_netdev_dump(net, netdev, state->ifindex) { + hlist_for_each_entry(pool, &netdev->page_pools, user.list) { + if (state->pp_id && state->pp_id < pool->user.id) + continue; + + state->pp_id = pool->user.id; + err = fill(skb, pool, info); + if (err) + goto out; + } + + state->pp_id = 0; + } +out: + mutex_unlock(&page_pools_lock); + rtnl_unlock(); + + return err; +} + +static int +page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info) +{ +#ifdef CONFIG_PAGE_POOL_STATS + struct page_pool_stats stats = {}; + struct nlattr *nest; + void *hdr; + + if (!page_pool_get_stats(pool, &stats)) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + nest = nla_nest_start(rsp, NETDEV_A_PAGE_POOL_STATS_INFO); + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id) || + (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && + nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, + pool->slow.netdev->ifindex))) + goto err_cancel_nest; + + nla_nest_end(rsp, nest); + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST, + stats.alloc_stats.fast) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW, + stats.alloc_stats.slow) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER, + stats.alloc_stats.slow_high_order) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY, + stats.alloc_stats.empty) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL, + stats.alloc_stats.refill) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE, + stats.alloc_stats.waive) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED, + stats.recycle_stats.cached) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL, + stats.recycle_stats.cache_full) || + nla_put_uint(rsp, 
NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING, + stats.recycle_stats.ring) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL, + stats.recycle_stats.ring_full) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT, + stats.recycle_stats.released_refcnt)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + + return 0; +err_cancel_nest: + nla_nest_cancel(rsp, nest); +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +#else + GENL_SET_ERR_MSG(info, "kernel built without CONFIG_PAGE_POOL_STATS"); + return -EOPNOTSUPP; +#endif +} + +int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *tb[ARRAY_SIZE(netdev_page_pool_info_nl_policy)]; + struct nlattr *nest; + int err; + u32 id; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_STATS_INFO)) + return -EINVAL; + + nest = info->attrs[NETDEV_A_PAGE_POOL_STATS_INFO]; + err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, + netdev_page_pool_info_nl_policy, + info->extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, NETDEV_A_PAGE_POOL_ID)) + return -EINVAL; + if (tb[NETDEV_A_PAGE_POOL_IFINDEX]) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NETDEV_A_PAGE_POOL_IFINDEX], + "selecting by ifindex not supported"); + return -EINVAL; + } + + id = nla_get_uint(tb[NETDEV_A_PAGE_POOL_ID]); + + return netdev_nl_page_pool_get_do(info, id, page_pool_nl_stats_fill); +} + +int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill); +} + +static int +page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info) +{ + size_t inflight, refsz; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id)) + goto err_cancel; + + if (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && + nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, + pool->slow.netdev->ifindex)) + goto err_cancel; + if (pool->user.napi_id && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id)) + goto err_cancel; + + inflight = page_pool_inflight(pool, false); + refsz = PAGE_SIZE << pool->p.order; + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT, inflight) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, + inflight * refsz)) + goto err_cancel; + if (pool->user.detach_time && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME, + pool->user.detach_time)) + goto err_cancel; + + genlmsg_end(rsp, hdr); + + return 0; +err_cancel: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + struct net *net; + + lockdep_assert_held(&page_pools_lock); + + /* 'invisible' page pools don't matter */ + if (hlist_unhashed(&pool->user.list)) + return; + net = dev_net(pool->slow.netdev); + + if (!genl_has_listeners(&netdev_nl_family, net, NETDEV_NLGRP_PAGE_POOL)) + return; + + genl_info_init_ntf(&info, &netdev_nl_family, cmd); + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + if (page_pool_nl_fill(ntf, pool, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&netdev_nl_family, net, ntf, + 0, NETDEV_NLGRP_PAGE_POOL, GFP_KERNEL); +} + +int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + u32 id; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_ID)) + 
return -EINVAL; + + id = nla_get_uint(info->attrs[NETDEV_A_PAGE_POOL_ID]); + + return netdev_nl_page_pool_get_do(info, id, page_pool_nl_fill); +} + +int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_fill); +} + +int page_pool_list(struct page_pool *pool) +{ + static u32 id_alloc_next; + int err; + + mutex_lock(&page_pools_lock); + err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b, + &id_alloc_next, GFP_KERNEL); + if (err < 0) + goto err_unlock; + + INIT_HLIST_NODE(&pool->user.list); + if (pool->slow.netdev) { + hlist_add_head(&pool->user.list, + &pool->slow.netdev->page_pools); + pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0; + + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); + } + + mutex_unlock(&page_pools_lock); + return 0; + +err_unlock: + mutex_unlock(&page_pools_lock); + return err; +} + +void page_pool_detached(struct page_pool *pool) +{ + mutex_lock(&page_pools_lock); + pool->user.detach_time = ktime_get_boottime_seconds(); + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); + mutex_unlock(&page_pools_lock); +} + +void page_pool_unlist(struct page_pool *pool) +{ + mutex_lock(&page_pools_lock); + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF); + xa_erase(&page_pools, pool->user.id); + if (!hlist_unhashed(&pool->user.list)) + hlist_del(&pool->user.list); + mutex_unlock(&page_pools_lock); +} + +static void page_pool_unreg_netdev_wipe(struct net_device *netdev) +{ + struct page_pool *pool; + struct hlist_node *n; + + mutex_lock(&page_pools_lock); + hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) { + hlist_del_init(&pool->user.list); + pool->slow.netdev = NET_PTR_POISON; + } + mutex_unlock(&page_pools_lock); +} + +static void page_pool_unreg_netdev(struct net_device *netdev) +{ + struct page_pool *pool, *last; + struct net_device *lo; + + lo = dev_net(netdev)->loopback_dev; + + mutex_lock(&page_pools_lock); + last = NULL; + hlist_for_each_entry(pool, &netdev->page_pools, user.list) { + pool->slow.netdev = lo; + netdev_nl_page_pool_event(pool, + NETDEV_CMD_PAGE_POOL_CHANGE_NTF); + last = pool; + } + if (last) + hlist_splice_init(&netdev->page_pools, &last->user.list, + &lo->page_pools); + mutex_unlock(&page_pools_lock); +} + +static int +page_pool_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + + if (hlist_empty(&netdev->page_pools)) + return NOTIFY_OK; + + if (netdev->ifindex != LOOPBACK_IFINDEX) + page_pool_unreg_netdev(netdev); + else + page_pool_unreg_netdev_wipe(netdev); + return NOTIFY_OK; +} + +static struct notifier_block page_pool_netdevice_nb = { + .notifier_call = page_pool_netdevice_event, +}; + +static int __init page_pool_user_init(void) +{ + return register_netdevice_notifier(&page_pool_netdevice_nb); +} + +subsys_initcall(page_pool_user_init); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 57cea67b7562..ea55a758a475 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3669,10 +3669,8 @@ static int pktgen_thread_worker(void *arg) if (unlikely(!pkt_dev && t->control == 0)) { if (t->net->pktgen_exiting) break; - wait_event_interruptible_timeout(t->queue, - t->control != 0, - HZ/10); - try_to_freeze(); + wait_event_freezable_timeout(t->queue, + t->control != 0, HZ / 10); continue; } diff --git 
a/net/core/request_sock.c b/net/core/request_sock.c index f35c2e998406..63de5c635842 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -33,9 +33,6 @@ void reqsk_queue_alloc(struct request_sock_queue *queue) { - spin_lock_init(&queue->rskq_lock); - - spin_lock_init(&queue->fastopenq.lock); queue->fastopenq.rskq_rst_head = NULL; queue->fastopenq.rskq_rst_tail = NULL; queue->fastopenq.qlen = 0; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e8431c6c8490..b86b0a87367d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -342,8 +342,7 @@ int rtnl_unregister(int protocol, int msgtype) return -ENOENT; } - link = rtnl_dereference(tab[msgindex]); - RCU_INIT_POINTER(tab[msgindex], NULL); + link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); rtnl_unlock(); kfree_rcu(link, rcu); @@ -368,18 +367,13 @@ void rtnl_unregister_all(int protocol) BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); - tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL); if (!tab) { rtnl_unlock(); return; } - RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { - link = rtnl_dereference(tab[msgindex]); - if (!link) - continue; - - RCU_INIT_POINTER(tab[msgindex], NULL); + link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); kfree_rcu(link, rcu); } rtnl_unlock(); @@ -489,24 +483,15 @@ EXPORT_SYMBOL_GPL(__rtnl_link_unregister); */ static void rtnl_lock_unregistering_all(void) { - struct net *net; - bool unregistering; DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { - unregistering = false; rtnl_lock(); /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. 
*/ - for_each_net(net) { - if (atomic_read(&net->dev_unreg_count) > 0) { - unregistering = true; - break; - } - } - if (!unregistering) + if (!atomic_read(&dev_unreg_count)) break; __rtnl_unlock(); @@ -857,9 +842,22 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); +void netdev_set_operstate(struct net_device *dev, int newstate) +{ + unsigned int old = READ_ONCE(dev->operstate); + + do { + if (old == newstate) + return; + } while (!try_cmpxchg(&dev->operstate, &old, newstate)); + + netdev_state_change(dev); +} +EXPORT_SYMBOL(netdev_set_operstate); + static void set_operstate(struct net_device *dev, unsigned char transition) { - unsigned char operstate = dev->operstate; + unsigned char operstate = READ_ONCE(dev->operstate); switch (transition) { case IF_OPER_UP: @@ -881,12 +879,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition) break; } - if (dev->operstate != operstate) { - write_lock(&dev_base_lock); - dev->operstate = operstate; - write_unlock(&dev_base_lock); - netdev_state_change(dev); - } + netdev_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) @@ -1026,22 +1019,25 @@ static size_t rtnl_xdp_size(void) static size_t rtnl_prop_list_size(const struct net_device *dev) { struct netdev_name_node *name_node; - size_t size; + unsigned int cnt = 0; - if (list_empty(&dev->name_node->list)) + rcu_read_lock(); + list_for_each_entry_rcu(name_node, &dev->name_node->list, list) + cnt++; + rcu_read_unlock(); + + if (!cnt) return 0; - size = nla_total_size(0); - list_for_each_entry(name_node, &dev->name_node->list, list) - size += nla_total_size(ALTIFNAMSIZ); - return size; + + return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ); } static size_t rtnl_proto_down_size(const struct net_device *dev) { size_t size = nla_total_size(1); - if (dev->proto_down_reason) - size += nla_total_size(0) + nla_total_size(4); + /* Assume dev->proto_down_reason is not zero. 
*/ + size += nla_total_size(0) + nla_total_size(4); return size; } @@ -1060,7 +1056,7 @@ static size_t rtnl_dpll_pin_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */ - size += dpll_msg_pin_handle_size(netdev_dpll_pin(dev)); + size += dpll_netdev_pin_handle_size(dev); return size; } @@ -1459,17 +1455,18 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, return 0; } -static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) +static int rtnl_fill_link_ifmap(struct sk_buff *skb, + const struct net_device *dev) { struct rtnl_link_ifmap map; memset(&map, 0, sizeof(map)); - map.mem_start = dev->mem_start; - map.mem_end = dev->mem_end; - map.base_addr = dev->base_addr; - map.irq = dev->irq; - map.dma = dev->dma; - map.port = dev->if_port; + map.mem_start = READ_ONCE(dev->mem_start); + map.mem_end = READ_ONCE(dev->mem_end); + map.base_addr = READ_ONCE(dev->base_addr); + map.irq = READ_ONCE(dev->irq); + map.dma = READ_ONCE(dev->dma); + map.port = READ_ONCE(dev->if_port); if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; @@ -1480,13 +1477,15 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) static u32 rtnl_xdp_prog_skb(struct net_device *dev) { const struct bpf_prog *generic_xdp_prog; + u32 res = 0; - ASSERT_RTNL(); + rcu_read_lock(); + generic_xdp_prog = rcu_dereference(dev->xdp_prog); + if (generic_xdp_prog) + res = generic_xdp_prog->aux->id; + rcu_read_unlock(); - generic_xdp_prog = rtnl_dereference(dev->xdp_prog); - if (!generic_xdp_prog) - return 0; - return generic_xdp_prog->aux->id; + return res; } static u32 rtnl_xdp_prog_drv(struct net_device *dev) @@ -1606,7 +1605,8 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) - ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); + ret = nla_put_u32(skb, IFLA_MASTER, + READ_ONCE(upper_dev->ifindex)); rcu_read_unlock(); return ret; @@ -1615,10 +1615,10 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, bool force) { - int ifindex = dev_get_iflink(dev); + int iflink = dev_get_iflink(dev); - if (force || dev->ifindex != ifindex) - return nla_put_u32(skb, IFLA_LINK, ifindex); + if (force || READ_ONCE(dev->ifindex) != iflink) + return nla_put_u32(skb, IFLA_LINK, iflink); return 0; } @@ -1702,7 +1702,7 @@ static int rtnl_fill_alt_ifnames(struct sk_buff *skb, struct netdev_name_node *name_node; int count = 0; - list_for_each_entry(name_node, &dev->name_node->list, list) { + list_for_each_entry_rcu(name_node, &dev->name_node->list, list) { if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) return -EMSGSIZE; count++; @@ -1710,6 +1710,7 @@ static int rtnl_fill_alt_ifnames(struct sk_buff *skb, return count; } +/* RCU protected. 
*/ static int rtnl_fill_prop_list(struct sk_buff *skb, const struct net_device *dev) { @@ -1738,10 +1739,10 @@ static int rtnl_fill_proto_down(struct sk_buff *skb, struct nlattr *pr; u32 preason; - if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down))) goto nla_put_failure; - preason = dev->proto_down_reason; + preason = READ_ONCE(dev->proto_down_reason); if (!preason) return 0; @@ -1795,7 +1796,7 @@ static int rtnl_fill_dpll_pin(struct sk_buff *skb, if (!dpll_pin_nest) return -EMSGSIZE; - ret = dpll_msg_add_pin_handle(skb, netdev_dpll_pin(dev)); + ret = dpll_netdev_add_pin_handle(skb, dev); if (ret < 0) goto nest_cancel; @@ -1814,6 +1815,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, u32 event, int *new_nsid, int new_ifindex, int tgt_netnsid, gfp_t gfp) { + char devname[IFNAMSIZ]; struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct Qdisc *qdisc; @@ -1826,41 +1828,51 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; - ifm->ifi_type = dev->type; - ifm->ifi_index = dev->ifindex; + ifm->ifi_type = READ_ONCE(dev->type); + ifm->ifi_index = READ_ONCE(dev->ifindex); ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; - qdisc = rtnl_dereference(dev->qdisc); - if (nla_put_string(skb, IFLA_IFNAME, dev->name) || - nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || + netdev_copy_name(dev, devname); + if (nla_put_string(skb, IFLA_IFNAME, devname)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) || nla_put_u8(skb, IFLA_OPERSTATE, - netif_running(dev) ? dev->operstate : IF_OPER_DOWN) || - nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || - nla_put_u32(skb, IFLA_MTU, dev->mtu) || - nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) || - nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || - nla_put_u32(skb, IFLA_GROUP, dev->group) || - nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || - nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || - nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || - nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || - nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || - nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || - nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) || - nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) || - nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || - nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || + netif_running(dev) ? 
READ_ONCE(dev->operstate) : + IF_OPER_DOWN) || + nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || + nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || + nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || + nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || + nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) || + nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) || + nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) || + nla_put_u32(skb, IFLA_NUM_TX_QUEUES, + READ_ONCE(dev->num_tx_queues)) || + nla_put_u32(skb, IFLA_GSO_MAX_SEGS, + READ_ONCE(dev->gso_max_segs)) || + nla_put_u32(skb, IFLA_GSO_MAX_SIZE, + READ_ONCE(dev->gso_max_size)) || + nla_put_u32(skb, IFLA_GRO_MAX_SIZE, + READ_ONCE(dev->gro_max_size)) || + nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, + READ_ONCE(dev->gso_ipv4_max_size)) || + nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, + READ_ONCE(dev->gro_ipv4_max_size)) || + nla_put_u32(skb, IFLA_TSO_MAX_SIZE, + READ_ONCE(dev->tso_max_size)) || + nla_put_u32(skb, IFLA_TSO_MAX_SEGS, + READ_ONCE(dev->tso_max_segs)) || #ifdef CONFIG_RPS - nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || + nla_put_u32(skb, IFLA_NUM_RX_QUEUES, + READ_ONCE(dev->num_rx_queues)) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || - (qdisc && - nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + @@ -1879,9 +1891,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; } - if (rtnl_fill_link_ifmap(skb, dev)) - goto nla_put_failure; - if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast)) @@ -1914,9 +1923,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; } - if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) - goto nla_put_failure; - if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; @@ -1929,12 +1935,18 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; rcu_read_lock(); + if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC)) + goto nla_put_failure_rcu; + qdisc = rcu_dereference(dev->qdisc); + if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) + goto nla_put_failure_rcu; if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; - rcu_read_unlock(); - + if (rtnl_fill_link_ifmap(skb, dev)) + goto nla_put_failure_rcu; if (rtnl_fill_prop_list(skb, dev)) - goto nla_put_failure; + goto nla_put_failure_rcu; + rcu_read_unlock(); if (dev->dev.parent && nla_put_string(skb, IFLA_PARENT_DEV_NAME, @@ -2203,25 +2215,22 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { + const struct rtnl_link_ops *kind_ops = NULL; struct netlink_ext_ack *extack = cb->extack; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct net *tgt_net = net; - int h, s_h; - int idx = 0, s_idx; - struct net_device *dev; - struct hlist_head *head; + unsigned int flags = NLM_F_MULTI; struct nlattr *tb[IFLA_MAX+1]; + struct { + unsigned long ifindex; + } *ctx = (void *)cb->ctx; + struct net *tgt_net = net; u32 ext_filter_mask = 0; - const struct rtnl_link_ops *kind_ops = NULL; - unsigned int flags = NLM_F_MULTI; + struct net_device *dev; int master_idx = 0; int netnsid = -1; int err, i; - s_h = 
cb->args[0]; - s_idx = cb->args[1]; - err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); if (err < 0) { if (cb->strict_check) @@ -2265,36 +2274,18 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) flags |= NLM_F_DUMP_FILTERED; walk_entries: - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &tgt_net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - if (link_dump_filtered(dev, master_idx, kind_ops)) - goto cont; - if (idx < s_idx) - goto cont; - err = rtnl_fill_ifinfo(skb, dev, net, - RTM_NEWLINK, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0, NULL, 0, - netnsid, GFP_KERNEL); - - if (err < 0) { - if (likely(skb->len)) - goto out; - - goto out_err; - } -cont: - idx++; - } + err = 0; + for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { + if (link_dump_filtered(dev, master_idx, kind_ops)) + continue; + err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, 0, flags, + ext_filter_mask, 0, NULL, 0, + netnsid, GFP_KERNEL); + if (err < 0) + break; } -out: - err = skb->len; -out_err: - cb->args[1] = idx; - cb->args[0] = h; cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); if (netnsid >= 0) @@ -2555,7 +2546,7 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { if (nla_type(attr) != IFLA_VF_VLAN_INFO || - nla_len(attr) < NLA_HDRLEN) { + nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) { return -EINVAL; } if (len >= MAX_VLAN_LIST_LEN) @@ -2905,13 +2896,6 @@ static int do_setlink(const struct sk_buff *skb, call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } - if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); - if (err) - goto errout; - status |= DO_SETLINK_MODIFIED; - } - if (ifm->ifi_flags || ifm->ifi_change) { err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), extack); @@ -2919,6 +2903,13 @@ static int do_setlink(const struct sk_buff *skb, goto errout; } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) @@ -2986,11 +2977,9 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); - write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; - dev->link_mode = value; - write_unlock(&dev_base_lock); + WRITE_ONCE(dev->link_mode, value); } if (tb[IFLA_VFINFO_LIST]) { @@ -3849,10 +3838,18 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; err = -ENOBUFS; - nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); + nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask)); if (nskb == NULL) goto out; + /* Synchronize the carrier state so we don't report a state + * that we're not actually going to honour immediately; if + * the driver just did a carrier off->on transition, we can + * only TX if link watch work has run, but without this we'd + * already report carrier on, even if it doesn't work yet. 
+ */ + linkwatch_sync_dev(dev); + err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, @@ -5164,10 +5161,9 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; - struct nlattr *br_spec, *attr = NULL; + struct nlattr *br_spec, *attr, *br_flags_attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; - bool have_flags = false; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; @@ -5185,11 +5181,11 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !have_flags) { + if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; - have_flags = true; + br_flags_attr = attr; flags = nla_get_u16(attr); } @@ -5233,8 +5229,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, } } - if (have_flags) - memcpy(nla_data(attr), &flags, sizeof(flags)); + if (br_flags_attr) + memcpy(nla_data(br_flags_attr), &flags, sizeof(flags)); out: return err; } @@ -5265,15 +5261,14 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { - nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { - if (nla_len(attr) < sizeof(flags)) - return -EINVAL; + nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec, + rem) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; - have_flags = true; - flags = nla_get_u16(attr); - break; - } + have_flags = true; + flags = nla_get_u16(attr); + break; } } @@ -5982,19 +5977,17 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; - int h, s_h, err, s_idx, s_idxattr, s_prividx; struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; - struct hlist_head *head; + struct { + unsigned long ifindex; + int idxattr; + int prividx; + } *ctx = (void *)cb->ctx; struct net_device *dev; - int idx = 0; - - s_h = cb->args[0]; - s_idx = cb->args[1]; - s_idxattr = cb->args[2]; - s_prividx = cb->args[3]; + int err; cb->seq = net->dev_base_seq; @@ -6013,39 +6006,26 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) if (err) return err; - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - flags, &filters, - &s_idxattr, &s_prividx, - extack); - /* If we ran out of room on the first message, - * we're in trouble - */ - WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + for_each_netdev_dump(net, dev, ctx->ifindex) { + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + flags, &filters, + &ctx->idxattr, &ctx->prividx, + extack); + /* If we ran out of room on the first message, + * we're in trouble. 
+ */ + WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); - if (err < 0) - goto out; - s_prividx = 0; - s_idxattr = 0; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } + if (err < 0) + break; + ctx->prividx = 0; + ctx->idxattr = 0; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } -out: - cb->args[3] = s_prividx; - cb->args[2] = s_idxattr; - cb->args[1] = idx; - cb->args[0] = h; - return skb->len; + return err; } void rtnl_offload_xstats_notify(struct net_device *dev) @@ -6408,17 +6388,64 @@ static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, return dev->netdev_ops->ndo_mdb_add(dev, tb, nlh->nlmsg_flags, extack); } +static int rtnl_validate_mdb_entry_del_bulk(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct br_mdb_entry *entry = nla_data(attr); + struct br_mdb_entry zero_entry = {}; + + if (nla_len(attr) != sizeof(struct br_mdb_entry)) { + NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); + return -EINVAL; + } + + if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { + NL_SET_ERR_MSG(extack, "Unknown entry state"); + return -EINVAL; + } + + if (entry->flags) { + NL_SET_ERR_MSG(extack, "Entry flags cannot be set"); + return -EINVAL; + } + + if (entry->vid >= VLAN_N_VID - 1) { + NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); + return -EINVAL; + } + + if (memcmp(&entry->addr, &zero_entry.addr, sizeof(entry->addr))) { + NL_SET_ERR_MSG(extack, "Entry address cannot be set"); + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy mdba_del_bulk_policy[MDBA_SET_ENTRY_MAX + 1] = { + [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + rtnl_validate_mdb_entry_del_bulk, + sizeof(struct br_mdb_entry)), + [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, +}; + static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; struct net *net = sock_net(skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; - err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, - MDBA_SET_ENTRY_MAX, mdba_policy, extack); + if (!del_bulk) + err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, + MDBA_SET_ENTRY_MAX, mdba_policy, + extack); + else + err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, + mdba_del_bulk_policy, extack); if (err) return err; @@ -6439,6 +6466,14 @@ static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } + if (del_bulk) { + if (!dev->netdev_ops->ndo_mdb_del_bulk) { + NL_SET_ERR_MSG(extack, "Device does not support MDB bulk deletion"); + return -EOPNOTSUPP; + } + return dev->netdev_ops->ndo_mdb_del_bulk(dev, tb, extack); + } + if (!dev->netdev_ops->ndo_mdb_del) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; @@ -6493,6 +6528,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, } owner = link->owner; dumpit = link->dumpit; + flags = link->flags; if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); @@ -6510,6 +6546,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, .dump = dumpit, .min_dump_alloc = min_dump_alloc, .module = owner, + .flags = flags, }; err = netlink_dump_start(rtnl, skb, nlh, &c); /* netlink_dump_start() will keep a reference on @@ -6684,5 +6721,6 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_NEWMDB, 
rtnl_mdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, + RTNL_FLAG_BULK_DEL_SUPPORTED); } diff --git a/net/core/scm.c b/net/core/scm.c index 880027ecf516..4f6a14babe5a 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -26,6 +26,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/errqueue.h> +#include <linux/io_uring.h> #include <linux/uaccess.h> @@ -35,6 +36,7 @@ #include <net/compat.h> #include <net/scm.h> #include <net/cls_cgroup.h> +#include <net/af_unix.h> /* @@ -84,8 +86,15 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -ENOMEM; *fplp = fpl; fpl->count = 0; + fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; +#if IS_ENABLED(CONFIG_UNIX) + fpl->inflight = false; + fpl->dead = false; + fpl->edges = NULL; + INIT_LIST_HEAD(&fpl->vertices); +#endif } fpp = &fpl->fp[fpl->count]; @@ -103,6 +112,14 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (fd < 0 || !(file = fget_raw(fd))) return -EBADF; + /* don't allow io_uring files */ + if (io_is_uring_fops(file)) { + fput(file); + return -EINVAL; + } + if (unix_get_socket(file)) + fpl->count_unix++; + *fpp++ = file; fpl->count++; } @@ -319,7 +336,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) } for (i = 0; i < fdmax; i++) { - err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); + err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err < 0) break; } @@ -365,8 +382,14 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); + new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); +#if IS_ENABLED(CONFIG_UNIX) + new_fpl->inflight = false; + new_fpl->edges = NULL; + INIT_LIST_HEAD(&new_fpl->vertices); +#endif } return new_fpl; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b157efea5dea..466999a7515e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -51,6 +51,7 @@ #endif #include <linux/string.h> #include <linux/skbuff.h> +#include <linux/skbuff_ref.h> #include <linux/splice.h> #include <linux/cache.h> #include <linux/rtnetlink.h> @@ -69,6 +70,7 @@ #include <net/sock.h> #include <net/checksum.h> #include <net/gso.h> +#include <net/hotdata.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/mpls.h> @@ -88,15 +90,10 @@ #include "dev.h" #include "sock_destructor.h" -struct kmem_cache *skbuff_cache __ro_after_init; -static struct kmem_cache *skbuff_fclone_cache __ro_after_init; #ifdef CONFIG_SKB_EXTENSIONS static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif - -static struct kmem_cache *skb_small_head_cache __ro_after_init; - #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. @@ -112,8 +109,23 @@ static struct kmem_cache *skb_small_head_cache __ro_after_init; #define SKB_SMALL_HEAD_HEADROOM \ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) -int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; -EXPORT_SYMBOL(sysctl_max_skb_frags); +/* kcm_write_msgs() relies on casting paged frags to bio_vec to use + * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the + * netmem is a page. 
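These layout checks exist so that paged frags can be handed to the iov_iter machinery as bio_vecs without copying. A sketch of the cast they protect, in the style of the kcm use mentioned in the comment (the helper name is illustrative):

/* Valid only while skb_frag_t and bio_vec share the same layout, which is
 * what the static_asserts below enforce.
 */
static void example_frags_to_bvec_iter(struct sk_buff *skb, struct msghdr *msg)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);

	iov_iter_bvec(&msg->msg_iter, ITER_SOURCE,
		      (const struct bio_vec *)shinfo->frags,
		      shinfo->nr_frags, skb->data_len);
}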
+ */ +static_assert(offsetof(struct bio_vec, bv_page) == + offsetof(skb_frag_t, netmem)); +static_assert(sizeof_field(struct bio_vec, bv_page) == + sizeof_field(skb_frag_t, netmem)); + +static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len)); +static_assert(sizeof_field(struct bio_vec, bv_len) == + sizeof_field(skb_frag_t, len)); + +static_assert(offsetof(struct bio_vec, bv_offset) == + offsetof(skb_frag_t, offset)); +static_assert(sizeof_field(struct bio_vec, bv_offset) == + sizeof_field(skb_frag_t, offset)); #undef FN #define FN(reason) [SKB_DROP_REASON_##reason] = #reason, @@ -297,7 +309,8 @@ void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) fragsz = SKB_DATA_ALIGN(fragsz); - return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, + align_mask); } EXPORT_SYMBOL(__napi_alloc_frag_align); @@ -309,13 +322,15 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) if (in_hardirq() || irqs_disabled()) { struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); + data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, + align_mask); } else { struct napi_alloc_cache *nc; local_bh_disable(); nc = this_cpu_ptr(&napi_alloc_cache); - data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, + align_mask); local_bh_enable(); } return data; @@ -328,7 +343,7 @@ static struct sk_buff *napi_skb_cache_get(void) struct sk_buff *skb; if (unlikely(!nc->skb_count)) { - nc->skb_count = kmem_cache_alloc_bulk(skbuff_cache, + nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, GFP_ATOMIC, NAPI_SKB_CACHE_BULK, nc->skb_cache); @@ -337,7 +352,7 @@ static struct sk_buff *napi_skb_cache_get(void) } skb = nc->skb_cache[--nc->skb_count]; - kasan_unpoison_object_data(skbuff_cache, skb); + kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache)); return skb; } @@ -395,7 +410,7 @@ struct sk_buff *slab_build_skb(void *data) struct sk_buff *skb; unsigned int size; - skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); if (unlikely(!skb)) return NULL; @@ -446,7 +461,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; - skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); if (unlikely(!skb)) return NULL; @@ -557,7 +572,7 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, obj_size = SKB_HEAD_ALIGN(*size); if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && !(flags & KMALLOC_NOT_NORMAL_BITS)) { - obj = kmem_cache_alloc_node(skb_small_head_cache, + obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); *size = SKB_SMALL_HEAD_CACHE_SIZE; @@ -565,7 +580,7 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, goto out; /* Try again but now we are using pfmemalloc reserves */ ret_pfmemalloc = true; - obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node); + obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node); goto out; } @@ -628,7 +643,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, u8 *data; cache = (flags & SKB_ALLOC_FCLONE) - ? skbuff_fclone_cache : skbuff_cache; + ? 
net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache; if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) gfp_mask |= __GFP_MEMALLOC; @@ -758,10 +773,9 @@ skb_fail: EXPORT_SYMBOL(__netdev_alloc_skb); /** - * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for * @len: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages * * Allocate a new sk_buff for use in NAPI receive. This buffer will * attempt to allocate the head from a special reserved region used @@ -770,9 +784,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, - gfp_t gfp_mask) +struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) { + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; struct napi_alloc_cache *nc; struct sk_buff *skb; bool pfmemalloc; @@ -843,19 +857,19 @@ skb_success: skb_fail: return skb; } -EXPORT_SYMBOL(__napi_alloc_skb); +EXPORT_SYMBOL(napi_alloc_skb); -void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, - int size, unsigned int truesize) +void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, + int off, int size, unsigned int truesize) { DEBUG_NET_WARN_ON_ONCE(size > truesize); - skb_fill_page_desc(skb, i, page, off, size); + skb_fill_netmem_desc(skb, i, netmem, off, size); skb->len += size; skb->data_len += size; skb->truesize += truesize; } -EXPORT_SYMBOL(skb_add_rx_frag); +EXPORT_SYMBOL(skb_add_rx_frag_netmem); void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize) @@ -890,12 +904,106 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +static bool is_pp_page(struct page *page) +{ + return (page->pp_magic & ~0x3UL) == PP_SIGNATURE; +} + +int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, + unsigned int headroom) +{ #if IS_ENABLED(CONFIG_PAGE_POOL) -bool napi_pp_put_page(struct page *page, bool napi_safe) + u32 size, truesize, len, max_head_size, off; + struct sk_buff *skb = *pskb, *nskb; + int err, i, head_off; + void *data; + + /* XDP does not support fraglist so we need to linearize + * the skb. 
+ */ + if (skb_has_frag_list(skb)) + return -EOPNOTSUPP; + + max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom); + if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE) + return -ENOMEM; + + size = min_t(u32, skb->len, max_head_size); + truesize = SKB_HEAD_ALIGN(size) + headroom; + data = page_pool_dev_alloc_va(pool, &truesize); + if (!data) + return -ENOMEM; + + nskb = napi_build_skb(data, truesize); + if (!nskb) { + page_pool_free_va(pool, data, true); + return -ENOMEM; + } + + skb_reserve(nskb, headroom); + skb_copy_header(nskb, skb); + skb_mark_for_recycle(nskb); + + err = skb_copy_bits(skb, 0, nskb->data, size); + if (err) { + consume_skb(nskb); + return err; + } + skb_put(nskb, size); + + head_off = skb_headroom(nskb) - skb_headroom(skb); + skb_headers_offset_update(nskb, head_off); + + off = size; + len = skb->len - off; + for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { + struct page *page; + u32 page_off; + + size = min_t(u32, len, PAGE_SIZE); + truesize = size; + + page = page_pool_dev_alloc(pool, &page_off, &truesize); + if (!page) { + consume_skb(nskb); + return -ENOMEM; + } + + skb_add_rx_frag(nskb, i, page, page_off, size, truesize); + err = skb_copy_bits(skb, off, page_address(page) + page_off, + size); + if (err) { + consume_skb(nskb); + return err; + } + + len -= size; + off += size; + } + + consume_skb(skb); + *pskb = nskb; + + return 0; +#else + return -EOPNOTSUPP; +#endif +} +EXPORT_SYMBOL(skb_pp_cow_data); + +int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, + struct bpf_prog *prog) { - bool allow_direct = false; - struct page_pool *pp; + if (!prog->aux->xdp_has_frags) + return -EINVAL; + + return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM); +} +EXPORT_SYMBOL(skb_cow_data_for_xdp); +#if IS_ENABLED(CONFIG_PAGE_POOL) +bool napi_pp_put_page(struct page *page) +{ page = compound_head(page); /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation @@ -905,57 +1013,68 @@ bool napi_pp_put_page(struct page *page, bool napi_safe) * and page_is_pfmemalloc() is checked in __page_pool_put_page() * to avoid recycling the pfmemalloc page. */ - if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + if (unlikely(!is_pp_page(page))) return false; - pp = page->pp; - - /* Allow direct recycle if we have reasons to believe that we are - * in the same context as the consumer would run, so there's - * no possible race. - * __page_pool_put_page() makes sure we're not in hardirq context - * and interrupts are enabled prior to accessing the cache. - */ - if (napi_safe || in_softirq()) { - const struct napi_struct *napi = READ_ONCE(pp->p.napi); - - allow_direct = napi && - READ_ONCE(napi->list_owner) == smp_processor_id(); - } - - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. - * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. 
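Stepping back to the skb_pp_cow_data()/skb_cow_data_for_xdp() helpers added above, a sketch of a caller that wants to run an XDP program on an skb (the surrounding driver function is hypothetical; error values follow the helper bodies above):

static int example_xdp_on_skb(struct page_pool *pool, struct bpf_prog *prog,
			      struct sk_buff **pskb)
{
	int err;

	/* Re-materialize head and frags from the page pool so XDP may
	 * write to them: -EOPNOTSUPP for fraglist skbs, -EINVAL if the
	 * program lacks xdp_has_frags, -ENOMEM if the data does not fit.
	 */
	err = skb_cow_data_for_xdp(pool, pskb, prog);
	if (err)
		return err;

	/* ... build an xdp_buff over (*pskb)->data and run the program ... */
	return 0;
}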
- */ - page_pool_put_full_page(pp, page, allow_direct); + page_pool_put_full_page(page->pp, page, false); return true; } EXPORT_SYMBOL(napi_pp_put_page); #endif -static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) +static bool skb_pp_recycle(struct sk_buff *skb, void *data) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return napi_pp_put_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(virt_to_page(data)); +} + +/** + * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb + * @skb: page pool aware skb + * + * Increase the fragment reference count (pp_ref_count) of a skb. This is + * intended to gain fragment references only for page pool aware skbs, + * i.e. when skb->pp_recycle is true, and not for fragments in a + * non-pp-recycling skb. It has a fallback to increase references on normal + * pages, as page pool aware skbs may also have normal page fragments. + */ +static int skb_pp_frag_ref(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo; + struct page *head_page; + int i; + + if (!skb->pp_recycle) + return -EINVAL; + + shinfo = skb_shinfo(skb); + + for (i = 0; i < shinfo->nr_frags; i++) { + head_page = compound_head(skb_frag_page(&shinfo->frags[i])); + if (likely(is_pp_page(head_page))) + page_pool_ref_page(head_page); + else + page_ref_inc(head_page); + } + return 0; } static void skb_kfree_head(void *head, unsigned int end_offset) { if (end_offset == SKB_SMALL_HEAD_HEADROOM) - kmem_cache_free(skb_small_head_cache, head); + kmem_cache_free(net_hotdata.skb_small_head_cache, head); else kfree(head); } -static void skb_free_head(struct sk_buff *skb, bool napi_safe) +static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; if (skb->head_frag) { - if (skb_pp_recycle(skb, head, napi_safe)) + if (skb_pp_recycle(skb, head)) return; skb_free_frag(head); } else { @@ -963,15 +1082,12 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe) } } -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, - bool napi_safe) +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; - if (skb->cloned && - atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, - &shinfo->dataref)) + if (!skb_data_unref(skb, shinfo)) goto exit; if (skb_zcopy(skb)) { @@ -983,13 +1099,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, } for (i = 0; i < shinfo->nr_frags; i++) - napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe); + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); free_head: if (shinfo->frag_list) kfree_skb_list_reason(shinfo->frag_list, reason); - skb_free_head(skb, napi_safe); + skb_free_head(skb); exit: /* When we clone an SKB we copy the reycling bit. 
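The skb_data_unref() call above replaces the open-coded clone/dataref test; a simplified sketch equivalent to the code it replaces (the real helper is declared in skbuff.h and may add a fast path for the single-reference case):

static inline bool example_skb_data_unref(const struct sk_buff *skb,
					  struct skb_shared_info *shinfo)
{
	int bias;

	if (!skb->cloned)
		return true;	/* sole owner of the data */

	bias = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;

	/* true only when this drops the last data reference */
	return !atomic_sub_return(bias, &shinfo->dataref);
}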
The pp_recycle * bit is only set on the head though, so in order to avoid races @@ -1012,7 +1128,7 @@ static void kfree_skbmem(struct sk_buff *skb) switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: - kmem_cache_free(skbuff_cache, skb); + kmem_cache_free(net_hotdata.skbuff_cache, skb); return; case SKB_FCLONE_ORIG: @@ -1033,7 +1149,7 @@ static void kfree_skbmem(struct sk_buff *skb) if (!refcount_dec_and_test(&fclones->fclone_ref)) return; fastpath: - kmem_cache_free(skbuff_fclone_cache, fclones); + kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones); } void skb_release_head_state(struct sk_buff *skb) @@ -1050,12 +1166,11 @@ void skb_release_head_state(struct sk_buff *skb) } /* Free everything but the sk_buff shell. */ -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, - bool napi_safe) +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_head_state(skb); if (likely(skb->head)) - skb_release_data(skb, reason, napi_safe); + skb_release_data(skb, reason); } /** @@ -1069,7 +1184,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, void __kfree_skb(struct sk_buff *skb) { - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false); + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); @@ -1126,11 +1241,11 @@ static void kfree_skb_add_bulk(struct sk_buff *skb, return; } - skb_release_all(skb, reason, false); + skb_release_all(skb, reason); sa->skb_array[sa->skb_count++] = skb; if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { - kmem_cache_free_bulk(skbuff_cache, KFREE_SKB_BULK_SIZE, + kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE, sa->skb_array); sa->skb_count = 0; } @@ -1155,7 +1270,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) } if (sa.skb_count) - kmem_cache_free_bulk(skbuff_cache, sa.skb_count, sa.skb_array); + kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array); } EXPORT_SYMBOL(kfree_skb_list_reason); @@ -1187,22 +1302,28 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) has_trans = skb_transport_header_was_set(skb); printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" - "mac=(%d,%d) net=(%d,%d) trans=%d\n" + "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n" "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" - "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" - "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", + "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n" + "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" + "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n" + "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n", level, skb->len, headroom, skb_headlen(skb), tailroom, has_mac ? skb->mac_header : -1, has_mac ? skb_mac_header_len(skb) : -1, + skb->mac_len, skb->network_header, has_trans ? skb_network_header_len(skb) : -1, has_trans ? 
skb->transport_header : -1, sh->tx_flags, sh->nr_frags, sh->gso_size, sh->gso_type, sh->gso_segs, - skb->csum, skb->ip_summed, skb->csum_complete_sw, - skb->csum_valid, skb->csum_level, + skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level, skb->hash, skb->sw_hash, skb->l4_hash, - ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); + ntohs(skb->protocol), skb->pkt_type, skb->skb_iif, + skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all, + skb->encapsulation, skb->inner_protocol, skb->inner_mac_header, + skb->inner_network_header, skb->inner_transport_header); if (dev) printk("%sdev name=%s feat=%pNF\n", @@ -1300,7 +1421,7 @@ EXPORT_SYMBOL(consume_skb); void __consume_stateless_skb(struct sk_buff *skb) { trace_consume_skb(skb, __builtin_return_address(0)); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); kfree_skbmem(skb); } @@ -1309,15 +1430,17 @@ static void napi_skb_cache_put(struct sk_buff *skb) struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); u32 i; - kasan_poison_object_data(skbuff_cache, skb); + if (!kasan_mempool_poison_object(skb)) + return; + nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) - kasan_unpoison_object_data(skbuff_cache, - nc->skb_cache[i]); + kasan_mempool_unpoison_object(nc->skb_cache[i], + kmem_cache_size(net_hotdata.skbuff_cache)); - kmem_cache_free_bulk(skbuff_cache, NAPI_SKB_CACHE_HALF, + kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF, nc->skb_cache + NAPI_SKB_CACHE_HALF); nc->skb_count = NAPI_SKB_CACHE_HALF; } @@ -1325,7 +1448,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) { - skb_release_all(skb, reason, true); + skb_release_all(skb, reason); napi_skb_cache_put(skb); } @@ -1363,7 +1486,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - skb_release_all(skb, SKB_CONSUMED, !!budget); + skb_release_all(skb, SKB_CONSUMED); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); @@ -1494,7 +1617,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg); */ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { - skb_release_all(dst, SKB_CONSUMED, false); + skb_release_all(dst, SKB_CONSUMED); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); @@ -1562,7 +1685,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) return NULL; } - uarg->ubuf.callback = msg_zerocopy_callback; + uarg->ubuf.ops = &msg_zerocopy_ubuf_ops; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; @@ -1588,7 +1711,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, u32 bytelen, next; /* there might be non MSG_ZEROCOPY users */ - if (uarg->callback != msg_zerocopy_callback) + if (uarg->ops != &msg_zerocopy_ubuf_ops) return NULL; /* realloc only when socket is locked (TCP, UDP cork), @@ -1699,8 +1822,8 @@ release: sock_put(sk); } -void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) +static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); @@ -1709,7 +1832,6 @@ void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, if (refcount_dec_and_test(&uarg->refcnt)) __msg_zerocopy_callback(uarg_zc); } 
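With the rename above, zerocopy completion is dispatched through a struct ubuf_info_ops table rather than a per-uarg callback pointer; a sketch of what another zerocopy user would now provide (names are illustrative):

static void example_zc_complete(struct sk_buff *skb, struct ubuf_info *uarg,
				bool success)
{
	/* release notification slots, account bytes, wake the sender ... */
}

static const struct ubuf_info_ops example_zc_ops = {
	.complete = example_zc_complete,
	/* .link_skb, when provided, takes over binding the uarg to the skb
	 * in skb_zerocopy_iter_stream() instead of skb_zcopy_set(), as the
	 * hunk further down shows.
	 */
};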
-EXPORT_SYMBOL_GPL(msg_zerocopy_callback); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { @@ -1719,10 +1841,15 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) uarg_to_msgzc(uarg)->len--; if (have_uref) - msg_zerocopy_callback(NULL, uarg, true); + msg_zerocopy_complete(NULL, uarg, true); } EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); +const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { + .complete = msg_zerocopy_complete, +}; +EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) @@ -1730,11 +1857,18 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct ubuf_info *orig_uarg = skb_zcopy(skb); int err, orig_len = skb->len; - /* An skb can only point to one uarg. This edge case happens when - * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. - */ - if (orig_uarg && uarg != orig_uarg) - return -EEXIST; + if (uarg->ops->link_skb) { + err = uarg->ops->link_skb(skb, uarg); + if (err) + return err; + } else { + /* An skb can only point to one uarg. This edge case happens + * when TCP appends to an skb, but zerocopy_realloc triggered + * a new alloc. + */ + if (orig_uarg && uarg != orig_uarg) + return -EEXIST; + } err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { @@ -1748,7 +1882,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return err; } - skb_zcopy_set(skb, uarg, NULL); + if (!uarg->ops->link_skb) + skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); @@ -1868,10 +2003,11 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) /* skb frags point to kernel buffers */ for (i = 0; i < new_frags - 1; i++) { - __skb_fill_page_desc(skb, i, head, 0, psize); + __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize); head = (struct page *)page_private(head); } - __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); + __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0, + d_off); skb_shinfo(skb)->nr_frags = new_frags; release: @@ -1913,7 +2049,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - n = kmem_cache_alloc(skbuff_cache, gfp_mask); + n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask); if (!n) return NULL; @@ -1976,11 +2112,17 @@ static inline int skb_alloc_rx_flag(const struct sk_buff *skb) struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { - int headerlen = skb_headroom(skb); - unsigned int size = skb_end_offset(skb) + skb->data_len; - struct sk_buff *n = __alloc_skb(size, gfp_mask, - skb_alloc_rx_flag(skb), NUMA_NO_NODE); + struct sk_buff *n; + unsigned int size; + int headerlen; + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) + return NULL; + + headerlen = skb_headroom(skb); + size = skb_end_offset(skb) + skb->data_len; + n = __alloc_skb(size, gfp_mask, + skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; @@ -2125,9 +2267,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); } else { - skb_free_head(skb, false); + skb_free_head(skb); } off = (data + nhead) - skb->head; @@ -2308,12 +2450,17 @@ struct sk_buff *skb_copy_expand(const struct 
sk_buff *skb, /* * Allocate the copy buffer */ - struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, - gfp_mask, skb_alloc_rx_flag(skb), - NUMA_NO_NODE); - int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; + struct sk_buff *n; + int oldheadroom; + + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) + return NULL; + oldheadroom = skb_headroom(skb); + n = __alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); if (!n) return NULL; @@ -3609,7 +3756,8 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) if (plen) { page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); - __skb_fill_page_desc(to, 0, page, offset, plen); + __skb_fill_netmem_desc(to, 0, page_to_netmem(page), + offset, plen); get_page(page); j = 1; len -= plen; @@ -4522,8 +4670,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, /* GSO partial only requires that we trim off any excess that * doesn't fit into an MSS sized block, so take care of that * now. + * Cap len to not accidentally hit GSO_BY_FRAGS. */ - partial_segs = len / mss; + partial_segs = min(len, GSO_BY_FRAGS - 1) / mss; if (partial_segs > 1) mss *= partial_segs; else @@ -4824,7 +4973,9 @@ static __always_inline unsigned int skb_ext_total_length(void) static void skb_extensions_init(void) { BUILD_BUG_ON(SKB_EXT_NUM >= 8); +#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL) BUILD_BUG_ON(skb_ext_total_length() > 255); +#endif skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), @@ -4848,7 +4999,7 @@ static void skb_extensions_init(void) {} void __init skb_init(void) { - skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", + net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC| @@ -4856,7 +5007,7 @@ void __init skb_init(void) offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); - skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", + net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, @@ -4865,7 +5016,7 @@ void __init skb_init(void) * struct skb_shared_info is located at the end of skb->head, * and should not be copied to/from user. */ - skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", + net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", SKB_SMALL_HEAD_CACHE_SIZE, 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, @@ -5738,7 +5889,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) { if (head_stolen) { skb_release_head_state(skb); - kmem_cache_free(skbuff_cache, skb); + kmem_cache_free(net_hotdata.skbuff_cache, skb); } else { __kfree_skb(skb); } @@ -5764,17 +5915,12 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return false; /* In general, avoid mixing page_pool and non-page_pool allocated - * pages within the same SKB. Additionally avoid dealing with clones - * with page_pool pages, in case the SKB is using page_pool fragment - * references (page_pool_alloc_frag()). Since we only take full page - * references for cloned SKBs at the moment that would result in - * inconsistent reference counts. 
- * In theory we could take full references if @from is cloned and - * !@to->pp_recycle but its tricky (due to potential race with - * the clone disappearing) and rare, so not worth dealing with. + * pages within the same SKB. In theory we could take full + * references if @from is cloned and !@to->pp_recycle but its + * tricky (due to potential race with the clone disappearing) and + * rare, so not worth dealing with. */ - if (to->pp_recycle != from->pp_recycle || - (from->pp_recycle && skb_cloned(from))) + if (to->pp_recycle != from->pp_recycle) return false; if (len <= skb_tailroom(to)) { @@ -5831,8 +5977,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, /* if the skb is not cloned this does nothing * since we set nr_frags to 0. */ - for (i = 0; i < from_shinfo->nr_frags; i++) - __skb_frag_ref(&from_shinfo->frags[i]); + if (skb_pp_frag_ref(from)) { + for (i = 0; i < from_shinfo->nr_frags; i++) + __skb_frag_ref(&from_shinfo->frags[i]); + } to->truesize += delta; to->len += len; @@ -5959,6 +6107,31 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) } EXPORT_SYMBOL(skb_ensure_writable); +int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev) +{ + int needed_headroom = dev->needed_headroom; + int needed_tailroom = dev->needed_tailroom; + + /* For tail taggers, we need to pad short frames ourselves, to ensure + * that the tail tag does not fail at its role of being at the end of + * the packet, once the conduit interface pads the frame. Account for + * that pad length here, and pad later. + */ + if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) + needed_tailroom += ETH_ZLEN - skb->len; + /* skb_headroom() returns unsigned int... */ + needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); + needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); + + if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) + /* No reallocation needed, yay! */ + return 0; + + return pskb_expand_head(skb, needed_headroom, needed_tailroom, + GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_ensure_writable_head_tail); + /* remove VLAN header from packet and update csum accordingly. 
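A sketch of the intended caller of skb_ensure_writable_head_tail() above: a tagging driver's transmit path (the function is hypothetical; padding short frames up to ETH_ZLEN is still the caller's job afterwards):

static struct sk_buff *example_tag_xmit(struct sk_buff *skb,
					struct net_device *dev)
{
	if (skb_ensure_writable_head_tail(skb, dev)) {
		kfree_skb(skb);
		return NULL;
	}

	/* There is now room for dev->needed_headroom in front and
	 * dev->needed_tailroom (plus the short-frame pad the helper
	 * accounted for) behind, and the data is private to this skb.
	 */
	return skb;
}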
* expects a non skb_vlan_tag_present skb with a vlan tag payload */ @@ -6402,12 +6575,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); } else { /* we can reuse existing recount- all we did was * relocate values */ - skb_free_head(skb, false); + skb_free_head(skb); } skb->head = data; @@ -6542,7 +6715,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_kfree_head(data, size); return -ENOMEM; } - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); skb->head = data; skb->head_frag = 0; @@ -6674,6 +6847,14 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, xfrm_state_hold(sp->xvec[i]); } #endif +#ifdef CONFIG_MCTP_FLOWS + if (old_active & (1 << SKB_EXT_MCTP)) { + struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP); + + if (flow->key) + refcount_inc(&flow->key->refs); + } +#endif __skb_ext_put(old); return new; } @@ -6814,6 +6995,19 @@ free_now: EXPORT_SYMBOL(__skb_ext_put); #endif /* CONFIG_SKB_EXTENSIONS */ +static void kfree_skb_napi_cache(struct sk_buff *skb) +{ + /* if SKB is a clone, don't handle this case */ + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { + __kfree_skb(skb); + return; + } + + local_bh_disable(); + __napi_kfree_skb(skb, SKB_CONSUMED); + local_bh_enable(); +} + /** * skb_attempt_defer_free - queue skb for remote freeing * @skb: buffer @@ -6829,10 +7023,10 @@ void skb_attempt_defer_free(struct sk_buff *skb) unsigned int defer_max; bool kick; - if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || - !cpu_online(cpu) || - cpu == raw_smp_processor_id()) { -nodefer: __kfree_skb(skb); + if (cpu == raw_smp_processor_id() || + WARN_ON_ONCE(cpu >= nr_cpu_ids) || + !cpu_online(cpu)) { +nodefer: kfree_skb_napi_cache(skb); return; } @@ -6840,7 +7034,7 @@ nodefer: __kfree_skb(skb); DEBUG_NET_WARN_ON_ONCE(skb->destructor); sd = &per_cpu(softnet_data, cpu); - defer_max = READ_ONCE(sysctl_skb_defer_max); + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); if (READ_ONCE(sd->defer_count) >= defer_max) goto nodefer; @@ -6858,8 +7052,8 @@ nodefer: __kfree_skb(skb); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). 
*/ - if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) - smp_call_function_single_async(cpu, &sd->defer_csd); + if (unlikely(kick)) + kick_defer_list_purge(sd, cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, @@ -6892,7 +7086,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp) { - size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); + size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); struct page *pages[8], **ppages = pages; ssize_t spliced = 0, ret = 0; unsigned int i; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 6c31eefbd777..fd20aae30be2 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -826,6 +826,8 @@ static void sk_psock_destroy(struct work_struct *work) if (psock->sk_redir) sock_put(psock->sk_redir); + if (psock->sk_pair) + sock_put(psock->sk_pair); sock_put(psock->sk); kfree(psock); } @@ -1225,7 +1227,7 @@ static void sk_psock_verdict_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); if (psock) - psock->saved_data_ready(sk); + sk_psock_data_ready(sk, psock); rcu_read_unlock(); } } diff --git a/net/core/sock.c b/net/core/sock.c index fef349dd72fa..8d6e638b5426 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -107,6 +107,7 @@ #include <linux/interrupt.h> #include <linux/poll.h> #include <linux/tcp.h> +#include <linux/udp.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/user_namespace.h> @@ -126,6 +127,7 @@ #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> +#include <net/proto_memory.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> @@ -283,10 +285,6 @@ EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -/* Maximal space eaten by iovec or ancillary data plus some space */ -int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); -EXPORT_SYMBOL(sysctl_optmem_max); - int sysctl_tstamp_allow_data __read_mostly = 1; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); @@ -484,7 +482,7 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { + if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; @@ -554,7 +552,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, skb->dev = NULL; - if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } @@ -1191,6 +1189,17 @@ int sk_setsockopt(struct sock *sk, int level, int optname, */ WRITE_ONCE(sk->sk_txrehash, (u8)val); return 0; + case SO_PEEK_OFF: + { + int (*set_peek_off)(struct sock *sk, int val); + + set_peek_off = READ_ONCE(sock->ops)->set_peek_off; + if (set_peek_off) + ret = set_peek_off(sk, val); + else + ret = -EOPNOTSUPP; + return ret; + } } sockopt_lock_sock(sk); @@ -1433,18 +1442,6 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break; - case SO_PEEK_OFF: - { - int (*set_peek_off)(struct sock *sk, int val); - - set_peek_off = READ_ONCE(sock->ops)->set_peek_off; - if (set_peek_off) - ret = set_peek_off(sk, val); - else - ret = -EOPNOTSUPP; - break; - } - case SO_NOFCS: 
sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; @@ -1711,9 +1708,16 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: lv = sizeof(v.timestamping); - v.timestamping.flags = READ_ONCE(sk->sk_tsflags); - v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); + /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only + * returning the flags when they were set through the same option. + * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. + */ + if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { + v.timestamping.flags = READ_ONCE(sk->sk_tsflags); + v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); + } break; case SO_RCVTIMEO_OLD: @@ -2049,8 +2053,9 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); - memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, - prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); + unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, + prot->obj_size - offsetof(struct sock, sk_dontcopy_end), + /* alloc is larger than struct, see sk_prot_alloc() */); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; @@ -2521,13 +2526,12 @@ EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { -#ifdef CONFIG_TLS_DEVICE /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. */ - if (skb->decrypted) + if (skb_is_decrypted(skb)) return false; -#endif + return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } @@ -2579,8 +2583,18 @@ EXPORT_SYMBOL(sock_efree); #ifdef CONFIG_INET void sock_pfree(struct sk_buff *skb) { - if (sk_is_refcounted(skb->sk)) - sock_gen_put(skb->sk); + struct sock *sk = skb->sk; + + if (!sk_is_refcounted(sk)) + return; + + if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { + inet_reqsk(sk)->rsk_listener = NULL; + reqsk_free(inet_reqsk(sk)); + return; + } + + sock_gen_put(sk); } EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ @@ -2651,7 +2665,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > - READ_ONCE(sysctl_optmem_max)) + READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); @@ -2669,7 +2683,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { - int optmem_max = READ_ONCE(sysctl_optmem_max); + int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if ((unsigned int)size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { @@ -2806,6 +2820,7 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; @@ -3322,7 +3337,7 @@ static void sock_def_error_report(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } @@ -3337,7 +3352,7 @@ void sock_def_readable(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | 
EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } @@ -3357,7 +3372,7 @@ static void sock_def_write_space(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); @@ -3382,7 +3397,7 @@ static void sock_def_write_space_wfree(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } } @@ -4140,8 +4155,14 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) { struct sock *sk = p; - return !skb_queue_empty_lockless(&sk->sk_receive_queue) || - sk_busy_loop_timeout(sk, start_time); + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) + return true; + + if (sk_is_udp(sk) && + !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) + return true; + + return sk_busy_loop_timeout(sk, start_time); } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ @@ -4213,3 +4234,65 @@ int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) return sock_ioctl_out(sk, cmd, arg); } EXPORT_SYMBOL(sk_ioctl); + +static int __init sock_struct_check(void) +{ + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, 
sk_write_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); + return 0; +} + +core_initcall(sock_struct_check); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index b1e29e18d1d6..654122838025 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -16,9 +16,10 @@ #include <linux/inet_diag.h> #include <linux/sock_diag.h> -static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; -static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); -static DEFINE_MUTEX(sock_diag_table_mutex); +static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX]; + +static struct sock_diag_inet_compat __rcu *inet_rcv_compat; + static struct workqueue_struct *broadcast_wq; DEFINE_COOKIE(sock_cookie); @@ -122,6 +123,24 @@ static size_t sock_diag_nlmsg_size(void) + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ } +static const struct sock_diag_handler *sock_diag_lock_handler(int family) +{ + const struct sock_diag_handler *handler; + + rcu_read_lock(); + handler = rcu_dereference(sock_diag_handlers[family]); + if (handler && !try_module_get(handler->owner)) + handler = NULL; + rcu_read_unlock(); + + return handler; +} + +static void sock_diag_unlock_handler(const struct sock_diag_handler *handler) +{ + module_put(handler->owner); +} + static void sock_diag_broadcast_destroy_work(struct work_struct *work) { struct broadcast_sk *bsk = @@ -138,12 +157,12 @@ static void sock_diag_broadcast_destroy_work(struct work_struct *work) if (!skb) goto out; - mutex_lock(&sock_diag_table_mutex); - hndl = sock_diag_handlers[sk->sk_family]; - if (hndl && hndl->get_info) - err = hndl->get_info(skb, sk); - mutex_unlock(&sock_diag_table_mutex); - + hndl = sock_diag_lock_handler(sk->sk_family); + if (hndl) { + if (hndl->get_info) + err = hndl->get_info(skb, sk); + sock_diag_unlock_handler(hndl); + } if (!err) nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group, GFP_KERNEL); @@ -166,51 +185,45 @@ void sock_diag_broadcast_destroy(struct sock *sk) 
queue_work(broadcast_wq, &bsk->work); } -void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr) { - mutex_lock(&sock_diag_table_mutex); - inet_rcv_compat = fn; - mutex_unlock(&sock_diag_table_mutex); + xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat, + ptr); } EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); -void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr) { - mutex_lock(&sock_diag_table_mutex); - inet_rcv_compat = NULL; - mutex_unlock(&sock_diag_table_mutex); + const struct sock_diag_inet_compat *old; + + old = xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat, + NULL); + WARN_ON_ONCE(old != ptr); } EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); int sock_diag_register(const struct sock_diag_handler *hndl) { - int err = 0; + int family = hndl->family; - if (hndl->family >= AF_MAX) + if (family >= AF_MAX) return -EINVAL; - mutex_lock(&sock_diag_table_mutex); - if (sock_diag_handlers[hndl->family]) - err = -EBUSY; - else - sock_diag_handlers[hndl->family] = hndl; - mutex_unlock(&sock_diag_table_mutex); - - return err; + return !cmpxchg((const struct sock_diag_handler **) + &sock_diag_handlers[family], + NULL, hndl) ? 0 : -EBUSY; } EXPORT_SYMBOL_GPL(sock_diag_register); -void sock_diag_unregister(const struct sock_diag_handler *hnld) +void sock_diag_unregister(const struct sock_diag_handler *hndl) { - int family = hnld->family; + int family = hndl->family; if (family >= AF_MAX) return; - mutex_lock(&sock_diag_table_mutex); - BUG_ON(sock_diag_handlers[family] != hnld); - sock_diag_handlers[family] = NULL; - mutex_unlock(&sock_diag_table_mutex); + xchg((const struct sock_diag_handler **)&sock_diag_handlers[family], + NULL); } EXPORT_SYMBOL_GPL(sock_diag_unregister); @@ -227,20 +240,20 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX); - if (sock_diag_handlers[req->sdiag_family] == NULL) + if (!rcu_access_pointer(sock_diag_handlers[req->sdiag_family])) sock_load_diag_module(req->sdiag_family, 0); - mutex_lock(&sock_diag_table_mutex); - hndl = sock_diag_handlers[req->sdiag_family]; + hndl = sock_diag_lock_handler(req->sdiag_family); if (hndl == NULL) - err = -ENOENT; - else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) + return -ENOENT; + + if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) err = hndl->dump(skb, nlh); else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy) err = hndl->destroy(skb, nlh); else err = -EOPNOTSUPP; - mutex_unlock(&sock_diag_table_mutex); + sock_diag_unlock_handler(hndl); return err; } @@ -248,20 +261,27 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + const struct sock_diag_inet_compat *ptr; int ret; switch (nlh->nlmsg_type) { case TCPDIAG_GETSOCK: case DCCPDIAG_GETSOCK: - if (inet_rcv_compat == NULL) + + if (!rcu_access_pointer(inet_rcv_compat)) sock_load_diag_module(AF_INET, 0); - mutex_lock(&sock_diag_table_mutex); - if (inet_rcv_compat != NULL) - ret = inet_rcv_compat(skb, nlh); - else - ret = -EOPNOTSUPP; - mutex_unlock(&sock_diag_table_mutex); + rcu_read_lock(); + ptr = rcu_dereference(inet_rcv_compat); + if (ptr && !try_module_get(ptr->owner)) + ptr = 
NULL; + rcu_read_unlock(); + + ret = -EOPNOTSUPP; + if (ptr) { + ret = ptr->fn(skb, nlh); + module_put(ptr->owner); + } return ret; case SOCK_DIAG_BY_FAMILY: @@ -272,13 +292,9 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, } } -static DEFINE_MUTEX(sock_diag_mutex); - static void sock_diag_rcv(struct sk_buff *skb) { - mutex_lock(&sock_diag_mutex); netlink_rcv_skb(skb, &sock_diag_rcv_msg); - mutex_unlock(&sock_diag_mutex); } static int sock_diag_bind(struct net *net, int group) @@ -286,12 +302,12 @@ static int sock_diag_bind(struct net *net, int group) switch (group) { case SKNLGRP_INET_TCP_DESTROY: case SKNLGRP_INET_UDP_DESTROY: - if (!sock_diag_handlers[AF_INET]) + if (!rcu_access_pointer(sock_diag_handlers[AF_INET])) sock_load_diag_module(AF_INET, 0); break; case SKNLGRP_INET6_TCP_DESTROY: case SKNLGRP_INET6_UDP_DESTROY: - if (!sock_diag_handlers[AF_INET6]) + if (!rcu_access_pointer(sock_diag_handlers[AF_INET6])) sock_load_diag_module(AF_INET6, 0); break; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 4292c2ed1828..9402889840bf 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,8 +24,16 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +/* This mutex is used to + * - protect race between prog/link attach/detach and link prog update, and + * - protect race between releasing and accessing map in bpf_link. + * A single global mutex lock is used since it is expected contention is low. + */ +static DEFINE_MUTEX(sockmap_mutex); + static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); + struct bpf_prog *old, struct bpf_link *link, + u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) @@ -71,7 +79,9 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); fdput(f); return ret; } @@ -103,7 +113,9 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) goto put_prog; } - ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); put_map: @@ -411,6 +423,9 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock *sk; int err = 0; + if (irqs_disabled()) + return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ + spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) @@ -536,6 +551,8 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); + if (sk_is_stream_unix(sk)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } @@ -931,6 +948,9 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) struct bpf_shtab_elem *elem; int ret = -ENOENT; + if (irqs_disabled()) + return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ + hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); @@ -1452,55 +1472,84 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int 
sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, - u32 which) +static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + struct bpf_link ***plink, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **cur_pprog; + struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - *pprog = &progs->msg_parser; + cur_pprog = &progs->msg_parser; + cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - *pprog = &progs->stream_parser; + cur_pprog = &progs->stream_parser; + cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - *pprog = &progs->stream_verdict; + cur_pprog = &progs->stream_verdict; + cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; - *pprog = &progs->skb_verdict; + cur_pprog = &progs->skb_verdict; + cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } + *pprog = cur_pprog; + if (plink) + *plink = cur_plink; return 0; } +/* Handle the following four cases: + * prog_attach: prog != NULL, old == NULL, link == NULL + * prog_detach: prog == NULL, old != NULL, link == NULL + * link_attach: prog != NULL, old == NULL, link != NULL + * link_detach: prog == NULL, old != NULL, link != NULL + */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) + struct bpf_prog *old, struct bpf_link *link, + u32 which) { struct bpf_prog **pprog; + struct bpf_link **plink; int ret; - ret = sock_map_prog_lookup(map, &pprog, which); + ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; - if (old) - return psock_replace_prog(pprog, prog, old); + /* for prog_attach/prog_detach/link_attach, return error if a bpf_link + * exists for that prog. 
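For context on the "link == NULL" cases listed above, this is roughly how the pre-existing attach path is driven from user space; a sketch only, with 'prog_fd' and 'map_fd' assumed to come from a loaded object and map created elsewhere:

#include <bpf/bpf.h>

static int attach_verdict_prog(int prog_fd, int map_fd)
{
        /* Plain BPF_PROG_ATTACH against the sockmap fd; detach later with
         * bpf_prog_detach2(prog_fd, map_fd, BPF_SK_SKB_STREAM_VERDICT). */
        return bpf_prog_attach(prog_fd, map_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
}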
+ */ + if ((!link || prog) && *plink) + return -EBUSY; - psock_set_prog(pprog, prog); - return 0; + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (!ret) + *plink = NULL; + } else { + psock_set_prog(pprog, prog); + if (link) + *plink = link; + } + + return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, @@ -1525,7 +1574,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr, rcu_read_lock(); - ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; @@ -1655,6 +1704,196 @@ void sock_map_close(struct sock *sk, long timeout) } EXPORT_SYMBOL_GPL(sock_map_close); +struct sockmap_link { + struct bpf_link link; + struct bpf_map *map; + enum bpf_attach_type attach_type; +}; + +static void sock_map_link_release(struct bpf_link *link) +{ + struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + + mutex_lock(&sockmap_mutex); + if (!sockmap_link->map) + goto out; + + WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, + sockmap_link->attach_type)); + + bpf_map_put_with_uref(sockmap_link->map); + sockmap_link->map = NULL; +out: + mutex_unlock(&sockmap_mutex); +} + +static int sock_map_link_detach(struct bpf_link *link) +{ + sock_map_link_release(link); + return 0; +} + +static void sock_map_link_dealloc(struct bpf_link *link) +{ + kfree(link); +} + +/* Handle the following two cases: + * case 1: link != NULL, prog != NULL, old != NULL + * case 2: link != NULL, prog != NULL, old == NULL + */ +static int sock_map_link_update_prog(struct bpf_link *link, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + struct bpf_prog **pprog, *old_link_prog; + struct bpf_link **plink; + int ret = 0; + + mutex_lock(&sockmap_mutex); + + /* If old prog is not NULL, ensure old prog is the same as link->prog. */ + if (old && link->prog != old) { + ret = -EPERM; + goto out; + } + /* Ensure link->prog has the same type/attach_type as the new prog. */ + if (link->prog->type != prog->type || + link->prog->expected_attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + + ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, + sockmap_link->attach_type); + if (ret) + goto out; + + /* return error if the stored bpf_link does not match the incoming bpf_link. 
*/ + if (link != *plink) { + ret = -EBUSY; + goto out; + } + + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (ret) + goto out; + } else { + psock_set_prog(pprog, prog); + } + + bpf_prog_inc(prog); + old_link_prog = xchg(&link->prog, prog); + bpf_prog_put(old_link_prog); + +out: + mutex_unlock(&sockmap_mutex); + return ret; +} + +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) +{ + u32 map_id = 0; + + mutex_lock(&sockmap_mutex); + if (sockmap_link->map) + map_id = sockmap_link->map->id; + mutex_unlock(&sockmap_mutex); + return map_id; +} + +static int sock_map_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + info->sockmap.map_id = map_id; + info->sockmap.attach_type = sockmap_link->attach_type; + return 0; +} + +static void sock_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + seq_printf(seq, "map_id:\t%u\n", map_id); + seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); +} + +static const struct bpf_link_ops sock_map_link_ops = { + .release = sock_map_link_release, + .dealloc = sock_map_link_dealloc, + .detach = sock_map_link_detach, + .update_prog = sock_map_link_update_prog, + .fill_link_info = sock_map_link_fill_info, + .show_fdinfo = sock_map_link_show_fdinfo, +}; + +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct sockmap_link *sockmap_link; + enum bpf_attach_type attach_type; + struct bpf_map *map; + int ret; + + if (attr->link_create.flags) + return -EINVAL; + + map = bpf_map_get_with_uref(attr->link_create.target_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { + ret = -EINVAL; + goto out; + } + + sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); + if (!sockmap_link) { + ret = -ENOMEM; + goto out; + } + + attach_type = attr->link_create.attach_type; + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); + sockmap_link->map = map; + sockmap_link->attach_type = attach_type; + + ret = bpf_link_prime(&sockmap_link->link, &link_primer); + if (ret) { + kfree(sockmap_link); + goto out; + } + + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); + mutex_unlock(&sockmap_mutex); + if (ret) { + bpf_link_cleanup(&link_primer); + goto out; + } + + /* Increase refcnt for the prog since when old prog is replaced with + * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. + * + * Actually, we do not need to increase refcnt for the prog since bpf_link + * will hold a reference. But in order to have less complexity w.r.t. + * replacing/setting prog, let us increase the refcnt to make things simpler. 
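A sketch of exercising sock_map_link_update_prog() from user space via bpf_link_update(); 'link_fd', 'new_prog_fd' and 'old_prog_fd' are assumed inputs. Passing BPF_F_REPLACE with old_prog_fd corresponds to the "old != NULL" case above, which fails with -EPERM if the fd does not match the program currently attached to the link:

#include <bpf/bpf.h>

static int swap_sockmap_link_prog(int link_fd, int new_prog_fd, int old_prog_fd)
{
        LIBBPF_OPTS(bpf_link_update_opts, opts,
                    .old_prog_fd = old_prog_fd,
                    .flags = BPF_F_REPLACE);

        /* Without the opts (old == NULL case), the new program simply
         * replaces whatever the link currently holds. */
        return bpf_link_update(link_fd, new_prog_fd, &opts);
}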
+ */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out: + bpf_map_put_with_uref(map); + return ret; +} + static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) diff --git a/net/core/stream.c b/net/core/stream.c index 96fbcb9bbb30..b16dfa568a2d 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -79,7 +79,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending--; } while (!done); - return 0; + return done < 0 ? done : 0; } EXPORT_SYMBOL(sk_stream_wait_connect); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 03f1edb948d7..c9fb9ad87485 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -23,6 +23,9 @@ #include <net/net_ratelimit.h> #include <net/busy_poll.h> #include <net/pkt_sched.h> +#include <net/hotdata.h> +#include <net/proto_memory.h> +#include <net/rps.h> #include "dev.h" @@ -30,6 +33,7 @@ static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; +static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -137,7 +141,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_lock(&sock_flow_mutex); - orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + orig_sock_table = rcu_dereference_protected( + net_hotdata.rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; @@ -158,7 +163,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_unlock(&sock_flow_mutex); return -ENOMEM; } - rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; + net_hotdata.rps_cpu_mask = + roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; @@ -169,7 +175,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, sock_table = NULL; if (sock_table != orig_sock_table) { - rcu_assign_pointer(rps_sock_flow_table, sock_table); + rcu_assign_pointer(net_hotdata.rps_sock_flow_table, + sock_table); if (sock_table) { static_branch_inc(&rps_needed); static_branch_inc(&rfs_needed); @@ -299,8 +306,8 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); - WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); - WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); + WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); @@ -408,6 +415,14 @@ static struct ctl_table net_core_table[] = { .extra1 = &min_rcvbuf, }, { + .procname = "mem_pcpu_rsv", + .data = &net_hotdata.sysctl_mem_pcpu_rsv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_mem_pcpu_rsv, + }, + { .procname = "dev_weight", .data = &weight_p, .maxlen = sizeof(int), @@ -430,7 +445,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "netdev_max_backlog", - .data = &netdev_max_backlog, + .data = &net_hotdata.max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -489,7 +504,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_tstamp_prequeue", - .data = 
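The new attach path added by sock_map_link_create() is reachable from user space with bpf_link_create(), where target_fd is the sockmap/sockhash fd. A sketch, with 'prog_fd' and 'map_fd' assumed inputs:

#include <bpf/bpf.h>

static int attach_verdict_prog_as_link(int prog_fd, int map_fd)
{
        /* link_create.flags must be 0, so no extra opts are needed. */
        int link_fd = bpf_link_create(prog_fd, map_fd,
                                      BPF_SK_SKB_STREAM_VERDICT, NULL);

        /* Closing link_fd (or process exit) detaches the program again,
         * see sock_map_link_release() above. */
        return link_fd;
}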
&netdev_tstamp_prequeue, + .data = &net_hotdata.tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -509,13 +524,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "optmem_max", - .data = &sysctl_optmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tstamp_allow_data", .data = &sysctl_tstamp_allow_data, .maxlen = sizeof(int), @@ -574,7 +582,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_budget", - .data = &netdev_budget, + .data = &net_hotdata.netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -588,7 +596,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "max_skb_frags", - .data = &sysctl_max_skb_frags, + .data = &net_hotdata.sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -597,7 +605,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "netdev_budget_usecs", - .data = &netdev_budget_usecs, + .data = &net_hotdata.netdev_budget_usecs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -630,7 +638,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "gro_normal_batch", - .data = &gro_normal_batch, + .data = &net_hotdata.gro_normal_batch, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -647,13 +655,12 @@ static struct ctl_table net_core_table[] = { }, { .procname = "skb_defer_max", - .data = &sysctl_skb_defer_max, + .data = &net_hotdata.sysctl_skb_defer_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { } }; static struct ctl_table netns_core_table[] = { @@ -674,6 +681,14 @@ static struct ctl_table netns_core_table[] = { .proc_handler = proc_dointvec_minmax }, { + .procname = "optmem_max", + .data = &init_net.core.sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .proc_handler = proc_dointvec_minmax + }, + { .procname = "txrehash", .data = &init_net.core.sysctl_txrehash, .maxlen = sizeof(u8), @@ -682,7 +697,6 @@ static struct ctl_table netns_core_table[] = { .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, - { } }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) @@ -700,20 +714,21 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl, *tmp; + size_t table_size = ARRAY_SIZE(netns_core_table); + struct ctl_table *tbl; tbl = netns_core_table; if (!net_eq(net, &init_net)) { + int i; tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; - for (tmp = tbl; tmp->procname; tmp++) - tmp->data += (char *)net - (char *)&init_net; + for (i = 0; i < table_size; ++i) + tbl[i].data += (char *)net - (char *)&init_net; } - net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, - ARRAY_SIZE(netns_core_table)); + net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size); if (net->core.sysctl_hdr == NULL) goto err_reg; @@ -728,7 +743,7 @@ err_dup: static __net_exit void sysctl_core_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); diff --git a/net/core/xdp.c b/net/core/xdp.c index df4789ab512d..41693154e426 
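With optmem_max moved from net_core_table into netns_core_table, /proc/sys/net/core/optmem_max becomes tunable per network namespace (the new mem_pcpu_rsv knob stays global). A small sketch that reads the value before and after entering a fresh namespace; unshare(CLONE_NEWNET) requires CAP_NET_ADMIN:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

static void show_optmem_max(const char *when)
{
        char buf[32] = "";
        FILE *f = fopen("/proc/sys/net/core/optmem_max", "r");

        if (f) {
                if (fgets(buf, sizeof(buf), f))
                        printf("%s: optmem_max = %s", when, buf);
                fclose(f);
        }
}

int main(void)
{
        show_optmem_max("initial netns");

        /* The new namespace gets its own copy of the sysctl and can be
         * tuned without affecting other namespaces. */
        if (unshare(CLONE_NEWNET) == 0)
                show_optmem_max("new netns");

        return 0;
}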
100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -16,6 +16,7 @@ #include <linux/bug.h> #include <net/page_pool/helpers.h> +#include <net/hotdata.h> #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ #include <trace/events/xdp.h> @@ -75,7 +76,7 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) xa = container_of(rcu, struct xdp_mem_allocator, rcu); /* Allow this ID to be reused */ - ida_simple_remove(&mem_id_pool, xa->mem.id); + ida_free(&mem_id_pool, xa->mem.id); kfree(xa); } @@ -242,7 +243,7 @@ static int __mem_id_cyclic_get(gfp_t gfp) int id; again: - id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp); + id = ida_alloc_range(&mem_id_pool, mem_id_next, MEM_ID_MAX - 1, gfp); if (id < 0) { if (id == -ENOSPC) { /* Cyclic allocator, reset next id */ @@ -317,7 +318,7 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, /* Insert allocator into ID lookup table */ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); if (IS_ERR(ptr)) { - ida_simple_remove(&mem_id_pool, mem->id); + ida_free(&mem_id_pool, mem->id); mem->id = 0; errno = PTR_ERR(ptr); goto err; @@ -589,7 +590,7 @@ EXPORT_SYMBOL_GPL(xdp_warn); int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) { - n_skb = kmem_cache_alloc_bulk(skbuff_cache, gfp, n_skb, skbs); + n_skb = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, n_skb, skbs); if (unlikely(!n_skb)) return -ENOMEM; @@ -658,7 +659,7 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, { struct sk_buff *skb; - skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); if (unlikely(!skb)) return NULL; @@ -696,9 +697,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) return nxdpf; } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); /** * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp. @@ -738,13 +737,46 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, return -EOPNOTSUPP; } -__diag_pop(); +/** + * bpf_xdp_metadata_rx_vlan_tag - Get XDP packet outermost VLAN tag + * @ctx: XDP context pointer. + * @vlan_proto: Destination pointer for VLAN Tag protocol identifier (TPID). + * @vlan_tci: Destination pointer for VLAN TCI (VID + DEI + PCP) + * + * In case of success, ``vlan_proto`` contains *Tag protocol identifier (TPID)*, + * usually ``ETH_P_8021Q`` or ``ETH_P_8021AD``, but some networks can use + * custom TPIDs. ``vlan_proto`` is stored in **network byte order (BE)** + * and should be used as follows: + * ``if (vlan_proto == bpf_htons(ETH_P_8021Q)) do_something();`` + * + * ``vlan_tci`` contains the remaining 16 bits of a VLAN tag. + * Driver is expected to provide those in **host byte order (usually LE)**, + * so the bpf program should not perform byte conversion. + * According to 802.1Q standard, *VLAN TCI (Tag control information)* + * is a bit field that contains: + * *VLAN identifier (VID)* that can be read with ``vlan_tci & 0xfff``, + * *Drop eligible indicator (DEI)* - 1 bit, + * *Priority code point (PCP)* - 3 bits. + * For detailed meaning of DEI and PCP, please refer to other sources. + * + * Return: + * * Returns 0 on success or ``-errno`` on error. 
+ * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc + * * ``-ENODATA`` : VLAN tag was not stripped or is not available + */ +__bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, + __be16 *vlan_proto, u16 *vlan_tci) +{ + return -EOPNOTSUPP; +} + +__bpf_kfunc_end_defs(); -BTF_SET8_START(xdp_metadata_kfunc_ids) +BTF_KFUNCS_START(xdp_metadata_kfunc_ids) #define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC -BTF_SET8_END(xdp_metadata_kfunc_ids) +BTF_KFUNCS_END(xdp_metadata_kfunc_ids) static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { .owner = THIS_MODULE, |
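A sketch of calling the new kfunc from an XDP program, following the byte-order rules spelled out in the kernel-doc above. The extern declaration mirrors the signature added by this patch; in practice the program must be loaded device-bound so a driver implementation is used, since the generic stub above only returns -EOPNOTSUPP:

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

extern int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
                                        __be16 *vlan_proto,
                                        __u16 *vlan_tci) __ksym;

SEC("xdp")
int xdp_vlan_md(struct xdp_md *ctx)
{
        __be16 vlan_proto;
        __u16 vlan_tci;

        /* -EOPNOTSUPP or -ENODATA: no usable VLAN tag, just pass. */
        if (bpf_xdp_metadata_rx_vlan_tag(ctx, &vlan_proto, &vlan_tci))
                return XDP_PASS;

        /* TPID is network byte order, TCI is host byte order. */
        if (vlan_proto == bpf_htons(ETH_P_8021Q)) {
                __u16 vid = vlan_tci & 0xfff;

                if (vid == 42)       /* example VID, purely illustrative */
                        return XDP_DROP;
        }
        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";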