aboutsummaryrefslogtreecommitdiff
path: root/net/core/dev.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/core/dev.c')
-rw-r--r--net/core/dev.c774
1 files changed, 509 insertions, 265 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 9f3f8930c691..331848eca7d3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -153,41 +153,20 @@
#include <linux/prandom.h>
#include <linux/once_lite.h>
#include <net/netdev_rx_queue.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/helpers.h>
+#include <net/rps.h>
#include "dev.h"
#include "net-sysfs.h"
static DEFINE_SPINLOCK(ptype_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-struct list_head ptype_all __read_mostly; /* Taps */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_extack(unsigned long val,
struct net_device *dev,
struct netlink_ext_ack *extack);
-static struct napi_struct *napi_by_id(unsigned int napi_id);
-
-/*
- * The @dev_base_head list is protected by @dev_base_lock and the rtnl
- * semaphore.
- *
- * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
- *
- * Writers must hold the rtnl semaphore while they loop through the
- * dev_base_head list, and hold dev_base_lock for writing when they do the
- * actual updates. This allows pure readers to access the list even
- * while a writer is preparing to update it.
- *
- * To put it another way, dev_base_lock is held for writing only to
- * protect against pure readers; the rtnl semaphore provides the
- * protection against other writers.
- *
- * See, for example usages, register_netdevice() and
- * unregister_netdevice(), which must be called with the rtnl
- * semaphore held.
- */
-DEFINE_RWLOCK(dev_base_lock);
-EXPORT_SYMBOL(dev_base_lock);
static DEFINE_MUTEX(ifalias_mutex);
@@ -201,8 +180,9 @@ static DECLARE_RWSEM(devnet_rename_sem);
static inline void dev_base_seq_inc(struct net *net)
{
- while (++net->dev_base_seq == 0)
- ;
+ unsigned int val = net->dev_base_seq + 1;
+
+ WRITE_ONCE(net->dev_base_seq, val ?: 1);
}
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
@@ -337,18 +317,27 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name)
return -ENOMEM;
netdev_name_node_add(net, name_node);
/* The node that holds dev->name acts as a head of per-device list. */
- list_add_tail(&name_node->list, &dev->name_node->list);
+ list_add_tail_rcu(&name_node->list, &dev->name_node->list);
return 0;
}
-static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
+static void netdev_name_node_alt_free(struct rcu_head *head)
{
- list_del(&name_node->list);
+ struct netdev_name_node *name_node =
+ container_of(head, struct netdev_name_node, rcu);
+
kfree(name_node->name);
netdev_name_node_free(name_node);
}
+static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
+{
+ netdev_name_node_del(name_node);
+ list_del(&name_node->list);
+ call_rcu(&name_node->rcu, netdev_name_node_alt_free);
+}
+
int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
{
struct netdev_name_node *name_node;
@@ -363,10 +352,7 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
if (name_node == dev->name_node || name_node->dev != dev)
return -EINVAL;
- netdev_name_node_del(name_node);
- synchronize_rcu();
__netdev_name_node_alt_destroy(name_node);
-
return 0;
}
@@ -374,8 +360,10 @@ static void netdev_name_node_alt_flush(struct net_device *dev)
{
struct netdev_name_node *name_node, *tmp;
- list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
- __netdev_name_node_alt_destroy(name_node);
+ list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
+ list_del(&name_node->list);
+ netdev_name_node_alt_free(&name_node->rcu);
+ }
}
/* Device list insertion */
@@ -386,12 +374,10 @@ static void list_netdevice(struct net_device *dev)
ASSERT_RTNL();
- write_lock(&dev_base_lock);
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
netdev_name_node_add(net, dev->name_node);
hlist_add_head_rcu(&dev->index_hlist,
dev_index_hash(net, dev->ifindex));
- write_unlock(&dev_base_lock);
netdev_for_each_altname(dev, name_node)
netdev_name_node_add(net, name_node);
@@ -405,7 +391,7 @@ static void list_netdevice(struct net_device *dev)
/* Device list removal
* caller must respect a RCU grace period before freeing/reusing dev
*/
-static void unlist_netdevice(struct net_device *dev, bool lock)
+static void unlist_netdevice(struct net_device *dev)
{
struct netdev_name_node *name_node;
struct net *net = dev_net(dev);
@@ -418,13 +404,9 @@ static void unlist_netdevice(struct net_device *dev, bool lock)
netdev_name_node_del(name_node);
/* Unlink dev from the device chain */
- if (lock)
- write_lock(&dev_base_lock);
list_del_rcu(&dev->dev_list);
netdev_name_node_del(dev->name_node);
hlist_del_rcu(&dev->index_hlist);
- if (lock)
- write_unlock(&dev_base_lock);
dev_base_seq_inc(dev_net(dev));
}
@@ -443,6 +425,12 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
+/* Page_pool has a lockless array/stack to alloc/recycle pages.
+ * PP consumers must pay attention to run APIs in the appropriate context
+ * (e.g. NAPI context).
+ */
+static DEFINE_PER_CPU(struct page_pool *, system_page_pool);
+
#ifdef CONFIG_LOCKDEP
/*
* register_netdevice() inits txq->_xmit_lock and sets lockdep class
@@ -552,7 +540,7 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
if (pt->type == htons(ETH_P_ALL))
- return pt->dev ? &pt->dev->ptype_all : &ptype_all;
+ return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
else
return pt->dev ? &pt->dev->ptype_specific :
&ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
@@ -654,7 +642,7 @@ int dev_get_iflink(const struct net_device *dev)
if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
return dev->netdev_ops->ndo_get_iflink(dev);
- return dev->ifindex;
+ return READ_ONCE(dev->ifindex);
}
EXPORT_SYMBOL(dev_get_iflink);
@@ -739,9 +727,9 @@ EXPORT_SYMBOL_GPL(dev_fill_forward_path);
* @net: the applicable net namespace
* @name: name to find
*
- * Find an interface by name. Must be called under RTNL semaphore
- * or @dev_base_lock. If the name is found a pointer to the device
- * is returned. If the name is not found then %NULL is returned. The
+ * Find an interface by name. Must be called under RTNL semaphore.
+ * If the name is found a pointer to the device is returned.
+ * If the name is not found then %NULL is returned. The
* reference counters are not incremented so the caller must be
* careful with locks.
*/
@@ -822,8 +810,7 @@ EXPORT_SYMBOL(netdev_get_by_name);
* Search for an interface by index. Returns %NULL if the device
* is not found or a pointer to the device. The device has not
* had its reference counter increased so the caller must be careful
- * about locking. The caller must hold either the RTNL semaphore
- * or @dev_base_lock.
+ * about locking. The caller must hold the RTNL semaphore.
*/
struct net_device *__dev_get_by_index(struct net *net, int ifindex)
@@ -1057,7 +1044,7 @@ EXPORT_SYMBOL(dev_valid_name);
* __dev_alloc_name - allocate a name for a device
* @net: network namespace to allocate the device name in
* @name: name format string
- * @buf: scratch buffer and result name string
+ * @res: result name string
*
* Passed a format string - eg "lt%d" it will try and find a suitable
* id. It scans list of devices to build up a free map, then chooses
@@ -1068,106 +1055,81 @@ EXPORT_SYMBOL(dev_valid_name);
* Returns the number of the unit assigned or a negative errno code.
*/
-static int __dev_alloc_name(struct net *net, const char *name, char *buf)
+static int __dev_alloc_name(struct net *net, const char *name, char *res)
{
int i = 0;
const char *p;
const int max_netdevices = 8*PAGE_SIZE;
unsigned long *inuse;
struct net_device *d;
+ char buf[IFNAMSIZ];
- if (!dev_valid_name(name))
- return -EINVAL;
-
+ /* Verify the string as this thing may have come from the user.
+ * There must be one "%d" and no other "%" characters.
+ */
p = strchr(name, '%');
- if (p) {
- /*
- * Verify the string as this thing may have come from
- * the user. There must be either one "%d" and no other "%"
- * characters.
- */
- if (p[1] != 'd' || strchr(p + 2, '%'))
- return -EINVAL;
-
- /* Use one page as a bit array of possible slots */
- inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
- if (!inuse)
- return -ENOMEM;
+ if (!p || p[1] != 'd' || strchr(p + 2, '%'))
+ return -EINVAL;
- for_each_netdev(net, d) {
- struct netdev_name_node *name_node;
+ /* Use one page as a bit array of possible slots */
+ inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
+ if (!inuse)
+ return -ENOMEM;
- netdev_for_each_altname(d, name_node) {
- if (!sscanf(name_node->name, name, &i))
- continue;
- if (i < 0 || i >= max_netdevices)
- continue;
+ for_each_netdev(net, d) {
+ struct netdev_name_node *name_node;
- /* avoid cases where sscanf is not exact inverse of printf */
- snprintf(buf, IFNAMSIZ, name, i);
- if (!strncmp(buf, name_node->name, IFNAMSIZ))
- __set_bit(i, inuse);
- }
- if (!sscanf(d->name, name, &i))
+ netdev_for_each_altname(d, name_node) {
+ if (!sscanf(name_node->name, name, &i))
continue;
if (i < 0 || i >= max_netdevices)
continue;
- /* avoid cases where sscanf is not exact inverse of printf */
+ /* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i);
- if (!strncmp(buf, d->name, IFNAMSIZ))
+ if (!strncmp(buf, name_node->name, IFNAMSIZ))
__set_bit(i, inuse);
}
+ if (!sscanf(d->name, name, &i))
+ continue;
+ if (i < 0 || i >= max_netdevices)
+ continue;
- i = find_first_zero_bit(inuse, max_netdevices);
- bitmap_free(inuse);
+ /* avoid cases where sscanf is not exact inverse of printf */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!strncmp(buf, d->name, IFNAMSIZ))
+ __set_bit(i, inuse);
}
- snprintf(buf, IFNAMSIZ, name, i);
- if (!netdev_name_in_use(net, buf))
- return i;
+ i = find_first_zero_bit(inuse, max_netdevices);
+ bitmap_free(inuse);
+ if (i == max_netdevices)
+ return -ENFILE;
- /* It is possible to run out of possible slots
- * when the name is long and there isn't enough space left
- * for the digits, or if all bits are used.
- */
- return -ENFILE;
+ /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
+ strscpy(buf, name, IFNAMSIZ);
+ snprintf(res, IFNAMSIZ, buf, i);
+ return i;
}
+/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
static int dev_prep_valid_name(struct net *net, struct net_device *dev,
- const char *want_name, char *out_name)
+ const char *want_name, char *out_name,
+ int dup_errno)
{
- int ret;
-
if (!dev_valid_name(want_name))
return -EINVAL;
- if (strchr(want_name, '%')) {
- ret = __dev_alloc_name(net, want_name, out_name);
- return ret < 0 ? ret : 0;
- } else if (netdev_name_in_use(net, want_name)) {
- return -EEXIST;
- } else if (out_name != want_name) {
- strscpy(out_name, want_name, IFNAMSIZ);
- }
+ if (strchr(want_name, '%'))
+ return __dev_alloc_name(net, want_name, out_name);
+ if (netdev_name_in_use(net, want_name))
+ return -dup_errno;
+ if (out_name != want_name)
+ strscpy(out_name, want_name, IFNAMSIZ);
return 0;
}
-static int dev_alloc_name_ns(struct net *net,
- struct net_device *dev,
- const char *name)
-{
- char buf[IFNAMSIZ];
- int ret;
-
- BUG_ON(!net);
- ret = __dev_alloc_name(net, name, buf);
- if (ret >= 0)
- strscpy(dev->name, buf, IFNAMSIZ);
- return ret;
-}
-
/**
* dev_alloc_name - allocate a name for a device
* @dev: device
@@ -1184,20 +1146,17 @@ static int dev_alloc_name_ns(struct net *net,
int dev_alloc_name(struct net_device *dev, const char *name)
{
- return dev_alloc_name_ns(dev_net(dev), dev, name);
+ return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
}
EXPORT_SYMBOL(dev_alloc_name);
static int dev_get_valid_name(struct net *net, struct net_device *dev,
const char *name)
{
- char buf[IFNAMSIZ];
int ret;
- ret = dev_prep_valid_name(net, dev, name, buf);
- if (ret >= 0)
- strscpy(dev->name, buf, IFNAMSIZ);
- return ret;
+ ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
+ return ret < 0 ? ret : 0;
}
/**
@@ -1241,13 +1200,13 @@ int dev_change_name(struct net_device *dev, const char *newname)
dev->flags & IFF_UP ? " (while UP)" : "");
old_assign_type = dev->name_assign_type;
- dev->name_assign_type = NET_NAME_RENAMED;
+ WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
memcpy(dev->name, oldname, IFNAMSIZ);
- dev->name_assign_type = old_assign_type;
+ WRITE_ONCE(dev->name_assign_type, old_assign_type);
up_write(&devnet_rename_sem);
return ret;
}
@@ -1256,15 +1215,11 @@ rollback:
netdev_adjacent_rename_links(dev, oldname);
- write_lock(&dev_base_lock);
netdev_name_node_del(dev->name_node);
- write_unlock(&dev_base_lock);
- synchronize_rcu();
+ synchronize_net();
- write_lock(&dev_base_lock);
netdev_name_node_add(net, dev->name_node);
- write_unlock(&dev_base_lock);
ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
ret = notifier_to_errno(ret);
@@ -1276,7 +1231,7 @@ rollback:
down_write(&devnet_rename_sem);
memcpy(dev->name, oldname, IFNAMSIZ);
memcpy(oldname, newname, IFNAMSIZ);
- dev->name_assign_type = old_assign_type;
+ WRITE_ONCE(dev->name_assign_type, old_assign_type);
old_assign_type = NET_NAME_RENAMED;
goto rollback;
} else {
@@ -2271,7 +2226,8 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
*/
bool dev_nit_active(struct net_device *dev)
{
- return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
+ return !list_empty(&net_hotdata.ptype_all) ||
+ !list_empty(&dev->ptype_all);
}
EXPORT_SYMBOL_GPL(dev_nit_active);
@@ -2282,15 +2238,14 @@ EXPORT_SYMBOL_GPL(dev_nit_active);
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
- struct packet_type *ptype;
+ struct list_head *ptype_list = &net_hotdata.ptype_all;
+ struct packet_type *ptype, *pt_prev = NULL;
struct sk_buff *skb2 = NULL;
- struct packet_type *pt_prev = NULL;
- struct list_head *ptype_list = &ptype_all;
rcu_read_lock();
again:
list_for_each_entry_rcu(ptype, ptype_list, list) {
- if (ptype->ignore_outgoing)
+ if (READ_ONCE(ptype->ignore_outgoing))
continue;
/* Never send packets back to the socket
@@ -2331,7 +2286,7 @@ again:
pt_prev = ptype;
}
- if (ptype_list == &ptype_all) {
+ if (ptype_list == &net_hotdata.ptype_all) {
ptype_list = &dev->ptype_all;
goto again;
}
@@ -3500,6 +3455,9 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
if (gso_segs > READ_ONCE(dev->gso_max_segs))
return features & ~NETIF_F_GSO_MASK;
+ if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
+ return features & ~NETIF_F_GSO_MASK;
+
if (!skb_shinfo(skb)->gso_type) {
skb_warn_bad_offload(skb);
return features & ~NETIF_F_GSO_MASK;
@@ -3782,6 +3740,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_calculate_pkt_len(skb, q);
+ tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+
if (q->flags & TCQ_F_NOLOCK) {
if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
qdisc_run_begin(q)) {
@@ -3811,10 +3771,14 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
no_lock_out:
if (unlikely(to_free))
kfree_skb_list_reason(to_free,
- SKB_DROP_REASON_QDISC_DROP);
+ tcf_get_drop_reason(to_free));
return rc;
}
+ if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
+ return NET_XMIT_DROP;
+ }
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
@@ -3854,7 +3818,9 @@ no_lock_out:
qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
+ WRITE_ONCE(q->owner, smp_processor_id());
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ WRITE_ONCE(q->owner, -1);
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -3866,7 +3832,8 @@ no_lock_out:
}
spin_unlock(root_lock);
if (unlikely(to_free))
- kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP);
+ kfree_skb_list_reason(to_free,
+ tcf_get_drop_reason(to_free));
if (unlikely(contended))
spin_unlock(&q->busylock);
return rc;
@@ -3939,7 +3906,8 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
#endif /* CONFIG_NET_EGRESS */
#ifdef CONFIG_NET_XGRESS
-static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
+static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
+ enum skb_drop_reason *drop_reason)
{
int ret = TC_ACT_UNSPEC;
#ifdef CONFIG_NET_CLS_ACT
@@ -3951,12 +3919,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
tc_skb_cb(skb)->mru = 0;
tc_skb_cb(skb)->post_ct = false;
+ tcf_set_drop_reason(skb, *drop_reason);
mini_qdisc_bstats_cpu_update(miniq, skb);
ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
/* Only tcf related quirks below. */
switch (ret) {
case TC_ACT_SHOT:
+ *drop_reason = tcf_get_drop_reason(skb);
mini_qdisc_qstats_cpu_drop(miniq);
break;
case TC_ACT_OK:
@@ -4006,6 +3976,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
struct net_device *orig_dev, bool *another)
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
int sch_ret;
if (!entry)
@@ -4023,7 +3994,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
if (sch_ret != TC_ACT_UNSPEC)
goto ingress_verdict;
}
- sch_ret = tc_run(tcx_entry(entry), skb);
+ sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
ingress_verdict:
switch (sch_ret) {
case TC_ACT_REDIRECT:
@@ -4040,7 +4011,7 @@ ingress_verdict:
*ret = NET_RX_SUCCESS;
return NULL;
case TC_ACT_SHOT:
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+ kfree_skb_reason(skb, drop_reason);
*ret = NET_RX_DROP;
return NULL;
/* used by tc_run */
@@ -4061,6 +4032,7 @@ static __always_inline struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
int sch_ret;
if (!entry)
@@ -4074,7 +4046,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
if (sch_ret != TC_ACT_UNSPEC)
goto egress_verdict;
}
- sch_ret = tc_run(tcx_entry(entry), skb);
+ sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
egress_verdict:
switch (sch_ret) {
case TC_ACT_REDIRECT:
@@ -4083,7 +4055,7 @@ egress_verdict:
*ret = NET_XMIT_SUCCESS;
return NULL;
case TC_ACT_SHOT:
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+ kfree_skb_reason(skb, drop_reason);
*ret = NET_XMIT_DROP;
return NULL;
/* used by tc_run */
@@ -4439,19 +4411,10 @@ EXPORT_SYMBOL(__dev_direct_xmit);
* Receiver routines
*************************************************************************/
-int netdev_max_backlog __read_mostly = 1000;
-EXPORT_SYMBOL(netdev_max_backlog);
-
-int netdev_tstamp_prequeue __read_mostly = 1;
unsigned int sysctl_skb_defer_max __read_mostly = 64;
-int netdev_budget __read_mostly = 300;
-/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
-unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
int weight_p __read_mostly = 64; /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
-int dev_rx_weight __read_mostly = 64;
-int dev_tx_weight __read_mostly = 64;
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4493,12 +4456,6 @@ static inline void ____napi_schedule(struct softnet_data *sd,
#ifdef CONFIG_RPS
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-u32 rps_cpu_mask __read_mostly;
-EXPORT_SYMBOL(rps_cpu_mask);
-
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
@@ -4590,7 +4547,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (!hash)
goto done;
- sock_flow_table = rcu_dereference(rps_sock_flow_table);
+ sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
if (flow_table && sock_flow_table) {
struct rps_dev_flow *rflow;
u32 next_cpu;
@@ -4600,10 +4557,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
* This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
*/
ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
- if ((ident ^ hash) & ~rps_cpu_mask)
+ if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
goto try_rps;
- next_cpu = ident & rps_cpu_mask;
+ next_cpu = ident & net_hotdata.rps_cpu_mask;
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
@@ -4752,7 +4709,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
struct softnet_data *sd;
unsigned int old_flow, new_flow;
- if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
+ if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
return false;
sd = this_cpu_ptr(&softnet_data);
@@ -4800,7 +4757,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
if (!netif_running(skb->dev))
goto drop;
qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
+ if (qlen <= READ_ONCE(net_hotdata.max_backlog) &&
+ !skb_flow_limit(skb, qlen)) {
if (qlen) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -4876,6 +4834,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
skb_headlen(skb) + mac_len, true);
+ if (skb_is_nonlinear(skb)) {
+ skb_shinfo(skb)->xdp_frags_size = skb->data_len;
+ xdp_buff_set_frags_flag(xdp);
+ } else {
+ xdp_buff_clear_frags_flag(xdp);
+ }
orig_data_end = xdp->data_end;
orig_data = xdp->data;
@@ -4905,6 +4869,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
skb->len += off; /* positive on grow, negative on shrink */
}
+ /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
+ * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
+ */
+ if (xdp_buff_has_frags(xdp))
+ skb->data_len = skb_shinfo(skb)->xdp_frags_size;
+ else
+ skb->data_len = 0;
+
/* check if XDP changed eth hdr such SKB needs update */
eth = (struct ethhdr *)xdp->data;
if ((orig_eth_type != eth->h_proto) ||
@@ -4938,11 +4910,35 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
return act;
}
-static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+static int
+netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
+{
+ struct sk_buff *skb = *pskb;
+ int err, hroom, troom;
+
+ if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
+ return 0;
+
+ /* In case we have to go down the path and also linearize,
+ * then lets do the pskb_expand_head() work just once here.
+ */
+ hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+ troom = skb->tail + skb->data_len - skb->end;
+ err = pskb_expand_head(skb,
+ hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+ troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
+ if (err)
+ return err;
+
+ return skb_linearize(skb);
+}
+
+static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
- u32 act = XDP_DROP;
+ struct sk_buff *skb = *pskb;
+ u32 mac_len, act = XDP_DROP;
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
@@ -4950,41 +4946,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
if (skb_is_redirected(skb))
return XDP_PASS;
- /* XDP packets must be linear and must have sufficient headroom
- * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
- * native XDP provides, thus we need to do it here as well.
+ /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
+ * bytes. This is the guarantee that also native XDP provides,
+ * thus we need to do it here as well.
*/
+ mac_len = skb->data - skb_mac_header(skb);
+ __skb_push(skb, mac_len);
+
if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
- int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
- int troom = skb->tail + skb->data_len - skb->end;
-
- /* In case we have to go down the path and also linearize,
- * then lets do the pskb_expand_head() work just once here.
- */
- if (pskb_expand_head(skb,
- hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
- troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
- goto do_drop;
- if (skb_linearize(skb))
+ if (netif_skb_check_for_xdp(pskb, xdp_prog))
goto do_drop;
}
- act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
+ __skb_pull(*pskb, mac_len);
+
+ act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
case XDP_PASS:
break;
default:
- bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
+ bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
fallthrough;
case XDP_ABORTED:
- trace_xdp_exception(skb->dev, xdp_prog, act);
+ trace_xdp_exception((*pskb)->dev, xdp_prog, act);
fallthrough;
case XDP_DROP:
do_drop:
- kfree_skb(skb);
+ kfree_skb(*pskb);
break;
}
@@ -5022,24 +5013,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
{
if (xdp_prog) {
struct xdp_buff xdp;
u32 act;
int err;
- act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
+ act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
if (act != XDP_PASS) {
switch (act) {
case XDP_REDIRECT:
- err = xdp_do_generic_redirect(skb->dev, skb,
+ err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
&xdp, xdp_prog);
if (err)
goto out_redir;
break;
case XDP_TX:
- generic_xdp_tx(skb, xdp_prog);
+ generic_xdp_tx(*pskb, xdp_prog);
break;
}
return XDP_DROP;
@@ -5047,7 +5038,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
}
return XDP_PASS;
out_redir:
- kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
+ kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
return XDP_DROP;
}
EXPORT_SYMBOL_GPL(do_xdp_generic);
@@ -5056,7 +5047,7 @@ static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
- net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
+ net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
trace_netif_rx(skb);
@@ -5348,7 +5339,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
int ret = NET_RX_DROP;
__be16 type;
- net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
+ net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
trace_netif_receive_skb(skb);
@@ -5370,7 +5361,8 @@ another_round:
int ret2;
migrate_disable();
- ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
+ ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+ &skb);
migrate_enable();
if (ret2 != XDP_PASS) {
@@ -5391,7 +5383,7 @@ another_round:
if (pfmemalloc)
goto skip_taps;
- list_for_each_entry_rcu(ptype, &ptype_all, list) {
+ list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
@@ -5731,7 +5723,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
{
int ret;
- net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
+ net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
@@ -5761,7 +5753,8 @@ void netif_receive_skb_list_internal(struct list_head *head)
INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
- net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
+ net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
+ skb);
skb_list_del_init(skb);
if (!skb_defer_rx_timestamp(skb))
list_add_tail(&skb->list, &sublist);
@@ -5985,7 +5978,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
net_rps_action_and_irq_enable(sd);
}
- napi->weight = READ_ONCE(dev_rx_weight);
+ napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
while (again) {
struct sk_buff *skb;
@@ -6162,7 +6155,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
EXPORT_SYMBOL(napi_complete_done);
/* must be called under rcu_read_lock(), as we dont take a reference */
-static struct napi_struct *napi_by_id(unsigned int napi_id)
+struct napi_struct *napi_by_id(unsigned int napi_id)
{
unsigned int hash = napi_id % HASH_SIZE(napi_hash);
struct napi_struct *napi;
@@ -6174,6 +6167,27 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
return NULL;
}
+static void skb_defer_free_flush(struct softnet_data *sd)
+{
+ struct sk_buff *skb, *next;
+
+ /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
+ if (!READ_ONCE(sd->defer_list))
+ return;
+
+ spin_lock(&sd->defer_lock);
+ skb = sd->defer_list;
+ sd->defer_list = NULL;
+ sd->defer_count = 0;
+ spin_unlock(&sd->defer_lock);
+
+ while (skb != NULL) {
+ next = skb->next;
+ napi_consume_skb(skb, 1);
+ skb = next;
+ }
+}
+
#if defined(CONFIG_NET_RX_BUSY_POLL)
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
@@ -6195,8 +6209,13 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
clear_bit(NAPI_STATE_SCHED, &napi->state);
}
-static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
- u16 budget)
+enum {
+ NAPI_F_PREFER_BUSY_POLL = 1,
+ NAPI_F_END_ON_RESCHED = 2,
+};
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
+ unsigned flags, u16 budget)
{
bool skip_schedule = false;
unsigned long timeout;
@@ -6216,7 +6235,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool
local_bh_disable();
- if (prefer_busy_poll) {
+ if (flags & NAPI_F_PREFER_BUSY_POLL) {
napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
timeout = READ_ONCE(napi->dev->gro_flush_timeout);
if (napi->defer_hard_irqs_count && timeout) {
@@ -6240,23 +6259,23 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool
local_bh_enable();
}
-void napi_busy_loop(unsigned int napi_id,
- bool (*loop_end)(void *, unsigned long),
- void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+static void __napi_busy_loop(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, unsigned flags, u16 budget)
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
void *have_poll_lock = NULL;
struct napi_struct *napi;
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
restart:
napi_poll = NULL;
- rcu_read_lock();
-
napi = napi_by_id(napi_id);
if (!napi)
- goto out;
+ return;
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
@@ -6272,14 +6291,14 @@ restart:
*/
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
NAPIF_STATE_IN_BUSY_POLL)) {
- if (prefer_busy_poll)
+ if (flags & NAPI_F_PREFER_BUSY_POLL)
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
}
if (cmpxchg(&napi->state, val,
val | NAPIF_STATE_IN_BUSY_POLL |
NAPIF_STATE_SCHED) != val) {
- if (prefer_busy_poll)
+ if (flags & NAPI_F_PREFER_BUSY_POLL)
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
}
@@ -6293,18 +6312,22 @@ count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
LINUX_MIB_BUSYPOLLRXPACKETS, work);
+ skb_defer_free_flush(this_cpu_ptr(&softnet_data));
local_bh_enable();
if (!loop_end || loop_end(loop_end_arg, start_time))
break;
if (unlikely(need_resched())) {
+ if (flags & NAPI_F_END_ON_RESCHED)
+ break;
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
+ busy_poll_stop(napi, have_poll_lock, flags, budget);
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
rcu_read_unlock();
cond_resched();
+ rcu_read_lock();
if (loop_end(loop_end_arg, start_time))
return;
goto restart;
@@ -6312,10 +6335,31 @@ count:
cpu_relax();
}
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
+ busy_poll_stop(napi, have_poll_lock, flags, budget);
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
-out:
+}
+
+void napi_busy_loop_rcu(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+{
+ unsigned flags = NAPI_F_END_ON_RESCHED;
+
+ if (prefer_busy_poll)
+ flags |= NAPI_F_PREFER_BUSY_POLL;
+
+ __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
+}
+
+void napi_busy_loop(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+{
+ unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
+
+ rcu_read_lock();
+ __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
rcu_read_unlock();
}
EXPORT_SYMBOL(napi_busy_loop);
@@ -6423,6 +6467,43 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
}
EXPORT_SYMBOL(dev_set_threaded);
+/**
+ * netif_queue_set_napi - Associate queue with the napi
+ * @dev: device to which NAPI and queue belong
+ * @queue_index: Index of queue
+ * @type: queue type as RX or TX
+ * @napi: NAPI context, pass NULL to clear previously set NAPI
+ *
+ * Set queue with its corresponding napi context. This should be done after
+ * registering the NAPI handler for the queue-vector and the queues have been
+ * mapped to the corresponding interrupt vector.
+ */
+void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
+ enum netdev_queue_type type, struct napi_struct *napi)
+{
+ struct netdev_rx_queue *rxq;
+ struct netdev_queue *txq;
+
+ if (WARN_ON_ONCE(napi && !napi->dev))
+ return;
+ if (dev->reg_state >= NETREG_REGISTERED)
+ ASSERT_RTNL();
+
+ switch (type) {
+ case NETDEV_QUEUE_TYPE_RX:
+ rxq = __netif_get_rx_queue(dev, queue_index);
+ rxq->napi = napi;
+ return;
+ case NETDEV_QUEUE_TYPE_TX:
+ txq = netdev_get_tx_queue(dev, queue_index);
+ txq->napi = napi;
+ return;
+ default:
+ return;
+ }
+}
+EXPORT_SYMBOL(netif_queue_set_napi);
+
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -6458,6 +6539,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
*/
if (dev->threaded && napi_kthread_create(napi))
dev->threaded = 0;
+ netif_napi_set_irq(napi, -1);
}
EXPORT_SYMBOL(netif_napi_add_weight);
@@ -6552,9 +6634,11 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ if (napi_is_scheduled(n)) {
work = n->poll(n, weight);
trace_napi_poll(n, work, weight);
+
+ xdp_do_check_flushed(n);
}
if (unlikely(work > weight))
@@ -6658,27 +6742,6 @@ static int napi_thread_wait(struct napi_struct *napi)
return -1;
}
-static void skb_defer_free_flush(struct softnet_data *sd)
-{
- struct sk_buff *skb, *next;
-
- /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
- if (!READ_ONCE(sd->defer_list))
- return;
-
- spin_lock(&sd->defer_lock);
- skb = sd->defer_list;
- sd->defer_list = NULL;
- sd->defer_count = 0;
- spin_unlock(&sd->defer_lock);
-
- while (skb != NULL) {
- next = skb->next;
- napi_consume_skb(skb, 1);
- skb = next;
- }
-}
-
static int napi_threaded_poll(void *data)
{
struct napi_struct *napi = data;
@@ -6686,6 +6749,8 @@ static int napi_threaded_poll(void *data)
void *have;
while (!napi_thread_wait(napi)) {
+ unsigned long last_qs = jiffies;
+
for (;;) {
bool repoll = false;
@@ -6710,6 +6775,7 @@ static int napi_threaded_poll(void *data)
if (!repoll)
break;
+ rcu_softirq_qs_periodic(last_qs);
cond_resched();
}
}
@@ -6720,8 +6786,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
- usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
- int budget = READ_ONCE(netdev_budget);
+ usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
+ int budget = READ_ONCE(net_hotdata.netdev_budget);
LIST_HEAD(list);
LIST_HEAD(repoll);
@@ -8564,12 +8630,12 @@ unsigned int dev_get_flags(const struct net_device *dev)
{
unsigned int flags;
- flags = (dev->flags & ~(IFF_PROMISC |
+ flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
IFF_ALLMULTI |
IFF_RUNNING |
IFF_LOWER_UP |
IFF_DORMANT)) |
- (dev->gflags & (IFF_PROMISC |
+ (READ_ONCE(dev->gflags) & (IFF_PROMISC |
IFF_ALLMULTI));
if (netif_running(dev)) {
@@ -8892,7 +8958,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
}
EXPORT_SYMBOL(dev_set_mac_address);
-static DECLARE_RWSEM(dev_addr_sem);
+DECLARE_RWSEM(dev_addr_sem);
int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
struct netlink_ext_ack *extack)
@@ -9646,11 +9712,11 @@ static void dev_index_release(struct net *net, int ifindex)
/* Delayed registration/unregisteration */
LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+atomic_t dev_unreg_count = ATOMIC_INIT(0);
static void net_set_todo(struct net_device *dev)
{
list_add_tail(&dev->todo_list, &net_todo_list);
- atomic_inc(&dev_net(dev)->dev_unreg_count);
}
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -10050,6 +10116,54 @@ void netif_tx_stop_all_queues(struct net_device *dev)
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);
+static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
+{
+ void __percpu *v;
+
+ /* Drivers implementing ndo_get_peer_dev must support tstat
+ * accounting, so that skb_do_redirect() can bump the dev's
+ * RX stats upon network namespace switch.
+ */
+ if (dev->netdev_ops->ndo_get_peer_dev &&
+ dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
+ return -EOPNOTSUPP;
+
+ switch (dev->pcpu_stat_type) {
+ case NETDEV_PCPU_STAT_NONE:
+ return 0;
+ case NETDEV_PCPU_STAT_LSTATS:
+ v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
+ break;
+ case NETDEV_PCPU_STAT_TSTATS:
+ v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ break;
+ case NETDEV_PCPU_STAT_DSTATS:
+ v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return v ? 0 : -ENOMEM;
+}
+
+static void netdev_do_free_pcpu_stats(struct net_device *dev)
+{
+ switch (dev->pcpu_stat_type) {
+ case NETDEV_PCPU_STAT_NONE:
+ return;
+ case NETDEV_PCPU_STAT_LSTATS:
+ free_percpu(dev->lstats);
+ break;
+ case NETDEV_PCPU_STAT_TSTATS:
+ free_percpu(dev->tstats);
+ break;
+ case NETDEV_PCPU_STAT_DSTATS:
+ free_percpu(dev->dstats);
+ break;
+ }
+}
+
/**
* register_netdevice() - register a network device
* @dev: device to register
@@ -10110,9 +10224,13 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
}
+ ret = netdev_do_alloc_pcpu_stats(dev);
+ if (ret)
+ goto err_uninit;
+
ret = dev_index_reserve(net, dev->ifindex);
if (ret < 0)
- goto err_uninit;
+ goto err_free_pcpu;
dev->ifindex = ret;
/* Transfer changeable features to wanted_features and enable
@@ -10163,9 +10281,9 @@ int register_netdevice(struct net_device *dev)
goto err_ifindex_release;
ret = netdev_register_kobject(dev);
- write_lock(&dev_base_lock);
- dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
- write_unlock(&dev_base_lock);
+
+ WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
+
if (ret)
goto err_uninit_notify;
@@ -10218,6 +10336,8 @@ err_uninit_notify:
call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
err_ifindex_release:
dev_index_release(net, dev->ifindex);
+err_free_pcpu:
+ netdev_do_free_pcpu_stats(dev);
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
@@ -10239,7 +10359,7 @@ EXPORT_SYMBOL(register_netdevice);
* that need to tie several hardware interfaces to a single NAPI
* poll scheduler due to HW limitations.
*/
-int init_dummy_netdev(struct net_device *dev)
+void init_dummy_netdev(struct net_device *dev)
{
/* Clear everything. Note we don't initialize spinlocks
* are they aren't supposed to be taken by any of the
@@ -10267,8 +10387,6 @@ int init_dummy_netdev(struct net_device *dev)
* because users of this 'device' dont need to change
* its refcount.
*/
-
- return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -10423,6 +10541,7 @@ void netdev_run_todo(void)
{
struct net_device *dev, *tmp;
struct list_head list;
+ int cnt;
#ifdef CONFIG_LOCKDEP
struct list_head unlink_list;
@@ -10453,12 +10572,11 @@ void netdev_run_todo(void)
continue;
}
- write_lock(&dev_base_lock);
- dev->reg_state = NETREG_UNREGISTERED;
- write_unlock(&dev_base_lock);
- linkwatch_forget_dev(dev);
+ WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
+ linkwatch_sync_dev(dev);
}
+ cnt = 0;
while (!list_empty(&list)) {
dev = netdev_wait_allrefs_any(&list);
list_del(&dev->todo_list);
@@ -10470,17 +10588,19 @@ void netdev_run_todo(void)
WARN_ON(rcu_access_pointer(dev->ip_ptr));
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
+ netdev_do_free_pcpu_stats(dev);
if (dev->priv_destructor)
dev->priv_destructor(dev);
if (dev->needs_free_netdev)
free_netdev(dev);
- if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
- wake_up(&netdev_unregistering_wq);
+ cnt++;
/* Free network device */
kobject_put(&dev->dev.kobj);
}
+ if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
+ wake_up(&netdev_unregistering_wq);
}
/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
@@ -10504,7 +10624,8 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
-struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
+static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
+ struct net_device *dev)
{
struct net_device_core_stats __percpu *p;
@@ -10517,7 +10638,23 @@ struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device
/* This READ_ONCE() pairs with the cmpxchg() above */
return READ_ONCE(dev->core_stats);
}
-EXPORT_SYMBOL(netdev_core_stats_alloc);
+
+noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
+{
+ /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
+ unsigned long __percpu *field;
+
+ if (unlikely(!p)) {
+ p = netdev_core_stats_alloc(dev);
+ if (!p)
+ return;
+ }
+
+ field = (__force unsigned long __percpu *)((__force void *)p + offset);
+ this_cpu_inc(*field);
+}
+EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
/**
* dev_get_stats - get network device statistics
@@ -10540,6 +10677,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
ops->ndo_get_stats64(dev, storage);
} else if (ops->ndo_get_stats) {
netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
+ } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
+ dev_get_tstats64(dev, storage);
} else {
netdev_stats_to_stats64(storage, &dev->stats);
}
@@ -10854,7 +10993,7 @@ void free_netdev(struct net_device *dev)
}
BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
- dev->reg_state = NETREG_RELEASED;
+ WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
/* will free via device release */
put_device(&dev->dev);
@@ -10910,6 +11049,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
{
struct net_device *dev, *tmp;
LIST_HEAD(close_head);
+ int cnt = 0;
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
@@ -10941,10 +11081,8 @@ void unregister_netdevice_many_notify(struct list_head *head,
list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
- write_lock(&dev_base_lock);
- unlist_netdevice(dev, false);
- dev->reg_state = NETREG_UNREGISTERING;
- write_unlock(&dev_base_lock);
+ unlist_netdevice(dev);
+ WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
}
flush_all_backlogs();
@@ -11006,7 +11144,9 @@ void unregister_netdevice_many_notify(struct list_head *head,
list_for_each_entry(dev, head, unreg_list) {
netdev_put(dev, &dev->dev_registered_tracker);
net_set_todo(dev);
+ cnt++;
}
+ atomic_add(cnt, &dev_unreg_count);
list_del(head);
}
@@ -11091,7 +11231,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
/* We get here if we can't use the current device name */
if (!pat)
goto out;
- err = dev_prep_valid_name(net, dev, pat, new_name);
+ err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
if (err < 0)
goto out;
}
@@ -11124,7 +11264,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
dev_close(dev);
/* And unlink it from device chain */
- unlist_netdevice(dev, true);
+ unlist_netdevice(dev);
synchronize_net();
@@ -11163,17 +11303,19 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
dev_net_set(dev, net);
dev->ifindex = new_ifindex;
- /* Send a netdev-add uevent to the new namespace */
- kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
- netdev_adjacent_add_links(dev);
-
if (new_name[0]) /* Rename the netdev to prepared name */
strscpy(dev->name, new_name, IFNAMSIZ);
/* Fixup kobjects */
+ dev_set_uevent_suppress(&dev->dev, 1);
err = device_rename(&dev->dev, dev->name);
+ dev_set_uevent_suppress(&dev->dev, 0);
WARN_ON(err);
+ /* Send a netdev-add uevent to the new namespace */
+ kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
+ netdev_adjacent_add_links(dev);
+
/* Adapt owner in case owning user namespace of target network
* namespace is different from the original one.
*/
@@ -11433,6 +11575,7 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
static void __net_exit default_device_exit_net(struct net *net)
{
+ struct netdev_name_node *name_node, *tmp;
struct net_device *dev, *aux;
/*
* Push all migratable network devices back to the
@@ -11455,6 +11598,11 @@ static void __net_exit default_device_exit_net(struct net *net)
snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
if (netdev_name_in_use(&init_net, fb_name))
snprintf(fb_name, IFNAMSIZ, "dev%%d");
+
+ netdev_for_each_altname_safe(dev, name_node, tmp)
+ if (netdev_name_in_use(&init_net, name_node->name))
+ __netdev_name_node_alt_destroy(name_node);
+
err = dev_change_net_namespace(dev, &init_net, fb_name);
if (err) {
pr_emerg("%s: failed to move %s to init_net: %d\n",
@@ -11497,6 +11645,63 @@ static struct pernet_operations __net_initdata default_device_ops = {
.exit_batch = default_device_exit_batch,
};
+static void __init net_dev_struct_check(void)
+{
+ /* TX read-mostly hotpath */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
+#ifdef CONFIG_XPS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
+#endif
+#ifdef CONFIG_NET_XGRESS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
+#endif
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
+
+ /* TXRX read-mostly hotpath */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
+
+ /* RX read-mostly hotpath */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
+#ifdef CONFIG_NETPOLL
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
+#endif
+#ifdef CONFIG_NET_XGRESS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
+#endif
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
+}
+
/*
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
@@ -11504,6 +11709,28 @@ static struct pernet_operations __net_initdata default_device_ops = {
*
*/
+/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
+#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE)
+
+static int net_page_pool_create(int cpuid)
+{
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+ struct page_pool_params page_pool_params = {
+ .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
+ .flags = PP_FLAG_SYSTEM_POOL,
+ .nid = NUMA_NO_NODE,
+ };
+ struct page_pool *pp_ptr;
+
+ pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
+ if (IS_ERR(pp_ptr))
+ return -ENOMEM;
+
+ per_cpu(system_page_pool, cpuid) = pp_ptr;
+#endif
+ return 0;
+}
+
/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
@@ -11514,13 +11741,14 @@ static int __init net_dev_init(void)
BUG_ON(!dev_boot_phase);
+ net_dev_struct_check();
+
if (dev_proc_init())
goto out;
if (netdev_kobject_init())
goto out;
- INIT_LIST_HEAD(&ptype_all);
for (i = 0; i < PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&ptype_base[i]);
@@ -11554,6 +11782,9 @@ static int __init net_dev_init(void)
init_gro_hash(&sd->backlog);
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
+
+ if (net_page_pool_create(i))
+ goto out;
}
dev_boot_phase = 0;
@@ -11581,6 +11812,19 @@ static int __init net_dev_init(void)
WARN_ON(rc < 0);
rc = 0;
out:
+ if (rc < 0) {
+ for_each_possible_cpu(i) {
+ struct page_pool *pp_ptr;
+
+ pp_ptr = per_cpu(system_page_pool, i);
+ if (!pp_ptr)
+ continue;
+
+ page_pool_destroy(pp_ptr);
+ per_cpu(system_page_pool, i) = NULL;
+ }
+ }
+
return rc;
}