Diffstat (limited to 'net/core/dev.c')
 net/core/dev.c | 139 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 97 insertions(+), 42 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 4bfdcd6b20e8..bde98cfd166f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1069,19 +1069,6 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
-struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
-{
- struct net_device *dev;
-
- ASSERT_RTNL();
- for_each_netdev(net, dev)
- if (dev->type == type)
- return dev;
-
- return NULL;
-}
-EXPORT_SYMBOL(__dev_getfirstbyhwtype);
-
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
struct net_device *dev, *ret = NULL;
@@ -3495,6 +3482,11 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
if (gso_segs > dev->gso_max_segs)
return features & ~NETIF_F_GSO_MASK;
+ if (!skb_shinfo(skb)->gso_type) {
+ skb_warn_bad_offload(skb);
+ return features & ~NETIF_F_GSO_MASK;
+ }
+
/* Support for GSO partial features requires software
* intervention before we can actually process the packets
* so we need to strip support for any partial features now
@@ -3867,6 +3859,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
return skb;
/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
+ qdisc_skb_cb(skb)->mru = 0;
mini_qdisc_bstats_cpu_update(miniq, skb);
switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
@@ -4180,7 +4173,7 @@ int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
-int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
struct net_device *dev = skb->dev;
struct sk_buff *orig_skb = skb;
@@ -4210,17 +4203,13 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
dev_xmit_recursion_dec();
local_bh_enable();
-
- if (!dev_xmit_complete(ret))
- kfree_skb(skb);
-
return ret;
drop:
atomic_long_inc(&dev->tx_dropped);
kfree_skb_list(skb);
return NET_XMIT_DROP;
}
-EXPORT_SYMBOL(dev_direct_xmit);
+EXPORT_SYMBOL(__dev_direct_xmit);
/*************************************************************************
* Receiver routines
@@ -4954,6 +4943,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
}
qdisc_skb_cb(skb)->pkt_len = skb->len;
+ qdisc_skb_cb(skb)->mru = 0;
skb->tc_at_ingress = 1;
mini_qdisc_bstats_cpu_update(miniq, skb);
@@ -6458,7 +6448,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
- new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+ NAPIF_STATE_PREFER_BUSY_POLL);
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
@@ -6495,10 +6486,30 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
#if defined(CONFIG_NET_RX_BUSY_POLL)
-#define BUSY_POLL_BUDGET 8
+static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
+{
+ if (!skip_schedule) {
+ gro_normal_list(napi);
+ __napi_schedule(napi);
+ return;
+ }
+
+ if (napi->gro_bitmask) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ napi_gro_flush(napi, HZ >= 1000);
+ }
+
+ gro_normal_list(napi);
+ clear_bit(NAPI_STATE_SCHED, &napi->state);
+}
-static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
+ u16 budget)
{
+ bool skip_schedule = false;
+ unsigned long timeout;
int rc;
/* Busy polling means there is a high chance device driver hard irq
@@ -6515,29 +6526,33 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
local_bh_disable();
+ if (prefer_busy_poll) {
+ napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
+ timeout = READ_ONCE(napi->dev->gro_flush_timeout);
+ if (napi->defer_hard_irqs_count && timeout) {
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
+ skip_schedule = true;
+ }
+ }
+
/* All we really want here is to re-enable device interrupts.
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/
- rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ rc = napi->poll(napi, budget);
/* We can't gro_normal_list() here, because napi->poll() might have
* rearmed the napi (napi_complete_done()) in which case it could
* already be running on another CPU.
*/
- trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi, rc, budget);
netpoll_poll_unlock(have_poll_lock);
- if (rc == BUSY_POLL_BUDGET) {
- /* As the whole budget was spent, we still own the napi so can
- * safely handle the rx_list.
- */
- gro_normal_list(napi);
- __napi_schedule(napi);
- }
+ if (rc == budget)
+ __busy_poll_stop(napi, skip_schedule);
local_bh_enable();
}
void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
- void *loop_end_arg)
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
@@ -6565,17 +6580,23 @@ restart:
* we avoid dirtying napi->state as much as we can.
*/
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
- NAPIF_STATE_IN_BUSY_POLL))
+ NAPIF_STATE_IN_BUSY_POLL)) {
+ if (prefer_busy_poll)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
+ }
if (cmpxchg(&napi->state, val,
val | NAPIF_STATE_IN_BUSY_POLL |
- NAPIF_STATE_SCHED) != val)
+ NAPIF_STATE_SCHED) != val) {
+ if (prefer_busy_poll)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
+ }
have_poll_lock = netpoll_poll_lock(napi);
napi_poll = napi->poll;
}
- work = napi_poll(napi, BUSY_POLL_BUDGET);
- trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
+ work = napi_poll(napi, budget);
+ trace_napi_poll(napi, work, budget);
gro_normal_list(napi);
count:
if (work > 0)
@@ -6588,7 +6609,7 @@ count:
if (unlikely(need_resched())) {
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock);
+ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
rcu_read_unlock();
cond_resched();
@@ -6599,7 +6620,7 @@ count:
cpu_relax();
}
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock);
+ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
out:
rcu_read_unlock();
@@ -6650,8 +6671,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
if (!napi_disable_pending(napi) &&
- !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
+ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
__napi_schedule_irqoff(napi);
+ }
return HRTIMER_NORESTART;
}
@@ -6709,6 +6732,7 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
+ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@@ -6781,6 +6805,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
goto out_unlock;
}
+ /* The NAPI context has more processing work, but busy-polling
+ * is preferred. Exit early.
+ */
+ if (napi_prefer_busy_poll(n)) {
+ if (napi_complete_done(n, work)) {
+ /* If timeout is not set, we need to make sure
+ * that the NAPI is re-scheduled.
+ */
+ napi_schedule(n);
+ }
+ goto out_unlock;
+ }
+
if (n->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
@@ -8921,6 +8958,17 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
return dev->xdp_state[mode].prog;
}
+static u8 dev_xdp_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
+ count++;
+ return count;
+}
+
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
struct bpf_prog *prog = dev_xdp_prog(dev, mode);
@@ -9011,6 +9059,7 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
struct bpf_xdp_link *link, struct bpf_prog *new_prog,
struct bpf_prog *old_prog, u32 flags)
{
+ unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
struct bpf_prog *cur_prog;
enum bpf_xdp_mode mode;
bpf_op_t bpf_op;
@@ -9026,11 +9075,17 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
return -EINVAL;
}
- /* just one XDP mode bit should be set, zero defaults to SKB mode */
- if (hweight32(flags & XDP_FLAGS_MODES) > 1) {
+ /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
+ if (num_modes > 1) {
NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
return -EINVAL;
}
+ /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
+ if (!num_modes && dev_xdp_prog_count(dev) > 1) {
+ NL_SET_ERR_MSG(extack,
+ "More than one program loaded, unset mode is ambiguous");
+ return -EINVAL;
+ }
/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
@@ -9763,7 +9818,7 @@ static int netif_alloc_rx_queues(struct net_device *dev)
rx[i].dev = dev;
/* XDP RX-queue setup */
- err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
+ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
if (err < 0)
goto err_rxq_info;
}
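
The exported napi_busy_loop() now takes two extra parameters, prefer_busy_poll and budget, in place of the fixed BUSY_POLL_BUDGET. Below is a minimal, illustrative sketch (not part of this patch) of how a socket-side busy-poll wrapper could forward per-socket settings into the new signature; the sk_prefer_busy_poll and sk_busy_poll_budget fields and the fallback budget of 8 are assumptions made purely for illustration.

/* Illustrative sketch only -- not from this patch.  Shows one way a
 * socket-level wrapper (in the spirit of sk_busy_loop()) might feed the
 * two new napi_busy_loop() arguments.  Assumes <net/busy_poll.h> for
 * MIN_NAPI_ID, sk_busy_loop_end() and the napi_busy_loop() declaration;
 * the sk_prefer_busy_poll and sk_busy_poll_budget fields are assumed
 * here for illustration.
 */
static inline void sk_busy_loop_sketch(struct sock *sk, int nonblock)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	unsigned int napi_id = READ_ONCE(sk->sk_napi_id);

	if (napi_id >= MIN_NAPI_ID)
		napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk,
			       READ_ONCE(sk->sk_prefer_busy_poll),	/* assumed per-socket flag */
			       READ_ONCE(sk->sk_busy_poll_budget) ?: 8);	/* assumed per-socket budget */
#endif
}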