diff options
Diffstat (limited to 'net/core/dev.c')
-rw-r--r-- | net/core/dev.c | 441 |
1 files changed, 276 insertions, 165 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 6c5967e80132..222b1d322c96 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -848,6 +848,52 @@ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); +static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) +{ + int k = stack->num_paths++; + + if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) + return NULL; + + return &stack->path[k]; +} + +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, + struct net_device_path_stack *stack) +{ + const struct net_device *last_dev; + struct net_device_path_ctx ctx = { + .dev = dev, + .daddr = daddr, + }; + struct net_device_path *path; + int ret = 0; + + stack->num_paths = 0; + while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { + last_dev = ctx.dev; + path = dev_fwd_path(stack); + if (!path) + return -1; + + memset(path, 0, sizeof(struct net_device_path)); + ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); + if (ret < 0) + return -1; + + if (WARN_ON_ONCE(last_dev == ctx.dev)) + return -1; + } + path = dev_fwd_path(stack); + if (!path) + return -1; + path->type = DEV_PATH_ETHERNET; + path->dev = ctx.dev; + + return ret; +} +EXPORT_SYMBOL_GPL(dev_fill_forward_path); + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace @@ -1184,6 +1230,18 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) return -ENOMEM; for_each_netdev(net, d) { + struct netdev_name_node *name_node; + list_for_each_entry(name_node, &d->name_node->list, list) { + if (!sscanf(name_node->name, name, &i)) + continue; + if (i < 0 || i >= max_netdevices) + continue; + + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, IFNAMSIZ, name, i); + if (!strncmp(buf, name_node->name, IFNAMSIZ)) + set_bit(i, inuse); + } if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) @@ -2451,16 +2509,14 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS -struct static_key xps_needed __read_mostly; -EXPORT_SYMBOL(xps_needed); -struct static_key xps_rxqs_needed __read_mostly; -EXPORT_SYMBOL(xps_rxqs_needed); +static struct static_key xps_needed __read_mostly; +static struct static_key xps_rxqs_needed __read_mostly; static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) static bool remove_xps_queue(struct xps_dev_maps *dev_maps, - int tci, u16 index) + struct xps_dev_maps *old_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; @@ -2479,6 +2535,8 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, break; } + if (old_maps) + RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; @@ -2491,7 +2549,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev, struct xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { - int num_tc = dev->num_tc ? : 1; + int num_tc = dev_maps->num_tc; bool active = false; int tci; @@ -2499,7 +2557,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev, int i, j; for (i = count, j = offset; i--; j++) { - if (!remove_xps_queue(dev_maps, tci, j)) + if (!remove_xps_queue(dev_maps, NULL, tci, j)) break; } @@ -2511,74 +2569,54 @@ static bool remove_xps_queue_cpu(struct net_device *dev, static void reset_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, - bool is_rxqs_map) + enum xps_map_type type) { - if (is_rxqs_map) { - static_key_slow_dec_cpuslocked(&xps_rxqs_needed); - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - } else { - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - } static_key_slow_dec_cpuslocked(&xps_needed); + if (type == XPS_RXQS) + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + + RCU_INIT_POINTER(dev->xps_maps[type], NULL); + kfree_rcu(dev_maps, rcu); } -static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, - struct xps_dev_maps *dev_maps, unsigned int nr_ids, - u16 offset, u16 count, bool is_rxqs_map) +static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, + u16 offset, u16 count) { + struct xps_dev_maps *dev_maps; bool active = false; int i, j; - for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), - j < nr_ids;) - active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, - count); + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (!dev_maps) + return; + + for (j = 0; j < dev_maps->nr_ids; j++) + active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) - reset_xps_maps(dev, dev_maps, is_rxqs_map); + reset_xps_maps(dev, dev_maps, type); - if (!is_rxqs_map) { - for (i = offset + (count - 1); count--; i--) { + if (type == XPS_CPUS) { + for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( - netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); - } + netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } } static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { - const unsigned long *possible_mask = NULL; - struct xps_dev_maps *dev_maps; - unsigned int nr_ids; - if (!static_key_false(&xps_needed)) return; cpus_read_lock(); mutex_lock(&xps_map_mutex); - if (static_key_false(&xps_rxqs_needed)) { - dev_maps = xmap_dereference(dev->xps_rxqs_map); - if (dev_maps) { - nr_ids = dev->num_rx_queues; - clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, - offset, count, true); - } - } - - dev_maps = xmap_dereference(dev->xps_cpus_map); - if (!dev_maps) - goto out_no_maps; + if (static_key_false(&xps_rxqs_needed)) + clean_xps_maps(dev, XPS_RXQS, offset, count); - if (num_possible_cpus() > 1) - possible_mask = cpumask_bits(cpu_possible_mask); - nr_ids = nr_cpu_ids; - clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count, - false); + clean_xps_maps(dev, XPS_CPUS, offset, count); -out_no_maps: mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } @@ -2628,16 +2666,35 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, return new_map; } +/* Copy xps maps at a given index */ +static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, + struct xps_dev_maps *new_dev_maps, int index, + int tc, bool skip_tc) +{ + int i, tci = index * dev_maps->num_tc; + struct xps_map *map; + + /* copy maps belonging to foreign traffic classes */ + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && skip_tc) + continue; + + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); + } +} + /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, - u16 index, bool is_rxqs_map) + u16 index, enum xps_map_type type) { - const unsigned long *online_mask = NULL, *possible_mask = NULL; - struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; + const unsigned long *online_mask = NULL; + bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; - bool active = false; unsigned int nr_ids; if (dev->num_tc) { @@ -2655,38 +2712,48 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, } mutex_lock(&xps_map_mutex); - if (is_rxqs_map) { + + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (type == XPS_RXQS) { maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); - dev_maps = xmap_dereference(dev->xps_rxqs_map); nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); - if (num_possible_cpus() > 1) { + if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); - possible_mask = cpumask_bits(cpu_possible_mask); - } - dev_maps = xmap_dereference(dev->xps_cpus_map); nr_ids = nr_cpu_ids; } if (maps_sz < L1_CACHE_BYTES) maps_sz = L1_CACHE_BYTES; + /* The old dev_maps could be larger or smaller than the one we're + * setting up now, as dev->num_tc or nr_ids could have been updated in + * between. We could try to be smart, but let's be safe instead and only + * copy foreign traffic classes if the two map sizes match. + */ + if (dev_maps && + dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) + copy = true; + /* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), j < nr_ids;) { - if (!new_dev_maps) - new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { - mutex_unlock(&xps_map_mutex); - return -ENOMEM; + new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); + return -ENOMEM; + } + + new_dev_maps->nr_ids = nr_ids; + new_dev_maps->num_tc = num_tc; } tci = j * num_tc + tc; - map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) : - NULL; + map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; - map = expand_xps_map(map, j, index, is_rxqs_map); + map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; @@ -2699,29 +2766,21 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!dev_maps) { /* Increment static keys at most once per type */ static_key_slow_inc_cpuslocked(&xps_needed); - if (is_rxqs_map) + if (type == XPS_RXQS) static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { - /* copy maps belonging to foreign traffic classes */ - for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->attr_map[tci]); - RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); - } + for (j = 0; j < nr_ids; j++) { + bool skip_tc = false; - /* We need to explicitly update tci as prevous loop - * could break out early if dev_maps is NULL. - */ tci = j * num_tc + tc; - if (netif_attr_test_mask(j, mask, nr_ids) && netif_attr_test_online(j, online_mask, nr_ids)) { /* add tx-queue to CPU/rx-queue maps */ int pos = 0; + skip_tc = true; + map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; @@ -2729,78 +2788,81 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA - if (!is_rxqs_map) { + if (type == XPS_CPUS) { if (numa_node_id == -2) numa_node_id = cpu_to_node(j); else if (numa_node_id != cpu_to_node(j)) numa_node_id = -1; } #endif - } else if (dev_maps) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->attr_map[tci]); - RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } - /* copy maps belonging to foreign traffic classes */ - for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->attr_map[tci]); - RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); - } + if (copy) + xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, + skip_tc); } - if (is_rxqs_map) - rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps); - else - rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps); + rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { - for (i = num_tc, tci = j * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + for (j = 0; j < dev_maps->nr_ids; j++) { + for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); - if (map && map != new_map) - kfree_rcu(map, rcu); + if (!map) + continue; + + if (copy) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + if (map == new_map) + continue; + } + + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); + kfree_rcu(map, rcu); } } - kfree_rcu(dev_maps, rcu); + old_dev_maps = dev_maps; out_no_old_maps: dev_maps = new_dev_maps; active = true; out_no_new_maps: - if (!is_rxqs_map) { + if (type == XPS_CPUS) /* update Tx queue numa node */ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); - } if (!dev_maps) goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { - for (i = tc, tci = j * num_tc; i--; tci++) - active |= remove_xps_queue(dev_maps, tci, index); - if (!netif_attr_test_mask(j, mask, nr_ids) || - !netif_attr_test_online(j, online_mask, nr_ids)) - active |= remove_xps_queue(dev_maps, tci, index); - for (i = num_tc - tc, tci++; --i; tci++) - active |= remove_xps_queue(dev_maps, tci, index); + for (j = 0; j < dev_maps->nr_ids; j++) { + tci = j * dev_maps->num_tc; + + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && + netif_attr_test_mask(j, mask, dev_maps->nr_ids) && + netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) + continue; + + active |= remove_xps_queue(dev_maps, + copy ? old_dev_maps : NULL, + tci, index); + } } + if (old_dev_maps) + kfree_rcu(old_dev_maps, rcu); + /* free map if not active */ if (!active) - reset_xps_maps(dev, dev_maps, is_rxqs_map); + reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2808,11 +2870,10 @@ out_no_maps: return 0; error: /* remove any maps that we added */ - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { + for (j = 0; j < nr_ids; j++) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); - map = dev_maps ? + map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) @@ -2833,7 +2894,7 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, int ret; cpus_read_lock(); - ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); + ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); cpus_read_unlock(); return ret; @@ -3944,13 +4005,15 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) { + int tc = netdev_get_prio_tc_map(dev, skb->priority); struct xps_map *map; int queue_index = -1; - if (dev->num_tc) { - tci *= dev->num_tc; - tci += netdev_get_prio_tc_map(dev, skb->priority); - } + if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) + return queue_index; + + tci *= dev_maps->num_tc; + tci += tc; map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { @@ -3981,18 +4044,18 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; - dev_maps = rcu_dereference(sb_dev->xps_rxqs_map); + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); if (dev_maps) { int tci = sk_rx_queue_get(sk); - if (tci >= 0 && tci < dev->num_rx_queues) + if (tci >= 0) queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } get_cpus_map: if (queue_index < 0) { - dev_maps = rcu_dereference(sb_dev->xps_cpus_map); + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; @@ -4294,6 +4357,13 @@ static inline void ____napi_schedule(struct softnet_data *sd, */ thread = READ_ONCE(napi->thread); if (thread) { + /* Avoid doing set_bit() if the thread is in + * INTERRUPTIBLE state, cause napi_thread_wait() + * makes sure to proceed with napi polling + * if the thread is explicitly woken from here. + */ + if (READ_ONCE(thread->state) != TASK_INTERRUPTIBLE) + set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } @@ -4653,10 +4723,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; u32 metalen, act = XDP_DROP; + bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; - bool orig_bcast; int off; /* Reinjected packets coming from act_mirred or similar should @@ -4703,6 +4773,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; + orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; @@ -4730,8 +4801,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || + (orig_host != ether_addr_equal_64bits(eth->h_dest, + skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); } @@ -5265,6 +5339,7 @@ skip_classify: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; + break; case RX_HANDLER_PASS: break; default: @@ -5857,15 +5932,13 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old) } EXPORT_SYMBOL(napi_gro_flush); -static struct list_head *gro_list_prepare(struct napi_struct *napi, - struct sk_buff *skb) +static void gro_list_prepare(const struct list_head *head, + const struct sk_buff *skb) { unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); - struct list_head *head; struct sk_buff *p; - head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list; list_for_each_entry(p, head, list) { unsigned long diffs; @@ -5891,11 +5964,9 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, maclen); NAPI_GRO_CB(p)->same_flow = !diffs; } - - return head; } -static void skb_gro_reset_offset(struct sk_buff *skb) +static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) { const struct skb_shared_info *pinfo = skb_shinfo(skb); const skb_frag_t *frag0 = &pinfo->frags[0]; @@ -5905,7 +5976,8 @@ static void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->frag0_len = 0; if (!skb_headlen(skb) && pinfo->nr_frags && - !PageHighMem(skb_frag_page(frag0))) { + !PageHighMem(skb_frag_page(frag0)) && + (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, skb_frag_size(frag0), @@ -5955,11 +6027,11 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { - u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *gro_head; struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; @@ -5968,7 +6040,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (netif_elide_gro(skb->dev)) goto normal; - gro_head = gro_list_prepare(napi, skb); + gro_list_prepare(&gro_list->list, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -6004,7 +6076,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, - gro_head, skb); + &gro_list->list, skb); break; } rcu_read_unlock(); @@ -6023,7 +6095,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (pp) { skb_list_del_init(pp); napi_gro_complete(napi, pp); - napi->gro_hash[hash].count--; + gro_list->count--; } if (same_flow) @@ -6032,16 +6104,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (NAPI_GRO_CB(skb)->flush) goto normal; - if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { - gro_flush_oldest(napi, gro_head); - } else { - napi->gro_hash[hash].count++; - } + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) + gro_flush_oldest(napi, &gro_list->list); + else + gro_list->count++; + NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); - list_add(&skb->list, gro_head); + list_add(&skb->list, &gro_list->list); ret = GRO_HELD; pull: @@ -6049,11 +6121,11 @@ pull: if (grow > 0) gro_pull_from_frag0(skb, grow); ok: - if (napi->gro_hash[hash].count) { - if (!test_bit(hash, &napi->gro_bitmask)) - __set_bit(hash, &napi->gro_bitmask); - } else if (test_bit(hash, &napi->gro_bitmask)) { - __clear_bit(hash, &napi->gro_bitmask); + if (gro_list->count) { + if (!test_bit(bucket, &napi->gro_bitmask)) + __set_bit(bucket, &napi->gro_bitmask); + } else if (test_bit(bucket, &napi->gro_bitmask)) { + __clear_bit(bucket, &napi->gro_bitmask); } return ret; @@ -6123,7 +6195,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); - skb_gro_reset_offset(skb); + skb_gro_reset_offset(skb, 0); ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); @@ -6212,7 +6284,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) napi->skb = NULL; skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); + skb_gro_reset_offset(skb, hlen); if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); @@ -6486,6 +6558,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_SCHED_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, @@ -6769,6 +6842,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded) return err; } +EXPORT_SYMBOL(dev_set_threaded); void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) @@ -6968,19 +7042,29 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) static int napi_thread_wait(struct napi_struct *napi) { + bool woken = false; + set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop() && !napi_disable_pending(napi)) { - if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + while (!kthread_should_stop()) { + /* Testing SCHED_THREADED bit here to make sure the current + * kthread owns this napi and could poll on this napi. + * Testing SCHED bit is not enough because SCHED bit might be + * set by some other busy poll thread or by napi_disable(). + */ + if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); + /* woken being true indicates this thread owns this napi. */ + woken = true; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); + return -1; } @@ -10307,14 +10391,20 @@ EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { +#ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; +#else + return refcount_read(&dev->dev_refcnt); +#endif } EXPORT_SYMBOL(netdev_refcnt_read); +int netdev_unregister_timeout_secs __read_mostly = 10; + #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** @@ -10339,7 +10429,7 @@ static void netdev_wait_allrefs(struct net_device *dev) rebroadcast_time = warning_time = jiffies; refcnt = netdev_refcnt_read(dev); - while (refcnt != 0) { + while (refcnt != 1) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); @@ -10376,7 +10466,9 @@ static void netdev_wait_allrefs(struct net_device *dev) refcnt = netdev_refcnt_read(dev); - if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) { + if (refcnt != 1 && + time_after(jiffies, warning_time + + netdev_unregister_timeout_secs * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, refcnt); warning_time = jiffies; @@ -10452,7 +10544,7 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(netdev_refcnt_read(dev)); + BUG_ON(netdev_refcnt_read(dev) != 1); BUG_ON(!list_empty(&dev->ptype_all)); BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); @@ -10669,9 +10761,14 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; +#ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; + dev_hold(dev); +#else + refcount_set(&dev->dev_refcnt, 1); +#endif if (dev_addr_init(dev)) goto free_pcpu; @@ -10735,8 +10832,10 @@ free_all: return NULL; free_pcpu: +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); free_dev: +#endif netdev_freemem(dev); return NULL; } @@ -10778,8 +10877,10 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; +#endif free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; @@ -10967,11 +11068,13 @@ void unregister_netdev(struct net_device *dev) EXPORT_SYMBOL(unregister_netdev); /** - * dev_change_net_namespace - move device to different nethost namespace + * __dev_change_net_namespace - move device to different nethost namespace * @dev: device * @net: network namespace * @pat: If not NULL name pattern to try if the current device name * is already taken in the destination network namespace. + * @new_ifindex: If not zero, specifies device index in the target + * namespace. * * This function shuts down a device interface and moves it * to a new network namespace. On success 0 is returned, on @@ -10980,10 +11083,11 @@ EXPORT_SYMBOL(unregister_netdev); * Callers must hold the rtnl semaphore. */ -int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +int __dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex) { struct net *net_old = dev_net(dev); - int err, new_nsid, new_ifindex; + int err, new_nsid; ASSERT_RTNL(); @@ -11014,6 +11118,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char goto out; } + /* Check that new_ifindex isn't used yet. */ + err = -EBUSY; + if (new_ifindex && __dev_get_by_index(net, new_ifindex)) + goto out; + /* * And now a mini version of register_netdevice unregister_netdevice. */ @@ -11041,10 +11150,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) - new_ifindex = dev_new_index(net); - else - new_ifindex = dev->ifindex; + if (!new_ifindex) { + if (__dev_get_by_index(net, dev->ifindex)) + new_ifindex = dev_new_index(net); + else + new_ifindex = dev->ifindex; + } rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); @@ -11097,7 +11208,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char out: return err; } -EXPORT_SYMBOL_GPL(dev_change_net_namespace); +EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { @@ -11346,7 +11457,7 @@ static void __net_exit default_device_exit(struct net *net) continue; /* Leave virtual devices for the generic cleanup */ - if (dev->rtnl_link_ops) + if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) continue; /* Push remaining network devices to init_net */ |