Diffstat (limited to 'net/core/dev.c')
-rw-r--r--	net/core/dev.c	264
1 file changed, 150 insertions, 114 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 171420e75b03..d030575532a2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -371,9 +371,10 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 static inline struct list_head *ptype_head(const struct packet_type *pt)
 {
 	if (pt->type == htons(ETH_P_ALL))
-		return &ptype_all;
+		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 	else
-		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+		return pt->dev ? &pt->dev->ptype_specific :
+				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 }
 
 /**
@@ -1734,6 +1735,23 @@ static inline int deliver_skb(struct sk_buff *skb,
 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
+static inline void deliver_ptype_list_skb(struct sk_buff *skb,
+					  struct packet_type **pt,
+					  struct net_device *dev, __be16 type,
+					  struct list_head *ptype_list)
+{
+	struct packet_type *ptype, *pt_prev = *pt;
+
+	list_for_each_entry_rcu(ptype, ptype_list, list) {
+		if (ptype->type != type)
+			continue;
+		if (pt_prev)
+			deliver_skb(skb, pt_prev, dev);
+		pt_prev = ptype;
+	}
+	*pt = pt_prev;
+}
+
 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 {
 	if (!ptype->af_packet_priv || !skb->sk)
@@ -1757,45 +1775,54 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 	struct packet_type *ptype;
 	struct sk_buff *skb2 = NULL;
 	struct packet_type *pt_prev = NULL;
+	struct list_head *ptype_list = &ptype_all;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(ptype, &ptype_all, list) {
+again:
+	list_for_each_entry_rcu(ptype, ptype_list, list) {
 		/* Never send packets back to the socket
 		 * they originated from - MvS ([email protected])
 		 */
-		if ((ptype->dev == dev || !ptype->dev) &&
-		    (!skb_loop_sk(ptype, skb))) {
-			if (pt_prev) {
-				deliver_skb(skb2, pt_prev, skb->dev);
-				pt_prev = ptype;
-				continue;
-			}
+		if (skb_loop_sk(ptype, skb))
+			continue;
 
-			skb2 = skb_clone(skb, GFP_ATOMIC);
-			if (!skb2)
-				break;
+		if (pt_prev) {
+			deliver_skb(skb2, pt_prev, skb->dev);
+			pt_prev = ptype;
+			continue;
+		}
 
-			net_timestamp_set(skb2);
+		/* need to clone skb, done only once */
+		skb2 = skb_clone(skb, GFP_ATOMIC);
+		if (!skb2)
+			goto out_unlock;
 
-			/* skb->nh should be correctly
-			   set by sender, so that the second statement is
-			   just protection against buggy protocols.
-			 */
-			skb_reset_mac_header(skb2);
-
-			if (skb_network_header(skb2) < skb2->data ||
-			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
-				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
-						     ntohs(skb2->protocol),
-						     dev->name);
-				skb_reset_network_header(skb2);
-			}
+		net_timestamp_set(skb2);
 
-			skb2->transport_header = skb2->network_header;
-			skb2->pkt_type = PACKET_OUTGOING;
-			pt_prev = ptype;
+		/* skb->nh should be correctly
+		 * set by sender, so that the second statement is
+		 * just protection against buggy protocols.
+		 */
+		skb_reset_mac_header(skb2);
+
+		if (skb_network_header(skb2) < skb2->data ||
+		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
+			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
+					     ntohs(skb2->protocol),
+					     dev->name);
+			skb_reset_network_header(skb2);
 		}
+
+		skb2->transport_header = skb2->network_header;
+		skb2->pkt_type = PACKET_OUTGOING;
+		pt_prev = ptype;
+	}
+
+	if (ptype_list == &ptype_all) {
+		ptype_list = &dev->ptype_all;
+		goto again;
 	}
+out_unlock:
 	if (pt_prev)
 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 	rcu_read_unlock();
@@ -2352,7 +2379,6 @@ EXPORT_SYMBOL(skb_checksum_help);
 
 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 {
-	unsigned int vlan_depth = skb->mac_len;
 	__be16 type = skb->protocol;
 
 	/* Tunnel gso handlers can set protocol to ethernet. */
@@ -2366,35 +2392,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 		type = eth->h_proto;
 	}
 
-	/* if skb->protocol is 802.1Q/AD then the header should already be
-	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
-	 * ETH_HLEN otherwise
-	 */
-	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
-		if (vlan_depth) {
-			if (WARN_ON(vlan_depth < VLAN_HLEN))
-				return 0;
-			vlan_depth -= VLAN_HLEN;
-		} else {
-			vlan_depth = ETH_HLEN;
-		}
-		do {
-			struct vlan_hdr *vh;
-
-			if (unlikely(!pskb_may_pull(skb,
-						    vlan_depth + VLAN_HLEN)))
-				return 0;
-
-			vh = (struct vlan_hdr *)(skb->data + vlan_depth);
-			type = vh->h_vlan_encapsulated_proto;
-			vlan_depth += VLAN_HLEN;
-		} while (type == htons(ETH_P_8021Q) ||
-			 type == htons(ETH_P_8021AD));
-	}
-
-	*depth = vlan_depth;
-
-	return type;
+	return __vlan_get_protocol(skb, type, depth);
 }
 
 /**
@@ -2578,7 +2576,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
 	if (skb->encapsulation)
 		features &= dev->hw_enc_features;
 
-	if (!vlan_tx_tag_present(skb)) {
+	if (!skb_vlan_tag_present(skb)) {
 		if (unlikely(protocol == htons(ETH_P_8021Q) ||
 			     protocol == htons(ETH_P_8021AD))) {
 			struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
@@ -2617,7 +2615,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 	unsigned int len;
 	int rc;
 
-	if (!list_empty(&ptype_all))
+	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
 		dev_queue_xmit_nit(skb, dev);
 
 	len = skb->len;
@@ -2659,7 +2657,7 @@ out:
 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 					  netdev_features_t features)
 {
-	if (vlan_tx_tag_present(skb) &&
+	if (skb_vlan_tag_present(skb) &&
 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 		skb = __vlan_hwaccel_push_inside(skb);
 	return skb;
@@ -3032,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 /* One global table that all flow-based protocols share. */
 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
+u32 rps_cpu_mask __read_mostly;
+EXPORT_SYMBOL(rps_cpu_mask);
 
 struct static_key rps_needed __read_mostly;
 
@@ -3088,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		       struct rps_dev_flow **rflowp)
 {
-	struct netdev_rx_queue *rxqueue;
-	struct rps_map *map;
+	const struct rps_sock_flow_table *sock_flow_table;
+	struct netdev_rx_queue *rxqueue = dev->_rx;
 	struct rps_dev_flow_table *flow_table;
-	struct rps_sock_flow_table *sock_flow_table;
+	struct rps_map *map;
 	int cpu = -1;
-	u16 tcpu;
+	u32 tcpu;
 	u32 hash;
 
 	if (skb_rx_queue_recorded(skb)) {
 		u16 index = skb_get_rx_queue(skb);
+
 		if (unlikely(index >= dev->real_num_rx_queues)) {
 			WARN_ONCE(dev->real_num_rx_queues > 1,
 				  "%s received packet on queue %u, but number "
@@ -3105,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 				  dev->name, index, dev->real_num_rx_queues);
 			goto done;
 		}
-		rxqueue = dev->_rx + index;
-	} else
-		rxqueue = dev->_rx;
+		rxqueue += index;
+	}
 
+	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
+
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	map = rcu_dereference(rxqueue->rps_map);
-	if (map) {
-		if (map->len == 1 &&
-		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
-			tcpu = map->cpus[0];
-			if (cpu_online(tcpu))
-				cpu = tcpu;
-			goto done;
-		}
-	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
+	if (!flow_table && !map)
 		goto done;
-	}
 
 	skb_reset_network_header(skb);
 	hash = skb_get_hash(skb);
 	if (!hash)
 		goto done;
 
-	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {
-		u16 next_cpu;
 		struct rps_dev_flow *rflow;
+		u32 next_cpu;
+		u32 ident;
+
+		/* First check into global flow table if there is a match */
+		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
+		if ((ident ^ hash) & ~rps_cpu_mask)
+			goto try_rps;
 
+		next_cpu = ident & rps_cpu_mask;
+
+		/* OK, now we know there is a match,
+		 * we can look at the local (per receive queue) flow table
+		 */
 		rflow = &flow_table->flows[hash & flow_table->mask];
 		tcpu = rflow->cpu;
 
-		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
-
 		/*
 		 * If the desired CPU (where last recvmsg was done) is
 		 * different from current CPU (one in the rx-queue flow
@@ -3164,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		}
 	}
 
+try_rps:
+
 	if (map) {
 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 		if (cpu_online(tcpu)) {
@@ -3615,7 +3619,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
 	struct net_device *orig_dev;
-	struct net_device *null_or_dev;
 	bool deliver_exact = false;
 	int ret = NET_RX_DROP;
 	__be16 type;
@@ -3658,11 +3661,15 @@ another_round:
 		goto skip_taps;
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (!ptype->dev || ptype->dev == skb->dev) {
-			if (pt_prev)
-				ret = deliver_skb(skb, pt_prev, orig_dev);
-			pt_prev = ptype;
-		}
+		if (pt_prev)
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+		pt_prev = ptype;
+	}
+
+	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
+		if (pt_prev)
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+		pt_prev = ptype;
 	}
 
 skip_taps:
@@ -3676,7 +3683,7 @@ ncls:
 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 		goto drop;
 
-	if (vlan_tx_tag_present(skb)) {
+	if (skb_vlan_tag_present(skb)) {
 		if (pt_prev) {
 			ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = NULL;
@@ -3708,8 +3715,8 @@ ncls:
 		}
 	}
 
-	if (unlikely(vlan_tx_tag_present(skb))) {
-		if (vlan_tx_tag_get_id(skb))
+	if (unlikely(skb_vlan_tag_present(skb))) {
+		if (skb_vlan_tag_get_id(skb))
 			skb->pkt_type = PACKET_OTHERHOST;
 		/* Note: we might in the future use prio bits
 		 * and set skb->priority like in vlan_do_receive()
@@ -3718,19 +3725,21 @@ ncls:
 		skb->vlan_tci = 0;
 	}
 
+	type = skb->protocol;
+
 	/* deliver only exact match when indicated */
-	null_or_dev = deliver_exact ? skb->dev : NULL;
+	if (likely(!deliver_exact)) {
+		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+				       &ptype_base[ntohs(type) &
+						   PTYPE_HASH_MASK]);
+	}
 
-	type = skb->protocol;
-	list_for_each_entry_rcu(ptype,
-			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
-		if (ptype->type == type &&
-		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
-		     ptype->dev == orig_dev)) {
-			if (pt_prev)
-				ret = deliver_skb(skb, pt_prev, orig_dev);
-			pt_prev = ptype;
-		}
+	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+			       &orig_dev->ptype_specific);
+
+	if (unlikely(skb->dev != orig_dev)) {
+		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+				       &skb->dev->ptype_specific);
 	}
 
 	if (pt_prev) {
@@ -5323,7 +5332,27 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
-void netdev_adjacent_add_links(struct net_device *dev)
+/**
+ * netdev_bonding_info_change - Dispatch event about slave change
+ * @dev: device
+ * @netdev_bonding_info: info to dispatch
+ *
+ * Send NETDEV_BONDING_INFO to netdev notifiers with info.
+ * The caller must hold the RTNL lock.
+ */
+void netdev_bonding_info_change(struct net_device *dev,
+				struct netdev_bonding_info *bonding_info)
+{
+	struct netdev_notifier_bonding_info	info;
+
+	memcpy(&info.bonding_info, bonding_info,
+	       sizeof(struct netdev_bonding_info));
+	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
+				      &info.info);
+}
+EXPORT_SYMBOL(netdev_bonding_info_change);
+
+static void netdev_adjacent_add_links(struct net_device *dev)
 {
 	struct netdev_adjacent *iter;
 
@@ -5348,7 +5377,7 @@ void netdev_adjacent_add_links(struct net_device *dev)
 	}
 }
 
-void netdev_adjacent_del_links(struct net_device *dev)
+static void netdev_adjacent_del_links(struct net_device *dev)
 {
 	struct netdev_adjacent *iter;
 
@@ -6172,13 +6201,16 @@ static int netif_alloc_rx_queues(struct net_device *dev)
 {
 	unsigned int i, count = dev->num_rx_queues;
 	struct netdev_rx_queue *rx;
+	size_t sz = count * sizeof(*rx);
 
 	BUG_ON(count < 1);
 
-	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
-	if (!rx)
-		return -ENOMEM;
-
+	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	if (!rx) {
+		rx = vzalloc(sz);
+		if (!rx)
+			return -ENOMEM;
+	}
 	dev->_rx = rx;
 
 	for (i = 0; i < count; i++)
@@ -6576,6 +6608,8 @@ void netdev_run_todo(void)
 
 		/* paranoia */
 		BUG_ON(netdev_refcnt_read(dev));
+		BUG_ON(!list_empty(&dev->ptype_all));
+		BUG_ON(!list_empty(&dev->ptype_specific));
 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 		WARN_ON(dev->dn_ptr);
@@ -6656,7 +6690,7 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
 	if (!queue)
 		return NULL;
 	netdev_init_one_queue(dev, queue, NULL);
-	queue->qdisc = &noop_qdisc;
+	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
 	queue->qdisc_sleeping = &noop_qdisc;
 	rcu_assign_pointer(dev->ingress_queue, queue);
 #endif
@@ -6758,6 +6792,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	INIT_LIST_HEAD(&dev->adj_list.lower);
 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
+	INIT_LIST_HEAD(&dev->ptype_all);
+	INIT_LIST_HEAD(&dev->ptype_specific);
 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
 	setup(dev);
 
@@ -6808,7 +6844,7 @@ void free_netdev(struct net_device *dev)
 
 	netif_free_tx_queues(dev);
 #ifdef CONFIG_SYSFS
-	kfree(dev->_rx);
+	kvfree(dev->_rx);
 #endif
 
 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
@@ -7093,11 +7129,11 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 
 	/* Process offline CPU's input_pkt_queue */
 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
-		netif_rx_internal(skb);
+		netif_rx_ni(skb);
 		input_queue_head_incr(oldsd);
 	}
 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
-		netif_rx_internal(skb);
+		netif_rx_ni(skb);
 		input_queue_head_incr(oldsd);
 	}
 
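Note: the per-device ptype lists introduced above let a handler bind to a single interface, so packets on unrelated devices never reach it. The following is a minimal sketch (not part of this commit) of how such a device-bound tap could be registered with the existing dev_add_pack()/dev_remove_pack() API; the module, the sample_tap* names and the "eth0" device name are purely illustrative:

	#include <linux/module.h>
	#include <linux/netdevice.h>
	#include <linux/skbuff.h>
	#include <linux/if_ether.h>
	#include <net/net_namespace.h>

	/* Handler invoked only for packets on the bound device; with
	 * pt->dev set, the new ptype_head() files this packet_type on
	 * dev->ptype_all instead of the global ptype_all list.
	 */
	static int sample_tap_rcv(struct sk_buff *skb, struct net_device *dev,
				  struct packet_type *pt,
				  struct net_device *orig_dev)
	{
		pr_debug("tap: %u bytes on %s\n", skb->len, dev->name);
		kfree_skb(skb);		/* consume our reference to the clone */
		return 0;
	}

	static struct packet_type sample_tap __read_mostly = {
		.type = htons(ETH_P_ALL),	/* all protocols...             */
		.func = sample_tap_rcv,		/* ...but only one device;      */
						/* .dev is filled in at init    */
	};

	static int __init sample_tap_init(void)
	{
		struct net_device *dev;

		dev = dev_get_by_name(&init_net, "eth0");	/* hypothetical name */
		if (!dev)
			return -ENODEV;
		sample_tap.dev = dev;
		dev_add_pack(&sample_tap);
		return 0;
	}

	static void __exit sample_tap_exit(void)
	{
		dev_remove_pack(&sample_tap);
		dev_put(sample_tap.dev);
	}

	module_init(sample_tap_init);
	module_exit(sample_tap_exit);
	MODULE_LICENSE("GPL");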
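The RFS hunks change rps_sock_flow_table entries from a bare CPU number to a u32 that folds the upper bits of the flow hash in above the CPU bits, which is why get_rps_cpu() can cheaply reject hash collisions before trusting the table. A standalone sketch of that encoding (all values illustrative; in the kernel the writer side corresponds to rps_record_sock_flow()):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* rps_cpu_mask covers the CPU ids; example value for <= 32 CPUs */
		uint32_t rps_cpu_mask = 0x1f;
		uint32_t hash = 0xdeadbeef;	/* flow hash from the packet */
		uint32_t cpu = 7;		/* CPU where recvmsg() last ran */

		/* writer: keep the hash's upper bits, stash the CPU in the low bits */
		uint32_t ident = (hash & ~rps_cpu_mask) | cpu;

		/* reader: if the upper bits differ, another flow owns this slot */
		if ((ident ^ hash) & ~rps_cpu_mask)
			puts("hash collision: fall back to plain RPS");
		else
			printf("steer flow to CPU %u\n", ident & rps_cpu_mask);
		return 0;
	}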