Diffstat (limited to 'net/core/dev.c')
-rw-r--r--	net/core/dev.c	204
1 file changed, 166 insertions(+), 38 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 4d4de9008f6f..6ea1d20676fb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -229,7 +229,7 @@ static inline void backlog_lock_irq_save(struct softnet_data *sd,
 {
 	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
-	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+	else
 		local_irq_save(*flags);
 }
 
@@ -237,7 +237,7 @@ static inline void backlog_lock_irq_disable(struct softnet_data *sd)
 {
 	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 		spin_lock_irq(&sd->input_pkt_queue.lock);
-	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+	else
 		local_irq_disable();
 }
 
@@ -246,7 +246,7 @@ static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
 {
 	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
-	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+	else
 		local_irq_restore(*flags);
 }
 
@@ -254,7 +254,7 @@ static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
 {
 	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 		spin_unlock_irq(&sd->input_pkt_queue.lock);
-	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+	else
 		local_irq_enable();
 }
 
@@ -449,7 +449,9 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
  *	queue in the local softnet handler.
  */
 
-DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
+DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
+	.process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
+};
 EXPORT_PER_CPU_SYMBOL(softnet_data);
 
 /* Page_pool has a lockless array/stack to alloc/recycle pages.
@@ -1226,9 +1228,9 @@ int dev_change_name(struct net_device *dev, const char *newname)
 
 	memcpy(oldname, dev->name, IFNAMSIZ);
 
-	write_seqlock(&netdev_rename_lock);
+	write_seqlock_bh(&netdev_rename_lock);
 	err = dev_get_valid_name(net, dev, newname);
-	write_sequnlock(&netdev_rename_lock);
+	write_sequnlock_bh(&netdev_rename_lock);
 
 	if (err < 0) {
 		up_write(&devnet_rename_sem);
@@ -1269,9 +1271,9 @@ rollback:
 		if (err >= 0) {
 			err = ret;
 			down_write(&devnet_rename_sem);
-			write_seqlock(&netdev_rename_lock);
+			write_seqlock_bh(&netdev_rename_lock);
 			memcpy(dev->name, oldname, IFNAMSIZ);
-			write_sequnlock(&netdev_rename_lock);
+			write_sequnlock_bh(&netdev_rename_lock);
 			memcpy(oldname, newname, IFNAMSIZ);
 			WRITE_ONCE(dev->name_assign_type, old_assign_type);
 			old_assign_type = NET_NAME_RENAMED;
@@ -2160,7 +2162,7 @@ EXPORT_SYMBOL(net_disable_timestamp);
 static inline void net_timestamp_set(struct sk_buff *skb)
 {
 	skb->tstamp = 0;
-	skb->mono_delivery_time = 0;
+	skb->tstamp_type = SKB_CLOCK_REALTIME;
 	if (static_branch_unlikely(&netstamp_needed_key))
 		skb->tstamp = ktime_get_real();
 }
@@ -3940,6 +3942,7 @@ netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
 	return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
 }
 
+#ifndef CONFIG_PREEMPT_RT
 static bool netdev_xmit_txqueue_skipped(void)
 {
 	return __this_cpu_read(softnet_data.xmit.skip_txqueue);
@@ -3950,6 +3953,19 @@ void netdev_xmit_skip_txqueue(bool skip)
 	__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
 }
 EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+
+#else
+static bool netdev_xmit_txqueue_skipped(void)
+{
+	return current->net_xmit.skip_txqueue;
+}
+
+void netdev_xmit_skip_txqueue(bool skip)
+{
+	current->net_xmit.skip_txqueue = skip;
+}
+EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+#endif
 #endif /* CONFIG_NET_EGRESS */
 
 #ifdef CONFIG_NET_XGRESS
@@ -4029,10 +4045,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 {
 	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
 	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
+	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 	int sch_ret;
 
 	if (!entry)
 		return skb;
+
+	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 	if (*pt_prev) {
 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 		*pt_prev = NULL;
@@ -4061,10 +4080,12 @@ ingress_verdict:
 			break;
 		}
 		*ret = NET_RX_SUCCESS;
+		bpf_net_ctx_clear(bpf_net_ctx);
 		return NULL;
 	case TC_ACT_SHOT:
 		kfree_skb_reason(skb, drop_reason);
 		*ret = NET_RX_DROP;
+		bpf_net_ctx_clear(bpf_net_ctx);
 		return NULL;
 	/* used by tc_run */
 	case TC_ACT_STOLEN:
@@ -4074,8 +4095,10 @@
 		fallthrough;
 	case TC_ACT_CONSUMED:
 		*ret = NET_RX_SUCCESS;
+		bpf_net_ctx_clear(bpf_net_ctx);
 		return NULL;
 	}
+	bpf_net_ctx_clear(bpf_net_ctx);
 	return skb;
 }
 
@@ -4085,11 +4108,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 {
 	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
 	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
+	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 	int sch_ret;
 
 	if (!entry)
 		return skb;
 
+	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
 	/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
 	 * already set by the caller.
 	 */
@@ -4105,10 +4131,12 @@ egress_verdict:
 		/* No need to push/pop skb's mac_header here on egress! */
 		skb_do_redirect(skb);
 		*ret = NET_XMIT_SUCCESS;
+		bpf_net_ctx_clear(bpf_net_ctx);
 		return NULL;
 	case TC_ACT_SHOT:
 		kfree_skb_reason(skb, drop_reason);
 		*ret = NET_XMIT_DROP;
+		bpf_net_ctx_clear(bpf_net_ctx);
 		return NULL;
 	/* used by tc_run */
 	case TC_ACT_STOLEN:
@@ -4118,8 +4146,10 @@
 		fallthrough;
 	case TC_ACT_CONSUMED:
 		*ret = NET_XMIT_SUCCESS;
+		bpf_net_ctx_clear(bpf_net_ctx);
 		return NULL;
 	}
+	bpf_net_ctx_clear(bpf_net_ctx);
 	return skb;
 }
 
@@ -5096,11 +5126,14 @@ static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 
 int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
 {
+	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+
 	if (xdp_prog) {
 		struct xdp_buff xdp;
 		u32 act;
 		int err;
 
+		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 		act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
 		if (act != XDP_PASS) {
 			switch (act) {
@@ -5114,11 +5147,13 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
 				generic_xdp_tx(*pskb, xdp_prog);
 				break;
 			}
+			bpf_net_ctx_clear(bpf_net_ctx);
 			return XDP_DROP;
 		}
 	}
 	return XDP_PASS;
 out_redir:
+	bpf_net_ctx_clear(bpf_net_ctx);
 	kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
 	return XDP_DROP;
 }
@@ -5234,7 +5269,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 				trace_consume_skb(skb, net_tx_action);
 			else
 				trace_kfree_skb(skb, net_tx_action,
-						get_kfree_skb_cb(skb)->reason);
+						get_kfree_skb_cb(skb)->reason, NULL);
 
 			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 				__kfree_skb(skb);
@@ -5935,6 +5970,7 @@ static void flush_backlog(struct work_struct *work)
 	}
 	backlog_unlock_irq_enable(sd);
 
+	local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->process_queue);
@@ -5942,6 +5978,7 @@ static void flush_backlog(struct work_struct *work)
 			rps_input_queue_head_incr(sd);
 		}
 	}
+	local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
 	local_bh_enable();
 }
 
@@ -6063,7 +6100,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
 	while (again) {
 		struct sk_buff *skb;
 
+		local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
 		while ((skb = __skb_dequeue(&sd->process_queue))) {
+			local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
 			rcu_read_lock();
 			__netif_receive_skb(skb);
 			rcu_read_unlock();
@@ -6072,7 +6111,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
 				return work;
 			}
 
+			local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
 		}
+		local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
 
 		backlog_lock_irq_disable(sd);
 		if (skb_queue_empty(&sd->input_pkt_queue)) {
@@ -6087,8 +6128,10 @@ static int process_backlog(struct napi_struct *napi, int quota)
 			napi->state &= NAPIF_STATE_THREADED;
 			again = false;
 		} else {
+			local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 						   &sd->process_queue);
+			local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
 		}
 		backlog_unlock_irq_enable(sd);
 	}
@@ -6301,6 +6344,7 @@ enum {
 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
 			   unsigned flags, u16 budget)
 {
+	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 	bool skip_schedule = false;
 	unsigned long timeout;
 	int rc;
@@ -6318,6 +6362,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
 	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 
 	local_bh_disable();
+	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 	if (flags & NAPI_F_PREFER_BUSY_POLL) {
 		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
@@ -6340,6 +6385,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
 	netpoll_poll_unlock(have_poll_lock);
 	if (rc == budget)
 		__busy_poll_stop(napi, skip_schedule);
+	bpf_net_ctx_clear(bpf_net_ctx);
 	local_bh_enable();
 }
@@ -6349,6 +6395,7 @@ static void __napi_busy_loop(unsigned int napi_id,
 {
 	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
 	int (*napi_poll)(struct napi_struct *napi, int budget);
+	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
@@ -6367,6 +6414,7 @@ restart:
 		int work = 0;
 
 		local_bh_disable();
+		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 		if (!napi_poll) {
 			unsigned long val = READ_ONCE(napi->state);
@@ -6397,6 +6445,7 @@ count:
 			__NET_ADD_STATS(dev_net(napi->dev),
 					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 		skb_defer_free_flush(this_cpu_ptr(&softnet_data));
+		bpf_net_ctx_clear(bpf_net_ctx);
 		local_bh_enable();
 
 		if (!loop_end || loop_end(loop_end_arg, start_time))
@@ -6824,6 +6873,7 @@ static int napi_thread_wait(struct napi_struct *napi)
 
 static void napi_threaded_poll_loop(struct napi_struct *napi)
 {
+	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 	struct softnet_data *sd;
 	unsigned long last_qs = jiffies;
@@ -6832,6 +6882,8 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
 		void *have;
 
 		local_bh_disable();
+		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
 		sd = this_cpu_ptr(&softnet_data);
 		sd->in_napi_threaded_poll = true;
@@ -6847,6 +6899,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
 			net_rps_action_and_irq_enable(sd);
 		}
 		skb_defer_free_flush(sd);
+		bpf_net_ctx_clear(bpf_net_ctx);
 		local_bh_enable();
 
 		if (!repoll)
@@ -6872,10 +6925,12 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 	unsigned long time_limit = jiffies +
 		usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
+	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 	int budget = READ_ONCE(net_hotdata.netdev_budget);
 	LIST_HEAD(list);
 	LIST_HEAD(repoll);
 
+	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 start:
 	sd->in_net_rx_action = true;
 	local_irq_disable();
@@ -6928,7 +6983,8 @@ start:
 		sd->in_net_rx_action = false;
 
 	net_rps_action_and_irq_enable(sd);
-end:;
+end:
+	bpf_net_ctx_clear(bpf_net_ctx);
 }
 
 struct netdev_adjacent {
@@ -10285,6 +10341,10 @@ int register_netdevice(struct net_device *dev)
 	if (ret)
 		return ret;
 
+	/* rss ctx ID 0 is reserved for the default context, start from 1 */
+	xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1);
+	mutex_init(&dev->ethtool->rss_lock);
+
 	spin_lock_init(&dev->addr_list_lock);
 	netdev_set_addr_lockdep_class(dev);
@@ -10703,6 +10763,54 @@ void netdev_run_todo(void)
 		wake_up(&netdev_unregistering_wq);
 }
 
+/* Collate per-cpu network dstats statistics
+ *
+ * Read per-cpu network statistics from dev->dstats and populate the related
+ * fields in @s.
+ */
+static void dev_fetch_dstats(struct rtnl_link_stats64 *s,
+			     const struct pcpu_dstats __percpu *dstats)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		u64 rx_packets, rx_bytes, rx_drops;
+		u64 tx_packets, tx_bytes, tx_drops;
+		const struct pcpu_dstats *stats;
+		unsigned int start;
+
+		stats = per_cpu_ptr(dstats, cpu);
+		do {
+			start = u64_stats_fetch_begin(&stats->syncp);
+			rx_packets = u64_stats_read(&stats->rx_packets);
+			rx_bytes   = u64_stats_read(&stats->rx_bytes);
+			rx_drops   = u64_stats_read(&stats->rx_drops);
+			tx_packets = u64_stats_read(&stats->tx_packets);
+			tx_bytes   = u64_stats_read(&stats->tx_bytes);
+			tx_drops   = u64_stats_read(&stats->tx_drops);
+		} while (u64_stats_fetch_retry(&stats->syncp, start));
+
+		s->rx_packets += rx_packets;
+		s->rx_bytes   += rx_bytes;
+		s->rx_dropped += rx_drops;
+		s->tx_packets += tx_packets;
+		s->tx_bytes   += tx_bytes;
+		s->tx_dropped += tx_drops;
+	}
+}
+
+/* ndo_get_stats64 implementation for dtstats-based accounting.
+ *
+ * Populate @s from dev->stats and dev->dstats. This is used internally by the
+ * core for NETDEV_PCPU_STAT_DSTAT-type stats collection.
+ */
+static void dev_get_dstats64(const struct net_device *dev,
+			     struct rtnl_link_stats64 *s)
+{
+	netdev_stats_to_stats64(s, &dev->stats);
+	dev_fetch_dstats(s, dev->dstats);
+}
+
 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
  * all the same fields in the same order as net_device_stats, with only
  * the type differing, but rtnl_link_stats64 may have additional fields
@@ -10779,6 +10887,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
 	} else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
 		dev_get_tstats64(dev, storage);
+	} else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
+		dev_get_dstats64(dev, storage);
 	} else {
 		netdev_stats_to_stats64(storage, &dev->stats);
 	}
@@ -10896,13 +11006,6 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
 
-void netdev_freemem(struct net_device *dev)
-{
-	char *addr = (char *)dev - dev->padded;
-
-	kvfree(addr);
-}
-
 /**
  * alloc_netdev_mqs - allocate network device
  * @sizeof_priv: size of private data to allocate space for
@@ -10922,8 +11025,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		unsigned int txqs, unsigned int rxqs)
 {
 	struct net_device *dev;
-	unsigned int alloc_size;
-	struct net_device *p;
 
 	BUG_ON(strlen(name) >= sizeof(dev->name));
 
@@ -10937,21 +11038,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
-	alloc_size = sizeof(struct net_device);
-	if (sizeof_priv) {
-		/* ensure 32-byte alignment of private area */
-		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
-		alloc_size += sizeof_priv;
-	}
-	/* ensure 32-byte alignment of whole construct */
-	alloc_size += NETDEV_ALIGN - 1;
-
-	p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
-	if (!p)
+	dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
+		       GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
+	if (!dev)
 		return NULL;
 
-	dev = PTR_ALIGN(p, NETDEV_ALIGN);
-	dev->padded = (char *)dev - (char *)p;
+	dev->priv_len = sizeof_priv;
 
 	ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
 #ifdef CONFIG_PCPU_DEV_REFCNT
@@ -11015,6 +11107,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	dev->real_num_rx_queues = rxqs;
 	if (netif_alloc_rx_queues(dev))
 		goto free_all;
+	dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT);
+	if (!dev->ethtool)
+		goto free_all;
 
 	strcpy(dev->name, name);
 	dev->name_assign_type = name_assign_type;
@@ -11035,7 +11130,7 @@ free_pcpu:
 	free_percpu(dev->pcpu_refcnt);
 free_dev:
 #endif
-	netdev_freemem(dev);
+	kvfree(dev);
 	return NULL;
 }
 EXPORT_SYMBOL(alloc_netdev_mqs);
@@ -11065,6 +11160,7 @@ void free_netdev(struct net_device *dev)
 		return;
 	}
 
+	kfree(dev->ethtool);
 	netif_free_tx_queues(dev);
 	netif_free_rx_queues(dev);
@@ -11089,7 +11185,7 @@ void free_netdev(struct net_device *dev)
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED ||
 	    dev->reg_state == NETREG_DUMMY) {
-		netdev_freemem(dev);
+		kvfree(dev);
 		return;
 	}
@@ -11130,6 +11226,34 @@ void synchronize_net(void)
 }
 EXPORT_SYMBOL(synchronize_net);
 
+static void netdev_rss_contexts_free(struct net_device *dev)
+{
+	struct ethtool_rxfh_context *ctx;
+	unsigned long context;
+
+	mutex_lock(&dev->ethtool->rss_lock);
+	xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
+		struct ethtool_rxfh_param rxfh;
+
+		rxfh.indir = ethtool_rxfh_context_indir(ctx);
+		rxfh.key = ethtool_rxfh_context_key(ctx);
+		rxfh.hfunc = ctx->hfunc;
+		rxfh.input_xfrm = ctx->input_xfrm;
+		rxfh.rss_context = context;
+		rxfh.rss_delete = true;
+
+		xa_erase(&dev->ethtool->rss_ctx, context);
+		if (dev->ethtool_ops->create_rxfh_context)
+			dev->ethtool_ops->remove_rxfh_context(dev, ctx,
+							      context, NULL);
+		else
+			dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL);
+		kfree(ctx);
+	}
+	xa_destroy(&dev->ethtool->rss_ctx);
+	mutex_unlock(&dev->ethtool->rss_lock);
+}
+
 /**
  *	unregister_netdevice_queue - remove device from the kernel
  *	@dev: device
@@ -11233,11 +11357,15 @@ void unregister_netdevice_many_notify(struct list_head *head,
 		netdev_name_node_alt_flush(dev);
 		netdev_name_node_free(dev->name_node);
 
+		netdev_rss_contexts_free(dev);
+
 		call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
 
 		if (dev->netdev_ops->ndo_uninit)
 			dev->netdev_ops->ndo_uninit(dev);
 
+		mutex_destroy(&dev->ethtool->rss_lock);
+
 		if (skb)
 			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
@@ -11419,9 +11547,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
 
 	if (new_name[0]) {
 		/* Rename the netdev to prepared name */
-		write_seqlock(&netdev_rename_lock);
+		write_seqlock_bh(&netdev_rename_lock);
 		strscpy(dev->name, new_name, IFNAMSIZ);
-		write_sequnlock(&netdev_rename_lock);
+		write_sequnlock_bh(&netdev_rename_lock);
 	}
 
 	/* Fixup kobjects */