Diffstat (limited to 'net/core')
| -rw-r--r-- | net/core/Makefile                                       |   3 |
| -rw-r--r-- | net/core/bpf_sk_storage.c                               |  23 |
| -rw-r--r-- | net/core/datagram.c                                     |  19 |
| -rw-r--r-- | net/core/dev.c                                          | 462 |
| -rw-r--r-- | net/core/dev.h                                          |  24 |
| -rw-r--r-- | net/core/dev_addr_lists_test.c                          |  14 |
| -rw-r--r-- | net/core/drop_monitor.c                                 |  20 |
| -rw-r--r-- | net/core/dst_cache.c                                    |  11 |
| -rw-r--r-- | net/core/fib_rules.c                                    |  17 |
| -rw-r--r-- | net/core/filter.c                                       |  48 |
| -rw-r--r-- | net/core/flow_dissector.c                               |  20 |
| -rw-r--r-- | net/core/gro.c                                          |  31 |
| -rw-r--r-- | net/core/hotdata.c                                      |   7 |
| -rw-r--r-- | net/core/ieee8021q_helpers.c                            | 242 |
| -rw-r--r-- | net/core/neighbour.c                                    |  79 |
| -rw-r--r-- | net/core/net-procfs.c                                   |   3 |
| -rw-r--r-- | net/core/net-sysfs.c                                    |  16 |
| -rw-r--r-- | net/core/net_namespace.c                                |   5 |
| -rw-r--r-- | net/core/net_test.c (renamed from net/core/gso_test.c)  | 129 |
| -rw-r--r-- | net/core/netdev-genl-gen.c                              |   1 |
| -rw-r--r-- | net/core/netdev-genl.c                                  |  77 |
| -rw-r--r-- | net/core/netpoll.c                                      |   2 |
| -rw-r--r-- | net/core/page_pool.c                                    | 124 |
| -rw-r--r-- | net/core/rtnetlink.c                                    | 166 |
| -rw-r--r-- | net/core/scm.c                                          |  12 |
| -rw-r--r-- | net/core/skbuff.c                                       | 166 |
| -rw-r--r-- | net/core/sock.c                                         |  19 |
| -rw-r--r-- | net/core/sock_map.c                                     | 263 |
| -rw-r--r-- | net/core/sysctl_net_core.c                              |  22 |
29 files changed, 1466 insertions, 559 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 6e6548011fae..62be9aef2528 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_NETPOLL) += netpoll.o  obj-$(CONFIG_FIB_RULES) += fib_rules.o  obj-$(CONFIG_TRACEPOINTS) += net-traces.o  obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o  obj-$(CONFIG_NET_SELFTESTS) += selftests.o  obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o  obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o @@ -41,4 +42,4 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o  obj-$(CONFIG_BPF_SYSCALL) += sock_map.o  obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o  obj-$(CONFIG_OF)	+= of_net.o -obj-$(CONFIG_NET_TEST) += gso_test.o +obj-$(CONFIG_NET_TEST) += net_test.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 6c4d90b24d46..bc01b3aa6b0f 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -496,27 +496,22 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)  	if (!bpf_capable())  		return ERR_PTR(-EPERM); -	nla_for_each_nested(nla, nla_stgs, rem) { -		if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) { -			if (nla_len(nla) != sizeof(u32)) -				return ERR_PTR(-EINVAL); -			nr_maps++; -		} +	nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, +				 nla_stgs, rem) { +		if (nla_len(nla) != sizeof(u32)) +			return ERR_PTR(-EINVAL); +		nr_maps++;  	}  	diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL);  	if (!diag)  		return ERR_PTR(-ENOMEM); -	nla_for_each_nested(nla, nla_stgs, rem) { -		struct bpf_map *map; -		int map_fd; - -		if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) -			continue; +	nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, +				 nla_stgs, rem) { +		int map_fd = nla_get_u32(nla); +		struct bpf_map *map = bpf_map_get(map_fd); -		map_fd = nla_get_u32(nla); -		map = bpf_map_get(map_fd);  		if (IS_ERR(map)) {  			err = PTR_ERR(map);  			goto err_free; diff --git a/net/core/datagram.c b/net/core/datagram.c index a8b625abe242..e614cfd8e14a 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -324,25 +324,6 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)  }  EXPORT_SYMBOL(skb_free_datagram); -void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) -{ -	bool slow; - -	if (!skb_unref(skb)) { -		sk_peek_offset_bwd(sk, len); -		return; -	} - -	slow = lock_sock_fast(sk); -	sk_peek_offset_bwd(sk, len); -	skb_orphan(skb); -	unlock_sock_fast(sk, slow); - -	/* skb is now orphaned, can be freed outside of locked section */ -	__kfree_skb(skb); -} -EXPORT_SYMBOL(__skb_free_datagram_locked); -  int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,  			struct sk_buff *skb, unsigned int flags,  			void (*destructor)(struct sock *sk, diff --git a/net/core/dev.c b/net/core/dev.c index 331848eca7d3..e1bb6d7856d9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -77,7 +77,9 @@  #include <linux/hash.h>  #include <linux/slab.h>  #include <linux/sched.h> +#include <linux/sched/isolation.h>  #include <linux/sched/mm.h> +#include <linux/smpboot.h>  #include <linux/mutex.h>  #include <linux/rwsem.h>  #include <linux/string.h> @@ -197,35 +199,60 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)  	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];  } -static inline void rps_lock_irqsave(struct softnet_data *sd, -				    unsigned long *flags) +#ifndef CONFIG_PREEMPT_RT + +static 
DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); + +static int __init setup_backlog_napi_threads(char *arg) +{ +	static_branch_enable(&use_backlog_threads_key); +	return 0; +} +early_param("thread_backlog_napi", setup_backlog_napi_threads); + +static bool use_backlog_threads(void) +{ +	return static_branch_unlikely(&use_backlog_threads_key); +} + +#else + +static bool use_backlog_threads(void)  { -	if (IS_ENABLED(CONFIG_RPS)) +	return true; +} + +#endif + +static inline void backlog_lock_irq_save(struct softnet_data *sd, +					 unsigned long *flags) +{ +	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())  		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))  		local_irq_save(*flags);  } -static inline void rps_lock_irq_disable(struct softnet_data *sd) +static inline void backlog_lock_irq_disable(struct softnet_data *sd)  { -	if (IS_ENABLED(CONFIG_RPS)) +	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())  		spin_lock_irq(&sd->input_pkt_queue.lock);  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))  		local_irq_disable();  } -static inline void rps_unlock_irq_restore(struct softnet_data *sd, -					  unsigned long *flags) +static inline void backlog_unlock_irq_restore(struct softnet_data *sd, +					      unsigned long *flags)  { -	if (IS_ENABLED(CONFIG_RPS)) +	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())  		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))  		local_irq_restore(*flags);  } -static inline void rps_unlock_irq_enable(struct softnet_data *sd) +static inline void backlog_unlock_irq_enable(struct softnet_data *sd)  { -	if (IS_ENABLED(CONFIG_RPS)) +	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())  		spin_unlock_irq(&sd->input_pkt_queue.lock);  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))  		local_irq_enable(); @@ -912,6 +939,18 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id)  }  EXPORT_SYMBOL(dev_get_by_napi_id); +static DEFINE_SEQLOCK(netdev_rename_lock); + +void netdev_copy_name(struct net_device *dev, char *name) +{ +	unsigned int seq; + +	do { +		seq = read_seqbegin(&netdev_rename_lock); +		strscpy(name, dev->name, IFNAMSIZ); +	} while (read_seqretry(&netdev_rename_lock, seq)); +} +  /**   *	netdev_get_name - get a netdevice name, knowing its ifindex.   
*	@net: network namespace @@ -923,7 +962,6 @@ int netdev_get_name(struct net *net, char *name, int ifindex)  	struct net_device *dev;  	int ret; -	down_read(&devnet_rename_sem);  	rcu_read_lock();  	dev = dev_get_by_index_rcu(net, ifindex); @@ -932,12 +970,11 @@ int netdev_get_name(struct net *net, char *name, int ifindex)  		goto out;  	} -	strcpy(name, dev->name); +	netdev_copy_name(dev, name);  	ret = 0;  out:  	rcu_read_unlock(); -	up_read(&devnet_rename_sem);  	return ret;  } @@ -1189,7 +1226,10 @@ int dev_change_name(struct net_device *dev, const char *newname)  	memcpy(oldname, dev->name, IFNAMSIZ); +	write_seqlock(&netdev_rename_lock);  	err = dev_get_valid_name(net, dev, newname); +	write_sequnlock(&netdev_rename_lock); +  	if (err < 0) {  		up_write(&devnet_rename_sem);  		return err; @@ -1229,7 +1269,9 @@ rollback:  		if (err >= 0) {  			err = ret;  			down_write(&devnet_rename_sem); +			write_seqlock(&netdev_rename_lock);  			memcpy(dev->name, oldname, IFNAMSIZ); +			write_sequnlock(&netdev_rename_lock);  			memcpy(oldname, newname, IFNAMSIZ);  			WRITE_ONCE(dev->name_assign_type, old_assign_type);  			old_assign_type = NET_NAME_RENAMED; @@ -2057,6 +2099,11 @@ void net_dec_egress_queue(void)  EXPORT_SYMBOL_GPL(net_dec_egress_queue);  #endif +#ifdef CONFIG_NET_CLS_ACT +DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); +EXPORT_SYMBOL(tcf_bypass_check_needed_key); +#endif +  DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);  EXPORT_SYMBOL(netstamp_needed_key);  #ifdef CONFIG_JUMP_LABEL @@ -3917,6 +3964,11 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,  	if (!miniq)  		return ret; +	if (static_branch_unlikely(&tcf_bypass_check_needed_key)) { +		if (tcf_block_bypass_sw(miniq->block)) +			return ret; +	} +  	tc_skb_cb(skb)->mru = 0;  	tc_skb_cb(skb)->post_ct = false;  	tcf_set_drop_reason(skb, *drop_reason); @@ -4410,8 +4462,8 @@ EXPORT_SYMBOL(__dev_direct_xmit);  /*************************************************************************   *			Receiver routines   *************************************************************************/ +static DEFINE_PER_CPU(struct task_struct *, backlog_napi); -unsigned int sysctl_skb_defer_max __read_mostly = 64;  int weight_p __read_mostly = 64;           /* old backlog weight */  int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */  int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */ @@ -4433,18 +4485,16 @@ static inline void ____napi_schedule(struct softnet_data *sd,  		 */  		thread = READ_ONCE(napi->thread);  		if (thread) { -			/* Avoid doing set_bit() if the thread is in -			 * INTERRUPTIBLE state, cause napi_thread_wait() -			 * makes sure to proceed with napi polling -			 * if the thread is explicitly woken from here. 
-			 */ -			if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) -				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); +			if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) +				goto use_local_napi; + +			set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);  			wake_up_process(thread);  			return;  		}  	} +use_local_napi:  	list_add_tail(&napi->poll_list, &sd->poll_list);  	WRITE_ONCE(napi->list_owner, smp_processor_id());  	/* If not called from net_rx_action() @@ -4470,7 +4520,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,  		struct netdev_rx_queue *rxqueue;  		struct rps_dev_flow_table *flow_table;  		struct rps_dev_flow *old_rflow; -		u32 flow_id; +		u32 flow_id, head;  		u16 rxq_index;  		int rc; @@ -4493,16 +4543,16 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,  			goto out;  		old_rflow = rflow;  		rflow = &flow_table->flows[flow_id]; -		rflow->filter = rc; -		if (old_rflow->filter == rflow->filter) -			old_rflow->filter = RPS_NO_FILTER; +		WRITE_ONCE(rflow->filter, rc); +		if (old_rflow->filter == rc) +			WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);  	out:  #endif -		rflow->last_qtail = -			per_cpu(softnet_data, next_cpu).input_queue_head; +		head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); +		rps_input_queue_tail_save(&rflow->last_qtail, head);  	} -	rflow->cpu = next_cpu; +	WRITE_ONCE(rflow->cpu, next_cpu);  	return rflow;  } @@ -4581,7 +4631,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  		 */  		if (unlikely(tcpu != next_cpu) &&  		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || -		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head - +		     ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -  		      rflow->last_qtail)) >= 0)) {  			tcpu = next_cpu;  			rflow = set_rps_cpu(dev, skb, rflow, next_cpu); @@ -4635,9 +4685,9 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,  	if (flow_table && flow_id <= flow_table->mask) {  		rflow = &flow_table->flows[flow_id];  		cpu = READ_ONCE(rflow->cpu); -		if (rflow->filter == filter_id && cpu < nr_cpu_ids && -		    ((int)(per_cpu(softnet_data, cpu).input_queue_head - -			   rflow->last_qtail) < +		if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && +		    ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - +			   READ_ONCE(rflow->last_qtail)) <  		     (int)(10 * flow_table->mask)))  			expire = false;  	} @@ -4684,6 +4734,11 @@ static void napi_schedule_rps(struct softnet_data *sd)  #ifdef CONFIG_RPS  	if (sd != mysd) { +		if (use_backlog_threads()) { +			__napi_schedule_irqoff(&sd->backlog); +			return; +		} +  		sd->rps_ipi_next = mysd->rps_ipi_list;  		mysd->rps_ipi_list = sd; @@ -4698,6 +4753,23 @@ static void napi_schedule_rps(struct softnet_data *sd)  	__napi_schedule_irqoff(&mysd->backlog);  } +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +{ +	unsigned long flags; + +	if (use_backlog_threads()) { +		backlog_lock_irq_save(sd, &flags); + +		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) +			__napi_schedule_irqoff(&sd->backlog); + +		backlog_unlock_irq_restore(sd, &flags); + +	} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { +		smp_call_function_single_async(cpu, &sd->defer_csd); +	} +} +  #ifdef CONFIG_NET_FLOW_LIMIT  int netdev_flow_limit_table_len __read_mostly = (1 << 12);  #endif @@ -4749,37 +4821,45 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,  	struct softnet_data *sd;  	unsigned long flags;  	unsigned int qlen; +	
int max_backlog; +	u32 tail; -	reason = SKB_DROP_REASON_NOT_SPECIFIED; +	reason = SKB_DROP_REASON_DEV_READY; +	if (!netif_running(skb->dev)) +		goto bad_dev; + +	reason = SKB_DROP_REASON_CPU_BACKLOG;  	sd = &per_cpu(softnet_data, cpu); -	rps_lock_irqsave(sd, &flags); -	if (!netif_running(skb->dev)) -		goto drop; +	qlen = skb_queue_len_lockless(&sd->input_pkt_queue); +	max_backlog = READ_ONCE(net_hotdata.max_backlog); +	if (unlikely(qlen > max_backlog)) +		goto cpu_backlog_drop; +	backlog_lock_irq_save(sd, &flags);  	qlen = skb_queue_len(&sd->input_pkt_queue); -	if (qlen <= READ_ONCE(net_hotdata.max_backlog) && -	    !skb_flow_limit(skb, qlen)) { -		if (qlen) { -enqueue: -			__skb_queue_tail(&sd->input_pkt_queue, skb); -			input_queue_tail_incr_save(sd, qtail); -			rps_unlock_irq_restore(sd, &flags); -			return NET_RX_SUCCESS; +	if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { +		if (!qlen) { +			/* Schedule NAPI for backlog device. We can use +			 * non atomic operation as we own the queue lock. +			 */ +			if (!__test_and_set_bit(NAPI_STATE_SCHED, +						&sd->backlog.state)) +				napi_schedule_rps(sd);  		} +		__skb_queue_tail(&sd->input_pkt_queue, skb); +		tail = rps_input_queue_tail_incr(sd); +		backlog_unlock_irq_restore(sd, &flags); -		/* Schedule NAPI for backlog device -		 * We can use non atomic operation since we own the queue lock -		 */ -		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) -			napi_schedule_rps(sd); -		goto enqueue; +		/* save the tail outside of the critical section */ +		rps_input_queue_tail_save(qtail, tail); +		return NET_RX_SUCCESS;  	} -	reason = SKB_DROP_REASON_CPU_BACKLOG; -drop: -	sd->dropped++; -	rps_unlock_irq_restore(sd, &flags); +	backlog_unlock_irq_restore(sd, &flags); +cpu_backlog_drop: +	atomic_inc(&sd->dropped); +bad_dev:  	dev_core_stats_rx_dropped_inc(skb->dev);  	kfree_skb_reason(skb, reason);  	return NET_RX_DROP; @@ -5844,21 +5924,21 @@ static void flush_backlog(struct work_struct *work)  	local_bh_disable();  	sd = this_cpu_ptr(&softnet_data); -	rps_lock_irq_disable(sd); +	backlog_lock_irq_disable(sd);  	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {  		if (skb->dev->reg_state == NETREG_UNREGISTERING) {  			__skb_unlink(skb, &sd->input_pkt_queue);  			dev_kfree_skb_irq(skb); -			input_queue_head_incr(sd); +			rps_input_queue_head_incr(sd);  		}  	} -	rps_unlock_irq_enable(sd); +	backlog_unlock_irq_enable(sd);  	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {  		if (skb->dev->reg_state == NETREG_UNREGISTERING) {  			__skb_unlink(skb, &sd->process_queue);  			kfree_skb(skb); -			input_queue_head_incr(sd); +			rps_input_queue_head_incr(sd);  		}  	}  	local_bh_enable(); @@ -5870,14 +5950,14 @@ static bool flush_required(int cpu)  	struct softnet_data *sd = &per_cpu(softnet_data, cpu);  	bool do_flush; -	rps_lock_irq_disable(sd); +	backlog_lock_irq_disable(sd);  	/* as insertion into process_queue happens with the rps lock held,  	 * process_queue access may race only with dequeue  	 */  	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||  		   !skb_queue_empty_lockless(&sd->process_queue); -	rps_unlock_irq_enable(sd); +	backlog_unlock_irq_enable(sd);  	return do_flush;  #endif @@ -5943,7 +6023,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)  #ifdef CONFIG_RPS  	struct softnet_data *remsd = sd->rps_ipi_list; -	if (remsd) { +	if (!use_backlog_threads() && remsd) {  		sd->rps_ipi_list = NULL;  		local_irq_enable(); @@ -5958,7 +6038,7 @@ static void net_rps_action_and_irq_enable(struct 
softnet_data *sd)  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)  {  #ifdef CONFIG_RPS -	return sd->rps_ipi_list != NULL; +	return !use_backlog_threads() && sd->rps_ipi_list;  #else  	return false;  #endif @@ -5986,13 +6066,14 @@ static int process_backlog(struct napi_struct *napi, int quota)  			rcu_read_lock();  			__netif_receive_skb(skb);  			rcu_read_unlock(); -			input_queue_head_incr(sd); -			if (++work >= quota) +			if (++work >= quota) { +				rps_input_queue_head_add(sd, work);  				return work; +			}  		} -		rps_lock_irq_disable(sd); +		backlog_lock_irq_disable(sd);  		if (skb_queue_empty(&sd->input_pkt_queue)) {  			/*  			 * Inline a custom version of __napi_complete(). @@ -6002,15 +6083,17 @@ static int process_backlog(struct napi_struct *napi, int quota)  			 * We can use a plain write instead of clear_bit(),  			 * and we dont need an smp_mb() memory barrier.  			 */ -			napi->state = 0; +			napi->state &= NAPIF_STATE_THREADED;  			again = false;  		} else {  			skb_queue_splice_tail_init(&sd->input_pkt_queue,  						   &sd->process_queue);  		} -		rps_unlock_irq_enable(sd); +		backlog_unlock_irq_enable(sd);  	} +	if (work) +		rps_input_queue_head_add(sd, work);  	return work;  } @@ -6447,7 +6530,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded)  		}  	} -	dev->threaded = threaded; +	WRITE_ONCE(dev->threaded, threaded);  	/* Make sure kthread is created before THREADED bit  	 * is set. @@ -6538,7 +6621,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,  	 * threaded mode will not be enabled in napi_enable().  	 */  	if (dev->threaded && napi_kthread_create(napi)) -		dev->threaded = 0; +		dev->threaded = false;  	netif_napi_set_irq(napi, -1);  }  EXPORT_SYMBOL(netif_napi_add_weight); @@ -6716,8 +6799,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)  static int napi_thread_wait(struct napi_struct *napi)  { -	bool woken = false; -  	set_current_state(TASK_INTERRUPTIBLE);  	while (!kthread_should_stop()) { @@ -6726,15 +6807,13 @@ static int napi_thread_wait(struct napi_struct *napi)  		 * Testing SCHED bit is not enough because SCHED bit might be  		 * set by some other busy poll thread or by napi_disable().  		 */ -		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { +		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {  			WARN_ON(!list_empty(&napi->poll_list));  			__set_current_state(TASK_RUNNING);  			return 0;  		}  		schedule(); -		/* woken being true indicates this thread owns this napi. 
*/ -		woken = true;  		set_current_state(TASK_INTERRUPTIBLE);  	}  	__set_current_state(TASK_RUNNING); @@ -6742,43 +6821,48 @@ static int napi_thread_wait(struct napi_struct *napi)  	return -1;  } -static int napi_threaded_poll(void *data) +static void napi_threaded_poll_loop(struct napi_struct *napi)  { -	struct napi_struct *napi = data;  	struct softnet_data *sd; -	void *have; +	unsigned long last_qs = jiffies; -	while (!napi_thread_wait(napi)) { -		unsigned long last_qs = jiffies; - -		for (;;) { -			bool repoll = false; +	for (;;) { +		bool repoll = false; +		void *have; -			local_bh_disable(); -			sd = this_cpu_ptr(&softnet_data); -			sd->in_napi_threaded_poll = true; +		local_bh_disable(); +		sd = this_cpu_ptr(&softnet_data); +		sd->in_napi_threaded_poll = true; -			have = netpoll_poll_lock(napi); -			__napi_poll(napi, &repoll); -			netpoll_poll_unlock(have); +		have = netpoll_poll_lock(napi); +		__napi_poll(napi, &repoll); +		netpoll_poll_unlock(have); -			sd->in_napi_threaded_poll = false; -			barrier(); +		sd->in_napi_threaded_poll = false; +		barrier(); -			if (sd_has_rps_ipi_waiting(sd)) { -				local_irq_disable(); -				net_rps_action_and_irq_enable(sd); -			} -			skb_defer_free_flush(sd); -			local_bh_enable(); +		if (sd_has_rps_ipi_waiting(sd)) { +			local_irq_disable(); +			net_rps_action_and_irq_enable(sd); +		} +		skb_defer_free_flush(sd); +		local_bh_enable(); -			if (!repoll) -				break; +		if (!repoll) +			break; -			rcu_softirq_qs_periodic(last_qs); -			cond_resched(); -		} +		rcu_softirq_qs_periodic(last_qs); +		cond_resched();  	} +} + +static int napi_threaded_poll(void *data) +{ +	struct napi_struct *napi = data; + +	while (!napi_thread_wait(napi)) +		napi_threaded_poll_loop(napi); +  	return 0;  } @@ -8459,27 +8543,29 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)  static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)  {  	unsigned int old_flags = dev->flags; +	unsigned int promiscuity, flags;  	kuid_t uid;  	kgid_t gid;  	ASSERT_RTNL(); -	dev->flags |= IFF_PROMISC; -	dev->promiscuity += inc; -	if (dev->promiscuity == 0) { +	promiscuity = dev->promiscuity + inc; +	if (promiscuity == 0) {  		/*  		 * Avoid overflow.  		 * If inc causes overflow, untouch promisc and return error.  		 */ -		if (inc < 0) -			dev->flags &= ~IFF_PROMISC; -		else { -			dev->promiscuity -= inc; +		if (unlikely(inc > 0)) {  			netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");  			return -EOVERFLOW;  		} +		flags = old_flags & ~IFF_PROMISC; +	} else { +		flags = old_flags | IFF_PROMISC;  	} -	if (dev->flags != old_flags) { +	WRITE_ONCE(dev->promiscuity, promiscuity); +	if (flags != old_flags) { +		WRITE_ONCE(dev->flags, flags);  		netdev_info(dev, "%s promiscuous mode\n",  			    dev->flags & IFF_PROMISC ? "entered" : "left");  		if (audit_enabled) { @@ -8530,25 +8616,27 @@ EXPORT_SYMBOL(dev_set_promiscuity);  static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)  {  	unsigned int old_flags = dev->flags, old_gflags = dev->gflags; +	unsigned int allmulti, flags;  	ASSERT_RTNL(); -	dev->flags |= IFF_ALLMULTI; -	dev->allmulti += inc; -	if (dev->allmulti == 0) { +	allmulti = dev->allmulti + inc; +	if (allmulti == 0) {  		/*  		 * Avoid overflow.  		 * If inc causes overflow, untouch allmulti and return error.  		 
*/ -		if (inc < 0) -			dev->flags &= ~IFF_ALLMULTI; -		else { -			dev->allmulti -= inc; +		if (unlikely(inc > 0)) {  			netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");  			return -EOVERFLOW;  		} +		flags = old_flags & ~IFF_ALLMULTI; +	} else { +		flags = old_flags | IFF_ALLMULTI;  	} -	if (dev->flags ^ old_flags) { +	WRITE_ONCE(dev->allmulti, allmulti); +	if (flags != old_flags) { +		WRITE_ONCE(dev->flags, flags);  		netdev_info(dev, "%s allmulticast mode\n",  			    dev->flags & IFF_ALLMULTI ? "entered" : "left");  		dev_change_rx_flags(dev, IFF_ALLMULTI); @@ -8874,7 +8962,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)  		return -ERANGE;  	if (new_len != orig_len) { -		dev->tx_queue_len = new_len; +		WRITE_ONCE(dev->tx_queue_len, new_len);  		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);  		res = notifier_to_errno(res);  		if (res) @@ -8888,7 +8976,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)  err_rollback:  	netdev_err(dev, "refused to change device tx_queue_len\n"); -	dev->tx_queue_len = orig_len; +	WRITE_ONCE(dev->tx_queue_len, orig_len);  	return res;  } @@ -9134,7 +9222,7 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)  		netif_carrier_off(dev);  	else  		netif_carrier_on(dev); -	dev->proto_down = proto_down; +	WRITE_ONCE(dev->proto_down, proto_down);  	return 0;  } @@ -9148,18 +9236,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)  void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,  				  u32 value)  { +	u32 proto_down_reason;  	int b;  	if (!mask) { -		dev->proto_down_reason = value; +		proto_down_reason = value;  	} else { +		proto_down_reason = dev->proto_down_reason;  		for_each_set_bit(b, &mask, 32) {  			if (value & (1 << b)) -				dev->proto_down_reason |= BIT(b); +				proto_down_reason |= BIT(b);  			else -				dev->proto_down_reason &= ~BIT(b); +				proto_down_reason &= ~BIT(b);  		}  	} +	WRITE_ONCE(dev->proto_down_reason, proto_down_reason);  }  struct bpf_xdp_link { @@ -10349,25 +10440,12 @@ err_free_name:  }  EXPORT_SYMBOL(register_netdevice); -/** - *	init_dummy_netdev	- init a dummy network device for NAPI - *	@dev: device to init - * - *	This takes a network device structure and initialize the minimum - *	amount of fields so it can be used to schedule NAPI polls without - *	registering a full blown interface. This is to be used by drivers - *	that need to tie several hardware interfaces to a single NAPI - *	poll scheduler due to HW limitations. +/* Initialize the core of a dummy net device. + * This is useful if you are calling this function after alloc_netdev(), + * since it does not memset the net_device fields.   */ -void init_dummy_netdev(struct net_device *dev) +static void init_dummy_netdev_core(struct net_device *dev)  { -	/* Clear everything. Note we don't initialize spinlocks -	 * are they aren't supposed to be taken by any of the -	 * NAPI code and this dummy netdev is supposed to be -	 * only ever used for NAPI polls -	 */ -	memset(dev, 0, sizeof(struct net_device)); -  	/* make sure we BUG if trying to hit standard  	 * register/unregister code path  	 */ @@ -10388,8 +10466,28 @@ void init_dummy_netdev(struct net_device *dev)  	 * its refcount.  	 
*/  } -EXPORT_SYMBOL_GPL(init_dummy_netdev); +/** + *	init_dummy_netdev	- init a dummy network device for NAPI + *	@dev: device to init + * + *	This takes a network device structure and initializes the minimum + *	amount of fields so it can be used to schedule NAPI polls without + *	registering a full blown interface. This is to be used by drivers + *	that need to tie several hardware interfaces to a single NAPI + *	poll scheduler due to HW limitations. + */ +void init_dummy_netdev(struct net_device *dev) +{ +	/* Clear everything. Note we don't initialize spinlocks +	 * as they aren't supposed to be taken by any of the +	 * NAPI code and this dummy netdev is supposed to be +	 * only ever used for NAPI polls +	 */ +	memset(dev, 0, sizeof(struct net_device)); +	init_dummy_netdev_core(dev); +} +EXPORT_SYMBOL_GPL(init_dummy_netdev);  /**   *	register_netdev	- register a network device @@ -10488,8 +10586,9 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list)  			rebroadcast_time = jiffies;  		} +		rcu_barrier(); +  		if (!wait) { -			rcu_barrier();  			wait = WAIT_REFS_MIN_MSECS;  		} else {  			msleep(wait); @@ -10987,7 +11086,8 @@ void free_netdev(struct net_device *dev)  	dev->xdp_bulkq = NULL;  	/*  Compatibility with error handling in drivers */ -	if (dev->reg_state == NETREG_UNINITIALIZED) { +	if (dev->reg_state == NETREG_UNINITIALIZED || +	    dev->reg_state == NETREG_DUMMY) {  		netdev_freemem(dev);  		return;  	} @@ -11001,6 +11101,19 @@ void free_netdev(struct net_device *dev)  EXPORT_SYMBOL(free_netdev);  /** + * alloc_netdev_dummy - Allocate and initialize a dummy net device. + * @sizeof_priv: size of private data to allocate space for + * + * Return: the allocated net_device on success, NULL otherwise + */ +struct net_device *alloc_netdev_dummy(int sizeof_priv) +{ +	return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, +			    init_dummy_netdev_core); +} +EXPORT_SYMBOL_GPL(alloc_netdev_dummy); + +/**   *	synchronize_net -  Synchronize with packet receive processing   *   *	Wait for packets currently being received to be done. 
@@ -11303,8 +11416,12 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,  	dev_net_set(dev, net);  	dev->ifindex = new_ifindex; -	if (new_name[0]) /* Rename the netdev to prepared name */ +	if (new_name[0]) { +		/* Rename the netdev to prepared name */ +		write_seqlock(&netdev_rename_lock);  		strscpy(dev->name, new_name, IFNAMSIZ); +		write_sequnlock(&netdev_rename_lock); +	}  	/* Fixup kobjects */  	dev_set_uevent_suppress(&dev->dev, 1); @@ -11379,7 +11496,7 @@ static int dev_cpu_dead(unsigned int oldcpu)  		list_del_init(&napi->poll_list);  		if (napi->poll == process_backlog) -			napi->state = 0; +			napi->state &= NAPIF_STATE_THREADED;  		else  			____napi_schedule(sd, napi);  	} @@ -11387,21 +11504,23 @@ static int dev_cpu_dead(unsigned int oldcpu)  	raise_softirq_irqoff(NET_TX_SOFTIRQ);  	local_irq_enable(); +	if (!use_backlog_threads()) {  #ifdef CONFIG_RPS -	remsd = oldsd->rps_ipi_list; -	oldsd->rps_ipi_list = NULL; +		remsd = oldsd->rps_ipi_list; +		oldsd->rps_ipi_list = NULL;  #endif -	/* send out pending IPI's on offline CPU */ -	net_rps_send_ipi(remsd); +		/* send out pending IPI's on offline CPU */ +		net_rps_send_ipi(remsd); +	}  	/* Process offline CPU's input_pkt_queue */  	while ((skb = __skb_dequeue(&oldsd->process_queue))) {  		netif_rx(skb); -		input_queue_head_incr(oldsd); +		rps_input_queue_head_incr(oldsd);  	}  	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {  		netif_rx(skb); -		input_queue_head_incr(oldsd); +		rps_input_queue_head_incr(oldsd);  	}  	return 0; @@ -11718,7 +11837,7 @@ static int net_page_pool_create(int cpuid)  	struct page_pool_params page_pool_params = {  		.pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,  		.flags = PP_FLAG_SYSTEM_POOL, -		.nid = NUMA_NO_NODE, +		.nid = cpu_to_mem(cpuid),  	};  	struct page_pool *pp_ptr; @@ -11731,6 +11850,38 @@ static int net_page_pool_create(int cpuid)  	return 0;  } +static int backlog_napi_should_run(unsigned int cpu) +{ +	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); +	struct napi_struct *napi = &sd->backlog; + +	return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); +} + +static void run_backlog_napi(unsigned int cpu) +{ +	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + +	napi_threaded_poll_loop(&sd->backlog); +} + +static void backlog_napi_setup(unsigned int cpu) +{ +	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); +	struct napi_struct *napi = &sd->backlog; + +	napi->thread = this_cpu_read(backlog_napi); +	set_bit(NAPI_STATE_THREADED, &napi->state); +} + +static struct smp_hotplug_thread backlog_threads = { +	.store			= &backlog_napi, +	.thread_should_run	= backlog_napi_should_run, +	.thread_fn		= run_backlog_napi, +	.thread_comm		= "backlog_napi/%u", +	.setup			= backlog_napi_setup, +}; +  /*   *       This is called single threaded during boot, so no need   *       to take the rtnl semaphore. 
@@ -11782,10 +11933,13 @@ static int __init net_dev_init(void)  		init_gro_hash(&sd->backlog);  		sd->backlog.poll = process_backlog;  		sd->backlog.weight = weight_p; +		INIT_LIST_HEAD(&sd->backlog.poll_list);  		if (net_page_pool_create(i))  			goto out;  	} +	if (use_backlog_threads()) +		smpboot_register_percpu_thread(&backlog_threads);  	dev_boot_phase = 0; @@ -11811,6 +11965,10 @@ static int __init net_dev_init(void)  				       NULL, dev_cpu_dead);  	WARN_ON(rc < 0);  	rc = 0; + +	/* avoid static key IPIs to isolated CPUs */ +	if (housekeeping_enabled(HK_TYPE_MISC)) +		net_enable_timestamp();  out:  	if (rc < 0) {  		for_each_possible_cpu(i) { diff --git a/net/core/dev.h b/net/core/dev.h index 2bcaf8eee50c..b7b518bc2be5 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -4,11 +4,9 @@  #include <linux/types.h>  #include <linux/rwsem.h> +#include <linux/netdevice.h>  struct net; -struct net_device; -struct netdev_bpf; -struct netdev_phys_item_id;  struct netlink_ext_ack;  struct cpumask; @@ -38,7 +36,6 @@ int dev_addr_init(struct net_device *dev);  void dev_addr_check(struct net_device *dev);  /* sysctls not referred to from outside net/core/ */ -extern unsigned int	sysctl_skb_defer_max;  extern int		netdev_unregister_timeout_secs;  extern int		weight_p;  extern int		dev_weight_rx_bias; @@ -150,4 +147,23 @@ static inline void xdp_do_check_flushed(struct napi_struct *napi) { }  #endif  struct napi_struct *napi_by_id(unsigned int napi_id); +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); + +#define XMIT_RECURSION_LIMIT	8 +static inline bool dev_xmit_recursion(void) +{ +	return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > +			XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ +	__this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ +	__this_cpu_dec(softnet_data.xmit.recursion); +} +  #endif diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c index 4dbd0dc6aea2..8e1dba825e94 100644 --- a/net/core/dev_addr_lists_test.c +++ b/net/core/dev_addr_lists_test.c @@ -49,7 +49,6 @@ static int dev_addr_test_init(struct kunit *test)  		KUNIT_FAIL(test, "Can't register netdev %d", err);  	} -	rtnl_lock();  	return 0;  } @@ -57,7 +56,6 @@ static void dev_addr_test_exit(struct kunit *test)  {  	struct net_device *netdev = test->priv; -	rtnl_unlock();  	unregister_netdev(netdev);  	free_netdev(netdev);  } @@ -67,6 +65,7 @@ static void dev_addr_test_basic(struct kunit *test)  	struct net_device *netdev = test->priv;  	u8 addr[ETH_ALEN]; +	rtnl_lock();  	KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr);  	memset(addr, 2, sizeof(addr)); @@ -76,6 +75,7 @@ static void dev_addr_test_basic(struct kunit *test)  	memset(addr, 3, sizeof(addr));  	dev_addr_set(netdev, addr);  	KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr)); +	rtnl_unlock();  }  static void dev_addr_test_sync_one(struct kunit *test) @@ -86,6 +86,7 @@ static void dev_addr_test_sync_one(struct kunit *test)  	datp = netdev_priv(netdev); +	rtnl_lock();  	memset(addr, 1, sizeof(addr));  	eth_hw_addr_set(netdev, addr); @@ -103,6 +104,7 @@ static void dev_addr_test_sync_one(struct kunit *test)  	 * considered synced and we overwrite in place.  	 
*/  	KUNIT_EXPECT_EQ(test, 0, datp->addr_seen); +	rtnl_unlock();  }  static void dev_addr_test_add_del(struct kunit *test) @@ -114,6 +116,7 @@ static void dev_addr_test_add_del(struct kunit *test)  	datp = netdev_priv(netdev); +	rtnl_lock();  	for (i = 1; i < 4; i++) {  		memset(addr, i, sizeof(addr));  		KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, @@ -143,6 +146,7 @@ static void dev_addr_test_add_del(struct kunit *test)  	__hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,  			   dev_addr_test_unsync);  	KUNIT_EXPECT_EQ(test, 1, datp->addr_seen); +	rtnl_unlock();  }  static void dev_addr_test_del_main(struct kunit *test) @@ -150,6 +154,7 @@ static void dev_addr_test_del_main(struct kunit *test)  	struct net_device *netdev = test->priv;  	u8 addr[ETH_ALEN]; +	rtnl_lock();  	memset(addr, 1, sizeof(addr));  	eth_hw_addr_set(netdev, addr); @@ -161,6 +166,7 @@ static void dev_addr_test_del_main(struct kunit *test)  					      NETDEV_HW_ADDR_T_LAN));  	KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr,  						    NETDEV_HW_ADDR_T_LAN)); +	rtnl_unlock();  }  static void dev_addr_test_add_set(struct kunit *test) @@ -172,6 +178,7 @@ static void dev_addr_test_add_set(struct kunit *test)  	datp = netdev_priv(netdev); +	rtnl_lock();  	/* There is no external API like dev_addr_add_excl(),  	 * so shuffle the tree a little bit and exploit aliasing.  	 */ @@ -191,6 +198,7 @@ static void dev_addr_test_add_set(struct kunit *test)  	__hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,  			   dev_addr_test_unsync);  	KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen); +	rtnl_unlock();  }  static void dev_addr_test_add_excl(struct kunit *test) @@ -199,6 +207,7 @@ static void dev_addr_test_add_excl(struct kunit *test)  	u8 addr[ETH_ALEN];  	int i; +	rtnl_lock();  	for (i = 0; i < 10; i++) {  		memset(addr, i, sizeof(addr));  		KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr)); @@ -213,6 +222,7 @@ static void dev_addr_test_add_excl(struct kunit *test)  		memset(addr, i, sizeof(addr));  		KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr));  	} +	rtnl_unlock();  }  static struct kunit_case dev_addr_test_cases[] = { diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index b0f221d658be..430ed18f8584 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -74,7 +74,7 @@ struct net_dm_hw_entries {  };  struct per_cpu_dm_data { -	spinlock_t		lock;	/* Protects 'skb', 'hw_entries' and +	raw_spinlock_t		lock;	/* Protects 'skb', 'hw_entries' and  					 * 'send_timer'  					 */  	union { @@ -168,9 +168,9 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)  err:  	mod_timer(&data->send_timer, jiffies + HZ / 10);  out: -	spin_lock_irqsave(&data->lock, flags); +	raw_spin_lock_irqsave(&data->lock, flags);  	swap(data->skb, skb); -	spin_unlock_irqrestore(&data->lock, flags); +	raw_spin_unlock_irqrestore(&data->lock, flags);  	if (skb) {  		struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; @@ -225,7 +225,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location)  	local_irq_save(flags);  	data = this_cpu_ptr(&dm_cpu_data); -	spin_lock(&data->lock); +	raw_spin_lock(&data->lock);  	dskb = data->skb;  	if (!dskb) @@ -259,7 +259,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location)  	}  out: -	spin_unlock_irqrestore(&data->lock, flags); +	raw_spin_unlock_irqrestore(&data->lock, flags);  }  static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, @@ -314,9 +314,9 @@ 
net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data)  		mod_timer(&hw_data->send_timer, jiffies + HZ / 10);  	} -	spin_lock_irqsave(&hw_data->lock, flags); +	raw_spin_lock_irqsave(&hw_data->lock, flags);  	swap(hw_data->hw_entries, hw_entries); -	spin_unlock_irqrestore(&hw_data->lock, flags); +	raw_spin_unlock_irqrestore(&hw_data->lock, flags);  	return hw_entries;  } @@ -448,7 +448,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,  		return;  	hw_data = this_cpu_ptr(&dm_hw_cpu_data); -	spin_lock_irqsave(&hw_data->lock, flags); +	raw_spin_lock_irqsave(&hw_data->lock, flags);  	hw_entries = hw_data->hw_entries;  	if (!hw_entries) @@ -477,7 +477,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,  	}  out: -	spin_unlock_irqrestore(&hw_data->lock, flags); +	raw_spin_unlock_irqrestore(&hw_data->lock, flags);  }  static const struct net_dm_alert_ops net_dm_alert_summary_ops = { @@ -1673,7 +1673,7 @@ static struct notifier_block dropmon_net_notifier = {  static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data)  { -	spin_lock_init(&data->lock); +	raw_spin_lock_init(&data->lock);  	skb_queue_head_init(&data->drop_queue);  	u64_stats_init(&data->stats.syncp);  } diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 0ccfd5fa5cb9..6a0482e676d3 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -47,7 +47,8 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,  	/* the cache already hold a dst reference; it can't go away */  	dst_hold(dst); -	if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || +	if (unlikely(!time_after(idst->refresh_ts, +				 READ_ONCE(dst_cache->reset_ts)) ||  		     (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {  		dst_cache_per_cpu_dst_set(idst, NULL, 0);  		dst_release(dst); @@ -83,7 +84,7 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)  		return NULL;  	*saddr = idst->in_saddr.s_addr; -	return container_of(dst, struct rtable, dst); +	return dst_rtable(dst);  }  EXPORT_SYMBOL_GPL(dst_cache_get_ip4); @@ -111,8 +112,8 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,  		return;  	idst = this_cpu_ptr(dst_cache->cache); -	dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, -				  rt6_get_cookie((struct rt6_info *)dst)); +	dst_cache_per_cpu_dst_set(idst, dst, +				  rt6_get_cookie(dst_rt6_info(dst)));  	idst->in6_saddr = *saddr;  }  EXPORT_SYMBOL_GPL(dst_cache_set_ip6); @@ -170,7 +171,7 @@ void dst_cache_reset_now(struct dst_cache *dst_cache)  	if (!dst_cache->cache)  		return; -	dst_cache->reset_ts = jiffies; +	dst_cache_reset(dst_cache);  	for_each_possible_cpu(i) {  		struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i);  		struct dst_entry *dst = idst->dst; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 3f933ffcefc3..6ebffbc63236 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1142,10 +1142,10 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)  	const struct nlmsghdr *nlh = cb->nlh;  	struct net *net = sock_net(skb->sk);  	struct fib_rules_ops *ops; -	int idx = 0, family; +	int err, idx = 0, family;  	if (cb->strict_check) { -		int err = fib_valid_dumprule_req(nlh, cb->extack); +		err = fib_valid_dumprule_req(nlh, cb->extack);  		if (err < 0)  			return err; @@ -1158,17 +1158,17 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)  		if (ops == NULL)  			return 
-EAFNOSUPPORT; -		dump_rules(skb, cb, ops); - -		return skb->len; +		return dump_rules(skb, cb, ops);  	} +	err = 0;  	rcu_read_lock();  	list_for_each_entry_rcu(ops, &net->rules_ops, list) {  		if (idx < cb->args[0] || !try_module_get(ops->owner))  			goto skip; -		if (dump_rules(skb, cb, ops) < 0) +		err = dump_rules(skb, cb, ops); +		if (err < 0)  			break;  		cb->args[1] = 0; @@ -1178,7 +1178,7 @@ skip:  	rcu_read_unlock();  	cb->args[0] = idx; -	return skb->len; +	return err;  }  static void notify_rule_change(int event, struct fib_rule *rule, @@ -1293,7 +1293,8 @@ static int __init fib_rules_init(void)  	int err;  	rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0);  	rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); -	rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); +	rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, +		      RTNL_FLAG_DUMP_UNLOCKED);  	err = register_pernet_subsys(&fib_rules_net_ops);  	if (err < 0) diff --git a/net/core/filter.c b/net/core/filter.c index ae5254f712c9..2510464692af 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,6 +87,9 @@  #include "dev.h" +/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ +static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); +  static const struct bpf_func_proto *  bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -2215,7 +2218,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,  	rcu_read_lock();  	if (!nh) {  		dst = skb_dst(skb); -		nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), +		nexthop = rt6_nexthop(dst_rt6_info(dst),  				      &ipv6_hdr(skb)->daddr);  	} else {  		nexthop = &nh->ipv6_nh; @@ -2314,8 +2317,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,  	rcu_read_lock();  	if (!nh) { -		struct dst_entry *dst = skb_dst(skb); -		struct rtable *rt = container_of(dst, struct rtable, dst); +		struct rtable *rt = skb_rtable(skb);  		neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);  	} else if (nh->nh_family == AF_INET6) { @@ -4684,7 +4686,7 @@ set_compat:  	to->tunnel_tos = info->key.tos;  	to->tunnel_ttl = info->key.ttl;  	if (flags & BPF_F_TUNINFO_FLAGS) -		to->tunnel_flags = info->key.tun_flags; +		to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);  	else  		to->tunnel_ext = 0; @@ -4727,7 +4729,7 @@ BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)  	int err;  	if (unlikely(!info || -		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { +		     !ip_tunnel_is_options_present(info->key.tun_flags))) {  		err = -ENOENT;  		goto err_clear;  	} @@ -4797,15 +4799,15 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,  	memset(info, 0, sizeof(*info));  	info->mode = IP_TUNNEL_INFO_TX; -	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; -	if (flags & BPF_F_DONT_FRAGMENT) -		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; -	if (flags & BPF_F_ZERO_CSUM_TX) -		info->key.tun_flags &= ~TUNNEL_CSUM; -	if (flags & BPF_F_SEQ_NUMBER) -		info->key.tun_flags |= TUNNEL_SEQ; -	if (flags & BPF_F_NO_TUNNEL_KEY) -		info->key.tun_flags &= ~TUNNEL_KEY; +	__set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); +	__assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, +		     flags & BPF_F_DONT_FRAGMENT); +	__assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, +		     !(flags & BPF_F_ZERO_CSUM_TX)); +	__assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, +		     flags & 
BPF_F_SEQ_NUMBER); +	__assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, +		     !(flags & BPF_F_NO_TUNNEL_KEY));  	info->key.tun_id = cpu_to_be64(from->tunnel_id);  	info->key.tos = from->tunnel_tos; @@ -4843,13 +4845,15 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,  {  	struct ip_tunnel_info *info = skb_tunnel_info(skb);  	const struct metadata_dst *md = this_cpu_ptr(md_dst); +	IP_TUNNEL_DECLARE_FLAGS(present) = { };  	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))  		return -EINVAL;  	if (unlikely(size > IP_TUNNEL_OPTS_MAX))  		return -ENOMEM; -	ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); +	ip_tunnel_set_options_present(present); +	ip_tunnel_info_opts_set(info, from, size, present);  	return 0;  } @@ -5906,7 +5910,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,  		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);  	} else { -		fl4.flowi4_mark = 0; +		if (flags & BPF_FIB_LOOKUP_MARK) +			fl4.flowi4_mark = params->mark; +		else +			fl4.flowi4_mark = 0;  		fl4.flowi4_secid = 0;  		fl4.flowi4_tun_key.tun_id = 0;  		fl4.flowi4_uid = sock_net_uid(net, NULL); @@ -6049,7 +6056,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,  		err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,  						   strict);  	} else { -		fl6.flowi6_mark = 0; +		if (flags & BPF_FIB_LOOKUP_MARK) +			fl6.flowi6_mark = params->mark; +		else +			fl6.flowi6_mark = 0;  		fl6.flowi6_secid = 0;  		fl6.flowi6_tun_key.tun_id = 0;  		fl6.flowi6_uid = sock_net_uid(net, NULL); @@ -6127,7 +6137,7 @@ set_fwd_params:  #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \  			     BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ -			     BPF_FIB_LOOKUP_SRC) +			     BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK)  BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,  	   struct bpf_fib_lookup *, params, int, plen, u32, flags) @@ -8364,8 +8374,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_event_output_data_proto;  	case BPF_FUNC_get_current_uid_gid:  		return &bpf_get_current_uid_gid_proto; -	case BPF_FUNC_get_current_pid_tgid: -		return &bpf_get_current_pid_tgid_proto;  	case BPF_FUNC_sk_storage_get:  		return &bpf_sk_storage_get_proto;  	case BPF_FUNC_sk_storage_delete: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 272f09251343..f82e9a7d3b37 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -455,17 +455,25 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,  	if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) {  		struct flow_dissector_key_enc_opts *enc_opt; +		IP_TUNNEL_DECLARE_FLAGS(flags) = { }; +		u32 val;  		enc_opt = skb_flow_dissector_target(flow_dissector,  						    FLOW_DISSECTOR_KEY_ENC_OPTS,  						    target_container); -		if (info->options_len) { -			enc_opt->len = info->options_len; -			ip_tunnel_info_opts_get(enc_opt->data, info); -			enc_opt->dst_opt_type = info->key.tun_flags & -						TUNNEL_OPTIONS_PRESENT; -		} +		if (!info->options_len) +			return; + +		enc_opt->len = info->options_len; +		ip_tunnel_info_opts_get(enc_opt->data, info); + +		ip_tunnel_set_options_present(flags); +		ip_tunnel_flags_and(flags, info->key.tun_flags, flags); + +		val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, +				    IP_TUNNEL_GENEVE_OPT_BIT); +		enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? 
val : 0;  	}  }  EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); diff --git a/net/core/gro.c b/net/core/gro.c index c7901253a1a8..b3b43de1a650 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -3,6 +3,7 @@  #include <net/dst_metadata.h>  #include <net/busy_poll.h>  #include <trace/events/net.h> +#include <linux/skbuff_ref.h>  #define MAX_GRO_SKBS 8 @@ -230,6 +231,33 @@ done:  	return 0;  } +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) +{ +	if (unlikely(p->len + skb->len >= 65536)) +		return -E2BIG; + +	if (NAPI_GRO_CB(p)->last == p) +		skb_shinfo(p)->frag_list = skb; +	else +		NAPI_GRO_CB(p)->last->next = skb; + +	skb_pull(skb, skb_gro_offset(skb)); + +	NAPI_GRO_CB(p)->last = skb; +	NAPI_GRO_CB(p)->count++; +	p->data_len += skb->len; + +	/* sk ownership - if any - completely transferred to the aggregated packet */ +	skb->destructor = NULL; +	skb->sk = NULL; +	p->truesize += skb->truesize; +	p->len += skb->len; + +	NAPI_GRO_CB(skb)->same_flow = 1; + +	return 0; +} +  static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)  { @@ -330,8 +358,6 @@ static void gro_list_prepare(const struct list_head *head,  	list_for_each_entry(p, head, list) {  		unsigned long diffs; -		NAPI_GRO_CB(p)->flush = 0; -  		if (hash != skb_get_hash_raw(p)) {  			NAPI_GRO_CB(p)->same_flow = 0;  			continue; @@ -471,7 +497,6 @@ found_ptype:  					sizeof(u32))); /* Avoid slow unaligned acc */  	*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;  	NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); -	NAPI_GRO_CB(skb)->is_atomic = 1;  	NAPI_GRO_CB(skb)->count = 1;  	if (unlikely(skb_is_gso(skb))) {  		NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; diff --git a/net/core/hotdata.c b/net/core/hotdata.c index c8a7a451c18a..d0aaaaa556f2 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -1,9 +1,9 @@  // SPDX-License-Identifier: GPL-2.0-or-later -#include <net/hotdata.h>  #include <linux/cache.h>  #include <linux/jiffies.h>  #include <linux/list.h> - +#include <net/hotdata.h> +#include <net/proto_memory.h>  struct net_hotdata net_hotdata __cacheline_aligned = {  	.offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), @@ -18,5 +18,8 @@ struct net_hotdata net_hotdata __cacheline_aligned = {  	.max_backlog = 1000,  	.dev_tx_weight = 64,  	.dev_rx_weight = 64, +	.sysctl_max_skb_frags = MAX_SKB_FRAGS, +	.sysctl_skb_defer_max = 64, +	.sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE  };  EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c new file mode 100644 index 000000000000..759a9b9f3f89 --- /dev/null +++ b/net/core/ieee8021q_helpers.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Pengutronix, Oleksij Rempel <[email protected]> + +#include <linux/array_size.h> +#include <linux/printk.h> +#include <linux/types.h> +#include <net/dscp.h> +#include <net/ieee8021q.h> + +/* The following arrays map Traffic Types (TT) to traffic classes (TC) for + * different number of queues as shown in the example provided by + * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and + * Table I-1 "Traffic type to traffic class mapping". 
+ */ +static const u8 ieee8021q_8queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, +	[IEEE8021Q_TT_BE] = 1, +	[IEEE8021Q_TT_EE] = 2, +	[IEEE8021Q_TT_CA] = 3, +	[IEEE8021Q_TT_VI] = 4, +	[IEEE8021Q_TT_VO] = 5, +	[IEEE8021Q_TT_IC] = 6, +	[IEEE8021Q_TT_NC] = 7, +}; + +static const u8 ieee8021q_7queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, +	[IEEE8021Q_TT_BE] = 1, +	[IEEE8021Q_TT_EE] = 2, +	[IEEE8021Q_TT_CA] = 3, +	[IEEE8021Q_TT_VI] = 4,	[IEEE8021Q_TT_VO] = 4, +	[IEEE8021Q_TT_IC] = 5, +	[IEEE8021Q_TT_NC] = 6, +}; + +static const u8 ieee8021q_6queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, +	[IEEE8021Q_TT_BE] = 1, +	[IEEE8021Q_TT_EE] = 2,	[IEEE8021Q_TT_CA] = 2, +	[IEEE8021Q_TT_VI] = 3,	[IEEE8021Q_TT_VO] = 3, +	[IEEE8021Q_TT_IC] = 4, +	[IEEE8021Q_TT_NC] = 5, +}; + +static const u8 ieee8021q_5queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, +	[IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, +	[IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, +	[IEEE8021Q_TT_IC] = 3, +	[IEEE8021Q_TT_NC] = 4, +}; + +static const u8 ieee8021q_4queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, +	[IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, +	[IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, +	[IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3, +}; + +static const u8 ieee8021q_3queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, +	[IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, +	[IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, +	[IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2, +}; + +static const u8 ieee8021q_2queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, +	[IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, +	[IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, +	[IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1, +}; + +static const u8 ieee8021q_1queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, +	[IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, +	[IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0, +	[IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0, +}; + +/** + * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class + * @tt: IEEE 802.1Q Traffic Type + * @num_queues: Number of queues + * + * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based + * on the number of queues configured on the NIC. The mapping is based on the + * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic + * class mapping" and Table I-1 "Traffic type to traffic class mapping". + * + * Return: Traffic Class corresponding to the given Traffic Type or negative + * value in case of error. 
+ */ +int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues) +{ +	if (tt < 0 || tt >= IEEE8021Q_TT_MAX) { +		pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt, +		       IEEE8021Q_TT_MAX); +		return -EINVAL; +	} + +	switch (num_queues) { +	case 8: +		compiletime_assert(ARRAY_SIZE(ieee8021q_8queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_8queue_tt_tc_map != max - 1"); +		return ieee8021q_8queue_tt_tc_map[tt]; +	case 7: +		compiletime_assert(ARRAY_SIZE(ieee8021q_7queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_7queue_tt_tc_map != max - 1"); + +		return ieee8021q_7queue_tt_tc_map[tt]; +	case 6: +		compiletime_assert(ARRAY_SIZE(ieee8021q_6queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_6queue_tt_tc_map != max - 1"); + +		return ieee8021q_6queue_tt_tc_map[tt]; +	case 5: +		compiletime_assert(ARRAY_SIZE(ieee8021q_5queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_5queue_tt_tc_map != max - 1"); + +		return ieee8021q_5queue_tt_tc_map[tt]; +	case 4: +		compiletime_assert(ARRAY_SIZE(ieee8021q_4queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_4queue_tt_tc_map != max - 1"); + +		return ieee8021q_4queue_tt_tc_map[tt]; +	case 3: +		compiletime_assert(ARRAY_SIZE(ieee8021q_3queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_3queue_tt_tc_map != max - 1"); + +		return ieee8021q_3queue_tt_tc_map[tt]; +	case 2: +		compiletime_assert(ARRAY_SIZE(ieee8021q_2queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_2queue_tt_tc_map != max - 1"); + +		return ieee8021q_2queue_tt_tc_map[tt]; +	case 1: +		compiletime_assert(ARRAY_SIZE(ieee8021q_1queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_1queue_tt_tc_map != max - 1"); + +		return ieee8021q_1queue_tt_tc_map[tt]; +	} + +	pr_err("Invalid number of queues %d\n", num_queues); + +	return -EINVAL; +} +EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc); + +/** + * ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type + * @dscp: IETF DSCP value + * + * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT). + * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic + * Type, this function is inspired by the RFC8325 documentation which describe + * the mapping between DSCP and 802.11 User Priority (UP) values. + * + * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value + */ +int ietf_dscp_to_ieee8021q_tt(u8 dscp) +{ +	switch (dscp) { +	case DSCP_CS0: +	/* Comment from RFC8325: +	 * [RFC4594], Section 4.8, recommends High-Throughput Data be marked +	 * AF1x (that is, AF11, AF12, and AF13, according to the rules defined +	 * in [RFC2475]). +	 * +	 * By default (as described in Section 2.3), High-Throughput Data will +	 * map to UP 1 and, thus, to the Background Access Category (AC_BK), +	 * which is contrary to the intent expressed in [RFC4594]. + +	 * Unfortunately, there really is no corresponding fit for the High- +	 * Throughput Data service class within the constrained 4 Access +	 * Category [IEEE.802.11-2016] model.  
+	 * service class is assigned to the Best Effort Access Category (AC_BE),
+	 * then it would contend with Low-Latency Data (while [RFC4594]
+	 * recommends a distinction in servicing between these service classes)
+	 * as well as with the default service class; alternatively, if it is
+	 * assigned to the Background Access Category (AC_BK), then it would
+	 * receive a less-than-best-effort service and contend with Low-Priority
+	 * Data (as discussed in Section 4.2.10).
+	 *
+	 * As such, since there is no directly corresponding fit for the High-
+	 * Throughput Data service class within the [IEEE.802.11-2016] model, it
+	 * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby
+	 * admitting it to the Best Effort Access Category (AC_BE).
+	 *
+	 * Note: The above text is from RFC8325 which is describing the mapping
+	 * between DSCP and 802.11 User Priority (UP) values. The mapping
+	 * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but
+	 * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q
+	 * Traffic Types BE and BK.
+	 */
+	case DSCP_AF11:
+	case DSCP_AF12:
+	case DSCP_AF13:
+		return IEEE8021Q_TT_BE;
+	/* Comment from RFC8325:
+	 * RFC3662 and RFC4594 both recommend Low-Priority Data be marked
+	 * with DSCP CS1. The Low-Priority Data service class loosely
+	 * corresponds to the [IEEE.802.11-2016] Background Access Category
+	 */
+	case DSCP_CS1:
+		return IEEE8021Q_TT_BK;
+	case DSCP_CS2:
+	case DSCP_AF21:
+	case DSCP_AF22:
+	case DSCP_AF23:
+		return IEEE8021Q_TT_EE;
+	case DSCP_CS3:
+	case DSCP_AF31:
+	case DSCP_AF32:
+	case DSCP_AF33:
+		return IEEE8021Q_TT_CA;
+	case DSCP_CS4:
+	case DSCP_AF41:
+	case DSCP_AF42:
+	case DSCP_AF43:
+		return IEEE8021Q_TT_VI;
+	case DSCP_CS5:
+	case DSCP_EF:
+	case DSCP_VOICE_ADMIT:
+		return IEEE8021Q_TT_VO;
+	case DSCP_CS6:
+		return IEEE8021Q_TT_IC;
+	case DSCP_CS7:
+		return IEEE8021Q_TT_NC;
+	}
+
+	return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp);
+}
+EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 552719c3bbc3..45fd88405b6b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -734,7 +734,9 @@ out_neigh_release:
 struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
 				 struct net_device *dev, bool want_ref)
 {
-	return ___neigh_create(tbl, pkey, dev, 0, false, want_ref);
+	bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK);
+
+	return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref);
 }
 EXPORT_SYMBOL(__neigh_create);
 
@@ -1769,7 +1771,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms)
 
 static struct lock_class_key neigh_table_proxy_queue_class;
 
-static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
+static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
 
 void neigh_table_init(int index, struct neigh_table *tbl)
 {
@@ -1826,13 +1828,19 @@ void neigh_table_init(int index, struct neigh_table *tbl)
 	tbl->last_flush = now;
 	tbl->last_rand	= now + tbl->parms.reachable_time * 20;
 
-	neigh_tables[index] = tbl;
+	rcu_assign_pointer(neigh_tables[index], tbl);
 }
 EXPORT_SYMBOL(neigh_table_init);
 
+/*
+ * Only called from ndisc_cleanup(), which means this is dead code
+ * because we can no longer unload the IPv6 module.
+ */
 int neigh_table_clear(int index, struct neigh_table *tbl)
 {
-	neigh_tables[index] = NULL;
+	RCU_INIT_POINTER(neigh_tables[index], NULL);
+	synchronize_rcu();
+
 	/* It is not clean... 
Fix it to unload IPv6 module safely */  	cancel_delayed_work_sync(&tbl->managed_work);  	cancel_delayed_work_sync(&tbl->gc_work); @@ -1864,10 +1872,10 @@ static struct neigh_table *neigh_find_table(int family)  	switch (family) {  	case AF_INET: -		tbl = neigh_tables[NEIGH_ARP_TABLE]; +		tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]);  		break;  	case AF_INET6: -		tbl = neigh_tables[NEIGH_ND_TABLE]; +		tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]);  		break;  	} @@ -2331,7 +2339,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,  	ndtmsg = nlmsg_data(nlh);  	for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { -		tbl = neigh_tables[tidx]; +		tbl = rcu_dereference_rtnl(neigh_tables[tidx]);  		if (!tbl)  			continue;  		if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) @@ -2519,7 +2527,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)  	for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {  		struct neigh_parms *p; -		tbl = neigh_tables[tidx]; +		tbl = rcu_dereference_rtnl(neigh_tables[tidx]);  		if (!tbl)  			continue; @@ -2674,7 +2682,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx)  	if (!master_idx)  		return false; -	master = dev ? netdev_master_upper_dev_get(dev) : NULL; +	master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL;  	/* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another  	 * invalid value for ifindex to denote "no master". @@ -2707,7 +2715,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  {  	struct net *net = sock_net(skb->sk);  	struct neighbour *n; -	int rc, h, s_h = cb->args[1]; +	int err = 0, h, s_h = cb->args[1];  	int idx, s_idx = idx = cb->args[2];  	struct neigh_hash_table *nht;  	unsigned int flags = NLM_F_MULTI; @@ -2715,7 +2723,6 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  	if (filter->dev_idx || filter->master_idx)  		flags |= NLM_F_DUMP_FILTERED; -	rcu_read_lock();  	nht = rcu_dereference(tbl->nht);  	for (h = s_h; h < (1 << nht->hash_shift); h++) { @@ -2729,23 +2736,19 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  			if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||  			    neigh_master_filtered(n->dev, filter->master_idx))  				goto next; -			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, -					    cb->nlh->nlmsg_seq, -					    RTM_NEWNEIGH, -					    flags) < 0) { -				rc = -1; +			err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, +					      cb->nlh->nlmsg_seq, +					      RTM_NEWNEIGH, flags); +			if (err < 0)  				goto out; -			}  next:  			idx++;  		}  	} -	rc = skb->len;  out: -	rcu_read_unlock();  	cb->args[1] = h;  	cb->args[2] = idx; -	return rc; +	return err;  }  static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, @@ -2754,7 +2757,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  {  	struct pneigh_entry *n;  	struct net *net = sock_net(skb->sk); -	int rc, h, s_h = cb->args[3]; +	int err = 0, h, s_h = cb->args[3];  	int idx, s_idx = idx = cb->args[4];  	unsigned int flags = NLM_F_MULTI; @@ -2772,11 +2775,11 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  			if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||  			    neigh_master_filtered(n->dev, filter->master_idx))  				goto next; -			if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, -					    cb->nlh->nlmsg_seq, -					    RTM_NEWNEIGH, flags, tbl) 
< 0) { +			err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, +					       cb->nlh->nlmsg_seq, +					       RTM_NEWNEIGH, flags, tbl); +			if (err < 0) {  				read_unlock_bh(&tbl->lock); -				rc = -1;  				goto out;  			}  		next: @@ -2785,12 +2788,10 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  	}  	read_unlock_bh(&tbl->lock); -	rc = skb->len;  out:  	cb->args[3] = h;  	cb->args[4] = idx; -	return rc; - +	return err;  }  static int neigh_valid_dump_req(const struct nlmsghdr *nlh, @@ -2878,8 +2879,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)  	s_t = cb->args[0]; +	rcu_read_lock();  	for (t = 0; t < NEIGH_NR_TABLES; t++) { -		tbl = neigh_tables[t]; +		tbl = rcu_dereference(neigh_tables[t]);  		if (!tbl)  			continue; @@ -2895,9 +2897,10 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)  		if (err < 0)  			break;  	} +	rcu_read_unlock();  	cb->args[0] = t; -	return skb->len; +	return err;  }  static int neigh_valid_get_req(const struct nlmsghdr *nlh, @@ -3143,14 +3146,15 @@ int neigh_xmit(int index, struct net_device *dev,  	       const void *addr, struct sk_buff *skb)  {  	int err = -EAFNOSUPPORT; +  	if (likely(index < NEIGH_NR_TABLES)) {  		struct neigh_table *tbl;  		struct neighbour *neigh; -		tbl = neigh_tables[index]; -		if (!tbl) -			goto out;  		rcu_read_lock(); +		tbl = rcu_dereference(neigh_tables[index]); +		if (!tbl) +			goto out_unlock;  		if (index == NEIGH_ARP_TABLE) {  			u32 key = *((u32 *)addr); @@ -3166,6 +3170,7 @@ int neigh_xmit(int index, struct net_device *dev,  			goto out_kfree_skb;  		}  		err = READ_ONCE(neigh->output)(neigh, skb); +out_unlock:  		rcu_read_unlock();  	}  	else if (index == NEIGH_LINK_TABLE) { @@ -3728,7 +3733,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,  static struct neigh_sysctl_table {  	struct ctl_table_header *sysctl_header; -	struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; +	struct ctl_table neigh_vars[NEIGH_VAR_MAX];  } neigh_sysctl_template __read_mostly = {  	.neigh_vars = {  		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), @@ -3779,7 +3784,6 @@ static struct neigh_sysctl_table {  			.extra2		= SYSCTL_INT_MAX,  			.proc_handler	= proc_dointvec_minmax,  		}, -		{},  	},  }; @@ -3807,8 +3811,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,  	if (dev) {  		dev_name_source = dev->name;  		/* Terminate the table early */ -		memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, -		       sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));  		neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1;  	} else {  		struct neigh_table *tbl = p->tbl; @@ -3889,7 +3891,8 @@ static int __init neigh_init(void)  {  	rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0);  	rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); -	rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); +	rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, +		      RTNL_FLAG_DUMP_UNLOCKED);  	rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,  		      0); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index a97eceb84e61..fa6d3969734a 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -144,7 +144,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)  	seq_printf(seq,  		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x "  		   "%08x %08x\n", -		   sd->processed, sd->dropped, 
sd->time_squeeze, 0, +		   sd->processed, atomic_read(&sd->dropped), +		   sd->time_squeeze, 0,  		   0, 0, 0, 0, /* was fastroute */  		   0,	/* was cpu_collision */  		   sd->received_rps, flow_limit_count, diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e3d7a8cfa20b..4c27a360c294 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -605,13 +605,13 @@ static ssize_t threaded_show(struct device *dev,  	struct net_device *netdev = to_net_dev(dev);  	ssize_t ret = -EINVAL; -	if (!rtnl_trylock()) -		return restart_syscall(); +	rcu_read_lock();  	if (dev_isalive(netdev)) -		ret = sysfs_emit(buf, fmt_dec, netdev->threaded); +		ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded)); + +	rcu_read_unlock(); -	rtnl_unlock();  	return ret;  } @@ -1419,7 +1419,7 @@ static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf)  {  	struct dql *dql = &queue->dql; -	return sprintf(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); +	return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs));  }  static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, @@ -1451,7 +1451,7 @@ static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init =  static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf)  { -	return sprintf(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); +	return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max));  }  static ssize_t bql_set_stall_max(struct netdev_queue *queue, @@ -1468,7 +1468,7 @@ static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf)  {  	struct dql *dql = &queue->dql; -	return sprintf(buf, "%lu\n", dql->stall_cnt); +	return sysfs_emit(buf, "%lu\n", dql->stall_cnt);  }  static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = @@ -2046,7 +2046,7 @@ static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid)  	net_ns_get_ownership(net, uid, gid);  } -static struct class net_class __ro_after_init = { +static const struct class net_class = {  	.name = "net",  	.dev_release = netdev_release,  	.dev_groups = net_class_groups, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9d690d32da33..4f7a61688d18 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1093,7 +1093,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)  end:  	if (net_cb.fillargs.add_ref)  		put_net(net_cb.tgt_net); -	return err < 0 ? 
err : skb->len; +	return err;  }  static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, @@ -1208,7 +1208,8 @@ void __init net_ns_init(void)  	rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL,  		      RTNL_FLAG_DOIT_UNLOCKED);  	rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, -		      RTNL_FLAG_DOIT_UNLOCKED); +		      RTNL_FLAG_DOIT_UNLOCKED | +		      RTNL_FLAG_DUMP_UNLOCKED);  }  static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) diff --git a/net/core/gso_test.c b/net/core/net_test.c index 358c44680d91..9c3a590865d2 100644 --- a/net/core/gso_test.c +++ b/net/core/net_test.c @@ -1,6 +1,9 @@  // SPDX-License-Identifier: GPL-2.0-or-later  #include <kunit/test.h> + +/* GSO */ +  #include <linux/skbuff.h>  static const char hdr[] = "abcdefgh"; @@ -258,17 +261,127 @@ free_gso_skb:  	consume_skb(skb);  } -static struct kunit_case gso_test_cases[] = { -	KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), -	{} +/* IP tunnel flags */ + +#include <net/ip_tunnels.h> + +struct ip_tunnel_flags_test { +	const char	*name; + +	const u16	*src_bits; +	const u16	*exp_bits; +	u8		src_num; +	u8		exp_num; + +	__be16		exp_val; +	bool		exp_comp; +}; + +#define IP_TUNNEL_FLAGS_TEST(n, src, comp, eval, exp) {	\ +	.name		= (n),				\ +	.src_bits	= (src),			\ +	.src_num	= ARRAY_SIZE(src),		\ +	.exp_comp	= (comp),			\ +	.exp_val	= (eval),			\ +	.exp_bits	= (exp),			\ +	.exp_num	= ARRAY_SIZE(exp),		\ +} + +/* These are __be16-compatible and can be compared as is */ +static const u16 ip_tunnel_flags_1[] = { +	IP_TUNNEL_KEY_BIT, +	IP_TUNNEL_STRICT_BIT, +	IP_TUNNEL_ERSPAN_OPT_BIT, +}; + +/* Due to the previous flags design limitation, setting either + * ``IP_TUNNEL_CSUM_BIT`` (on Big Endian) or ``IP_TUNNEL_DONT_FRAGMENT_BIT`` + * (on Little) also sets VTI/ISATAP bit. In the bitmap implementation, they + * correspond to ``BIT(16)``, which is bigger than ``U16_MAX``, but still is + * backward-compatible. 
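+ *
+ * For example, converting a bitmap with only IP_TUNNEL_CONFLICT_BIT set to
+ * __be16 and back is expected to yield both IP_TUNNEL_CONFLICT_BIT and
+ * IP_TUNNEL_SIT_ISATAP_BIT; the "conflict" test case below exercises exactly
+ * this round trip.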
+ */ +#ifdef __LITTLE_ENDIAN +#define IP_TUNNEL_CONFLICT_BIT	IP_TUNNEL_DONT_FRAGMENT_BIT +#else +#define IP_TUNNEL_CONFLICT_BIT	IP_TUNNEL_CSUM_BIT +#endif + +static const u16 ip_tunnel_flags_2_src[] = { +	IP_TUNNEL_CONFLICT_BIT, +}; + +static const u16 ip_tunnel_flags_2_exp[] = { +	IP_TUNNEL_CONFLICT_BIT, +	IP_TUNNEL_SIT_ISATAP_BIT,  }; -static struct kunit_suite gso_test_suite = { -	.name = "net_core_gso", -	.test_cases = gso_test_cases, +/* Bits 17 and higher are not compatible with __be16 flags */ +static const u16 ip_tunnel_flags_3_src[] = { +	IP_TUNNEL_VXLAN_OPT_BIT, +	17, +	18, +	20,  }; -kunit_test_suite(gso_test_suite); +static const u16 ip_tunnel_flags_3_exp[] = { +	IP_TUNNEL_VXLAN_OPT_BIT, +}; + +static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = { +	IP_TUNNEL_FLAGS_TEST("compat", ip_tunnel_flags_1, true, +			     cpu_to_be16(BIT(IP_TUNNEL_KEY_BIT) | +					 BIT(IP_TUNNEL_STRICT_BIT) | +					 BIT(IP_TUNNEL_ERSPAN_OPT_BIT)), +			     ip_tunnel_flags_1), +	IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true, +			     VTI_ISVTI, ip_tunnel_flags_2_exp), +	IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false, +			     cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)), +			     ip_tunnel_flags_3_exp), +}; + +static void +ip_tunnel_flags_test_case_to_desc(const struct ip_tunnel_flags_test *t, +				  char *desc) +{ +	strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE); +} +KUNIT_ARRAY_PARAM(ip_tunnel_flags_test, ip_tunnel_flags_test, +		  ip_tunnel_flags_test_case_to_desc); + +static void ip_tunnel_flags_test_run(struct kunit *test) +{ +	const struct ip_tunnel_flags_test *t = test->param_value; +	IP_TUNNEL_DECLARE_FLAGS(src) = { }; +	IP_TUNNEL_DECLARE_FLAGS(exp) = { }; +	IP_TUNNEL_DECLARE_FLAGS(out); + +	for (u32 j = 0; j < t->src_num; j++) +		__set_bit(t->src_bits[j], src); +	for (u32 j = 0; j < t->exp_num; j++) +		__set_bit(t->exp_bits[j], exp); + +	KUNIT_ASSERT_EQ(test, t->exp_comp, +			ip_tunnel_flags_is_be16_compat(src)); +	KUNIT_ASSERT_EQ(test, (__force u16)t->exp_val, +			(__force u16)ip_tunnel_flags_to_be16(src)); + +	ip_tunnel_flags_from_be16(out, t->exp_val); +	KUNIT_ASSERT_TRUE(test, __ipt_flag_op(bitmap_equal, exp, out)); +} + +static struct kunit_case net_test_cases[] = { +	KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), +	KUNIT_CASE_PARAM(ip_tunnel_flags_test_run, +			 ip_tunnel_flags_test_gen_params), +	{ }, +}; + +static struct kunit_suite net_test_suite = { +	.name		= "net_core", +	.test_cases	= net_test_cases, +}; +kunit_test_suite(net_test_suite); +MODULE_DESCRIPTION("KUnit tests for networking core");  MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("KUnit tests for segmentation offload"); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 8d8ace9ef87f..8350a0afa9ec 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -70,6 +70,7 @@ static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFIN  /* NETDEV_CMD_QSTATS_GET - dump */  static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = { +	[NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),  	[NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1),  }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 7004b3399c2b..1f6ae6379e0f 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -489,7 +489,17 @@ netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx)  {  	if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) ||  	    
netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || -	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail)) +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits))  		return -EMSGSIZE;  	return 0;  } @@ -498,7 +508,18 @@ static int  netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx)  {  	if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || -	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes)) +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake))  		return -EMSGSIZE;  	return 0;  } @@ -639,6 +660,24 @@ nla_put_failure:  	return -EMSGSIZE;  } +static int +netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, +			      struct sk_buff *skb, const struct genl_info *info, +			      struct netdev_nl_dump_ctx *ctx) +{ +	if (!netdev->stat_ops) +		return 0; + +	switch (scope) { +	case 0: +		return netdev_nl_stats_by_netdev(netdev, skb, info); +	case NETDEV_QSTATS_SCOPE_QUEUE: +		return netdev_nl_stats_by_queue(netdev, skb, info, ctx); +	} + +	return -EINVAL;	/* Should not happen, per netlink policy */ +} +  int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,  				struct netlink_callback *cb)  { @@ -646,6 +685,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,  	const struct genl_info *info = genl_info_dump(cb);  	struct net *net = sock_net(skb->sk);  	struct net_device *netdev; +	unsigned int ifindex;  	unsigned int scope;  	int err = 0; @@ -653,21 +693,28 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,  	if (info->attrs[NETDEV_A_QSTATS_SCOPE])  		scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); -	rtnl_lock(); -	for_each_netdev_dump(net, netdev, ctx->ifindex) { -		if (!netdev->stat_ops) -			continue; +	ifindex = 0; +	if 
(info->attrs[NETDEV_A_QSTATS_IFINDEX]) +		ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); -		switch (scope) { -		case 0: -			err = netdev_nl_stats_by_netdev(netdev, skb, info); -			break; -		case NETDEV_QSTATS_SCOPE_QUEUE: -			err = netdev_nl_stats_by_queue(netdev, skb, info, ctx); -			break; +	rtnl_lock(); +	if (ifindex) { +		netdev = __dev_get_by_index(net, ifindex); +		if (netdev && netdev->stat_ops) { +			err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, +							    info, ctx); +		} else { +			NL_SET_BAD_ATTR(info->extack, +					info->attrs[NETDEV_A_QSTATS_IFINDEX]); +			err = netdev ? -EOPNOTSUPP : -ENODEV; +		} +	} else { +		for_each_netdev_dump(net, netdev, ctx->ifindex) { +			err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, +							    info, ctx); +			if (err < 0) +				break;  		} -		if (err < 0) -			break;  	}  	rtnl_unlock(); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 543007f159f9..55bcacf67df3 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -316,7 +316,7 @@ static int netpoll_owner_active(struct net_device *dev)  	struct napi_struct *napi;  	list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { -		if (napi->poll_owner == smp_processor_id()) +		if (READ_ONCE(napi->poll_owner) == smp_processor_id())  			return 1;  	}  	return 0; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index dd364d738c00..f4444b4e39e6 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -5,6 +5,7 @@   *	Copyright (C) 2016 Red Hat, Inc.   */ +#include <linux/error-injection.h>  #include <linux/types.h>  #include <linux/kernel.h>  #include <linux/slab.h> @@ -123,9 +124,9 @@ int page_pool_ethtool_stats_get_count(void)  }  EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); -u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)  { -	struct page_pool_stats *pool_stats = stats; +	const struct page_pool_stats *pool_stats = stats;  	*data++ = pool_stats->alloc_stats.fast;  	*data++ = pool_stats->alloc_stats.slow; @@ -172,19 +173,29 @@ static void page_pool_producer_unlock(struct page_pool *pool,  		spin_unlock_bh(&pool->ring.producer_lock);  } +static void page_pool_struct_check(void) +{ +	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); +	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); +	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); +	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, 4 * sizeof(long)); +} +  static int page_pool_init(struct page_pool *pool,  			  const struct page_pool_params *params,  			  int cpuid)  {  	unsigned int ring_qsize = 1024; /* Default */ +	page_pool_struct_check(); +  	memcpy(&pool->p, ¶ms->fast, sizeof(pool->p));  	memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow));  	pool->cpuid = cpuid;  	/* Validate only known flags were used */ -	if (pool->p.flags & ~(PP_FLAG_ALL)) +	if (pool->slow.flags & ~PP_FLAG_ALL)  		return -EINVAL;  	if (pool->p.pool_size) @@ -198,22 +209,26 @@ static int page_pool_init(struct page_pool *pool,  	 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,  	 * which is the XDP_TX use-case.  	 
*/ -	if (pool->p.flags & PP_FLAG_DMA_MAP) { +	if (pool->slow.flags & PP_FLAG_DMA_MAP) {  		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&  		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))  			return -EINVAL; + +		pool->dma_map = true;  	} -	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { +	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {  		/* In order to request DMA-sync-for-device the page  		 * needs to be mapped  		 */ -		if (!(pool->p.flags & PP_FLAG_DMA_MAP)) +		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))  			return -EINVAL;  		if (!pool->p.max_len)  			return -EINVAL; +		pool->dma_sync = true; +  		/* pool->p.offset has to be set according to the address  		 * offset used by the DMA engine to start copying rx data  		 */ @@ -222,7 +237,7 @@ static int page_pool_init(struct page_pool *pool,  	pool->has_init_callback = !!pool->slow.init_callback;  #ifdef CONFIG_PAGE_POOL_STATS -	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) { +	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {  		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);  		if (!pool->recycle_stats)  			return -ENOMEM; @@ -232,12 +247,13 @@ static int page_pool_init(struct page_pool *pool,  		 * (also percpu) page pool instance.  		 */  		pool->recycle_stats = &pp_system_recycle_stats; +		pool->system = true;  	}  #endif  	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {  #ifdef CONFIG_PAGE_POOL_STATS -		if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) +		if (!pool->system)  			free_percpu(pool->recycle_stats);  #endif  		return -ENOMEM; @@ -248,7 +264,7 @@ static int page_pool_init(struct page_pool *pool,  	/* Driver calling page_pool_create() also call page_pool_destroy() */  	refcount_set(&pool->user_cnt, 1); -	if (pool->p.flags & PP_FLAG_DMA_MAP) +	if (pool->dma_map)  		get_device(pool->p.dev);  	return 0; @@ -258,11 +274,11 @@ static void page_pool_uninit(struct page_pool *pool)  {  	ptr_ring_cleanup(&pool->ring, NULL); -	if (pool->p.flags & PP_FLAG_DMA_MAP) +	if (pool->dma_map)  		put_device(pool->p.dev);  #ifdef CONFIG_PAGE_POOL_STATS -	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) +	if (!pool->system)  		free_percpu(pool->recycle_stats);  #endif  } @@ -383,16 +399,26 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)  	return page;  } -static void page_pool_dma_sync_for_device(struct page_pool *pool, -					  struct page *page, -					  unsigned int dma_sync_size) +static void __page_pool_dma_sync_for_device(const struct page_pool *pool, +					    const struct page *page, +					    u32 dma_sync_size)  { +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)  	dma_addr_t dma_addr = page_pool_get_dma_addr(page);  	dma_sync_size = min(dma_sync_size, pool->p.max_len); -	dma_sync_single_range_for_device(pool->p.dev, dma_addr, -					 pool->p.offset, dma_sync_size, -					 pool->p.dma_dir); +	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, +				     dma_sync_size, pool->p.dma_dir); +#endif +} + +static __always_inline void +page_pool_dma_sync_for_device(const struct page_pool *pool, +			      const struct page *page, +			      u32 dma_sync_size) +{ +	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) +		__page_pool_dma_sync_for_device(pool, page, dma_sync_size);  }  static bool page_pool_dma_map(struct page_pool *pool, struct page *page) @@ -414,8 +440,7 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)  	if (page_pool_set_dma_addr(page, dma))  		goto unmap_failed; -	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) -		page_pool_dma_sync_for_device(pool, 
page, pool->p.max_len); +	page_pool_dma_sync_for_device(pool, page, pool->p.max_len);  	return true; @@ -460,8 +485,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,  	if (unlikely(!page))  		return NULL; -	if ((pool->p.flags & PP_FLAG_DMA_MAP) && -	    unlikely(!page_pool_dma_map(pool, page))) { +	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page))) {  		put_page(page);  		return NULL;  	} @@ -481,8 +505,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,  						 gfp_t gfp)  {  	const int bulk = PP_ALLOC_CACHE_REFILL; -	unsigned int pp_flags = pool->p.flags;  	unsigned int pp_order = pool->p.order; +	bool dma_map = pool->dma_map;  	struct page *page;  	int i, nr_pages; @@ -507,8 +531,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,  	 */  	for (i = 0; i < nr_pages; i++) {  		page = pool->alloc.cache[i]; -		if ((pp_flags & PP_FLAG_DMA_MAP) && -		    unlikely(!page_pool_dma_map(pool, page))) { +		if (dma_map && unlikely(!page_pool_dma_map(pool, page))) {  			put_page(page);  			continue;  		} @@ -550,6 +573,7 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)  	return page;  }  EXPORT_SYMBOL(page_pool_alloc_pages); +ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);  /* Calculate distance between two u32 values, valid if distance is below 2^(31)   *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution @@ -580,7 +604,7 @@ void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)  {  	dma_addr_t dma; -	if (!(pool->p.flags & PP_FLAG_DMA_MAP)) +	if (!pool->dma_map)  		/* Always account for inflight pages, even if we didn't  		 * map them  		 */ @@ -663,7 +687,7 @@ static bool __page_pool_page_can_be_recycled(const struct page *page)  }  /* If the page refcnt == 1, this will try to recycle the page. - * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for + * If pool->dma_sync is set, we'll try to sync the DMA area for   * the configured size min(dma_sync_size, pool->max_len).   * If the page refcnt != 1, then the page will be returned to memory   * subsystem. @@ -686,12 +710,9 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,  	if (likely(__page_pool_page_can_be_recycled(page))) {  		/* Read barrier done in page_ref_count / READ_ONCE */ -		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) -			page_pool_dma_sync_for_device(pool, page, -						      dma_sync_size); +		page_pool_dma_sync_for_device(pool, page, dma_sync_size); -		if (allow_direct && in_softirq() && -		    page_pool_recycle_in_cache(page, pool)) +		if (allow_direct && page_pool_recycle_in_cache(page, pool))  			return NULL;  		/* Page found as candidate for recycling */ @@ -716,9 +737,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,  	return NULL;  } +static bool page_pool_napi_local(const struct page_pool *pool) +{ +	const struct napi_struct *napi; +	u32 cpuid; + +	if (unlikely(!in_softirq())) +		return false; + +	/* Allow direct recycle if we have reasons to believe that we are +	 * in the same context as the consumer would run, so there's +	 * no possible race. +	 * __page_pool_put_page() makes sure we're not in hardirq context +	 * and interrupts are enabled prior to accessing the cache. 
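+	 * Concretely, direct recycling is treated as safe here in two cases:
+	 * a pool bound to this CPU (pool->cpuid == smp_processor_id(), e.g.
+	 * the system per-CPU pools), or a pool whose NAPI instance
+	 * (pool->p.napi) is currently owned by this CPU (napi->list_owner).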
+	 */ +	cpuid = smp_processor_id(); +	if (READ_ONCE(pool->cpuid) == cpuid) +		return true; + +	napi = READ_ONCE(pool->p.napi); + +	return napi && READ_ONCE(napi->list_owner) == cpuid; +} +  void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,  				unsigned int dma_sync_size, bool allow_direct)  { +	if (!allow_direct) +		allow_direct = page_pool_napi_local(pool); +  	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);  	if (page && !page_pool_recycle_in_ring(pool, page)) {  		/* Cache full, fallback to free pages */ @@ -747,8 +794,11 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,  			     int count)  {  	int i, bulk_len = 0; +	bool allow_direct;  	bool in_softirq; +	allow_direct = page_pool_napi_local(pool); +  	for (i = 0; i < count; i++) {  		struct page *page = virt_to_head_page(data[i]); @@ -756,13 +806,13 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,  		if (!page_pool_is_last_ref(page))  			continue; -		page = __page_pool_put_page(pool, page, -1, false); +		page = __page_pool_put_page(pool, page, -1, allow_direct);  		/* Approved for bulk recycling in ptr_ring cache */  		if (page)  			data[bulk_len++] = page;  	} -	if (unlikely(!bulk_len)) +	if (!bulk_len)  		return;  	/* Bulk producer into ptr_ring page_pool cache */ @@ -799,9 +849,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,  		return NULL;  	if (__page_pool_page_can_be_recycled(page)) { -		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) -			page_pool_dma_sync_for_device(pool, page, -1); - +		page_pool_dma_sync_for_device(pool, page, -1);  		return page;  	} @@ -959,7 +1007,7 @@ static void page_pool_release_retry(struct work_struct *wq)  }  void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), -			   struct xdp_mem_info *mem) +			   const struct xdp_mem_info *mem)  {  	refcount_inc(&pool->user_cnt);  	pool->disconnect = disconnect; @@ -969,7 +1017,7 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),  static void page_pool_disable_direct_recycling(struct page_pool *pool)  {  	/* Disable direct recycling based on pool->cpuid. -	 * Paired with READ_ONCE() in napi_pp_put_page(). +	 * Paired with READ_ONCE() in page_pool_napi_local().  	 */  	WRITE_ONCE(pool->cpuid, -1); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8ba6a4e4be26..b86b0a87367d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1036,8 +1036,8 @@ static size_t rtnl_proto_down_size(const struct net_device *dev)  {  	size_t size = nla_total_size(1); -	if (dev->proto_down_reason) -		size += nla_total_size(0) + nla_total_size(4); +	/* Assume dev->proto_down_reason is not zero. 
*/ +	size += nla_total_size(0) + nla_total_size(4);  	return size;  } @@ -1477,13 +1477,15 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb,  static u32 rtnl_xdp_prog_skb(struct net_device *dev)  {  	const struct bpf_prog *generic_xdp_prog; +	u32 res = 0; -	ASSERT_RTNL(); +	rcu_read_lock(); +	generic_xdp_prog = rcu_dereference(dev->xdp_prog); +	if (generic_xdp_prog) +		res = generic_xdp_prog->aux->id; +	rcu_read_unlock(); -	generic_xdp_prog = rtnl_dereference(dev->xdp_prog); -	if (!generic_xdp_prog) -		return 0; -	return generic_xdp_prog->aux->id; +	return res;  }  static u32 rtnl_xdp_prog_drv(struct net_device *dev) @@ -1603,7 +1605,8 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev)  	upper_dev = netdev_master_upper_dev_get_rcu(dev);  	if (upper_dev) -		ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); +		ret = nla_put_u32(skb, IFLA_MASTER, +				  READ_ONCE(upper_dev->ifindex));  	rcu_read_unlock();  	return ret; @@ -1736,10 +1739,10 @@ static int rtnl_fill_proto_down(struct sk_buff *skb,  	struct nlattr *pr;  	u32 preason; -	if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) +	if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down)))  		goto nla_put_failure; -	preason = dev->proto_down_reason; +	preason = READ_ONCE(dev->proto_down_reason);  	if (!preason)  		return 0; @@ -1812,6 +1815,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,  			    u32 event, int *new_nsid, int new_ifindex,  			    int tgt_netnsid, gfp_t gfp)  { +	char devname[IFNAMSIZ];  	struct ifinfomsg *ifm;  	struct nlmsghdr *nlh;  	struct Qdisc *qdisc; @@ -1824,41 +1828,51 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,  	ifm = nlmsg_data(nlh);  	ifm->ifi_family = AF_UNSPEC;  	ifm->__ifi_pad = 0; -	ifm->ifi_type = dev->type; -	ifm->ifi_index = dev->ifindex; +	ifm->ifi_type = READ_ONCE(dev->type); +	ifm->ifi_index = READ_ONCE(dev->ifindex);  	ifm->ifi_flags = dev_get_flags(dev);  	ifm->ifi_change = change;  	if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))  		goto nla_put_failure; -	qdisc = rtnl_dereference(dev->qdisc); -	if (nla_put_string(skb, IFLA_IFNAME, dev->name) || -	    nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || +	netdev_copy_name(dev, devname); +	if (nla_put_string(skb, IFLA_IFNAME, devname)) +		goto nla_put_failure; + +	if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) ||  	    nla_put_u8(skb, IFLA_OPERSTATE, -		       netif_running(dev) ? dev->operstate : IF_OPER_DOWN) || -	    nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || -	    nla_put_u32(skb, IFLA_MTU, dev->mtu) || -	    nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) || -	    nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || -	    nla_put_u32(skb, IFLA_GROUP, dev->group) || -	    nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || -	    nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || -	    nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || -	    nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || -	    nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || -	    nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || -	    nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) || -	    nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) || -	    nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || -	    nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || +		       netif_running(dev) ? 
READ_ONCE(dev->operstate) : +					    IF_OPER_DOWN) || +	    nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || +	    nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || +	    nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || +	    nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || +	    nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) || +	    nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) || +	    nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) || +	    nla_put_u32(skb, IFLA_NUM_TX_QUEUES, +			READ_ONCE(dev->num_tx_queues)) || +	    nla_put_u32(skb, IFLA_GSO_MAX_SEGS, +			READ_ONCE(dev->gso_max_segs)) || +	    nla_put_u32(skb, IFLA_GSO_MAX_SIZE, +			READ_ONCE(dev->gso_max_size)) || +	    nla_put_u32(skb, IFLA_GRO_MAX_SIZE, +			READ_ONCE(dev->gro_max_size)) || +	    nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, +			READ_ONCE(dev->gso_ipv4_max_size)) || +	    nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, +			READ_ONCE(dev->gro_ipv4_max_size)) || +	    nla_put_u32(skb, IFLA_TSO_MAX_SIZE, +			READ_ONCE(dev->tso_max_size)) || +	    nla_put_u32(skb, IFLA_TSO_MAX_SEGS, +			READ_ONCE(dev->tso_max_segs)) ||  #ifdef CONFIG_RPS -	    nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || +	    nla_put_u32(skb, IFLA_NUM_RX_QUEUES, +			READ_ONCE(dev->num_rx_queues)) ||  #endif  	    put_master_ifindex(skb, dev) ||  	    nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || -	    (qdisc && -	     nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) ||  	    nla_put_ifalias(skb, dev) ||  	    nla_put_u32(skb, IFLA_CARRIER_CHANGES,  			atomic_read(&dev->carrier_up_count) + @@ -1909,9 +1923,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,  			goto nla_put_failure;  	} -	if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) -		goto nla_put_failure; -  	if (new_nsid &&  	    nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)  		goto nla_put_failure; @@ -1924,6 +1935,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,  		goto nla_put_failure;  	rcu_read_lock(); +	if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC)) +		goto nla_put_failure_rcu; +	qdisc = rcu_dereference(dev->qdisc); +	if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) +		goto nla_put_failure_rcu;  	if (rtnl_fill_link_af(skb, dev, ext_filter_mask))  		goto nla_put_failure_rcu;  	if (rtnl_fill_link_ifmap(skb, dev)) @@ -5245,15 +5261,14 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,  	br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);  	if (br_spec) { -		nla_for_each_nested(attr, br_spec, rem) { -			if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { -				if (nla_len(attr) < sizeof(flags)) -					return -EINVAL; +		nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec, +					 rem) { +			if (nla_len(attr) < sizeof(flags)) +				return -EINVAL; -				have_flags = true; -				flags = nla_get_u16(attr); -				break; -			} +			have_flags = true; +			flags = nla_get_u16(attr); +			break;  		}  	} @@ -5962,19 +5977,17 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,  static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)  {  	struct netlink_ext_ack *extack = cb->extack; -	int h, s_h, err, s_idx, s_idxattr, s_prividx;  	struct rtnl_stats_dump_filters filters;  	struct net *net = sock_net(skb->sk);  	unsigned int flags = NLM_F_MULTI;  	struct if_stats_msg *ifsm; -	struct hlist_head *head; +	struct { +		unsigned long ifindex; +		int idxattr; +		int prividx; +	} *ctx = (void *)cb->ctx;  	struct 
net_device *dev; -	int idx = 0; - -	s_h = cb->args[0]; -	s_idx = cb->args[1]; -	s_idxattr = cb->args[2]; -	s_prividx = cb->args[3]; +	int err;  	cb->seq = net->dev_base_seq; @@ -5993,39 +6006,26 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)  	if (err)  		return err; -	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { -		idx = 0; -		head = &net->dev_index_head[h]; -		hlist_for_each_entry(dev, head, index_hlist) { -			if (idx < s_idx) -				goto cont; -			err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, -						  NETLINK_CB(cb->skb).portid, -						  cb->nlh->nlmsg_seq, 0, -						  flags, &filters, -						  &s_idxattr, &s_prividx, -						  extack); -			/* If we ran out of room on the first message, -			 * we're in trouble -			 */ -			WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); +	for_each_netdev_dump(net, dev, ctx->ifindex) { +		err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, +					  NETLINK_CB(cb->skb).portid, +					  cb->nlh->nlmsg_seq, 0, +					  flags, &filters, +					  &ctx->idxattr, &ctx->prividx, +					  extack); +		/* If we ran out of room on the first message, +		 * we're in trouble. +		 */ +		WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); -			if (err < 0) -				goto out; -			s_prividx = 0; -			s_idxattr = 0; -			nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: -			idx++; -		} +		if (err < 0) +			break; +		ctx->prividx = 0; +		ctx->idxattr = 0; +		nl_dump_check_consistent(cb, nlmsg_hdr(skb));  	} -out: -	cb->args[3] = s_prividx; -	cb->args[2] = s_idxattr; -	cb->args[1] = idx; -	cb->args[0] = h; -	return skb->len; +	return err;  }  void rtnl_offload_xstats_notify(struct net_device *dev) diff --git a/net/core/scm.c b/net/core/scm.c index 9cd4b0a01cd6..4f6a14babe5a 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -89,6 +89,12 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)  		fpl->count_unix = 0;  		fpl->max = SCM_MAX_FD;  		fpl->user = NULL; +#if IS_ENABLED(CONFIG_UNIX) +		fpl->inflight = false; +		fpl->dead = false; +		fpl->edges = NULL; +		INIT_LIST_HEAD(&fpl->vertices); +#endif  	}  	fpp = &fpl->fp[fpl->count]; @@ -376,8 +382,14 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)  	if (new_fpl) {  		for (i = 0; i < fpl->count; i++)  			get_file(fpl->fp[i]); +  		new_fpl->max = new_fpl->count;  		new_fpl->user = get_uid(fpl->user); +#if IS_ENABLED(CONFIG_UNIX) +		new_fpl->inflight = false; +		new_fpl->edges = NULL; +		INIT_LIST_HEAD(&new_fpl->vertices); +#endif  	}  	return new_fpl;  } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4096e679f61c..466999a7515e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -51,6 +51,7 @@  #endif  #include <linux/string.h>  #include <linux/skbuff.h> +#include <linux/skbuff_ref.h>  #include <linux/splice.h>  #include <linux/cache.h>  #include <linux/rtnetlink.h> @@ -108,9 +109,6 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init;  #define SKB_SMALL_HEAD_HEADROOM						\  	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) -int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; -EXPORT_SYMBOL(sysctl_max_skb_frags); -  /* kcm_write_msgs() relies on casting paged frags to bio_vec to use   * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the   * netmem is a page. 
@@ -775,10 +773,9 @@ skb_fail:  EXPORT_SYMBOL(__netdev_alloc_skb);  /** - *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + *	napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance   *	@napi: napi instance this buffer was allocated for   *	@len: length to allocate - *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages   *   *	Allocate a new sk_buff for use in NAPI receive.  This buffer will   *	attempt to allocate the head from a special reserved region used @@ -787,9 +784,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb);   *   *	%NULL is returned if there is no free memory.   */ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, -				 gfp_t gfp_mask) +struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)  { +	gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;  	struct napi_alloc_cache *nc;  	struct sk_buff *skb;  	bool pfmemalloc; @@ -860,7 +857,7 @@ skb_success:  skb_fail:  	return skb;  } -EXPORT_SYMBOL(__napi_alloc_skb); +EXPORT_SYMBOL(napi_alloc_skb);  void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,  			    int off, int size, unsigned int truesize) @@ -1005,11 +1002,8 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,  EXPORT_SYMBOL(skb_cow_data_for_xdp);  #if IS_ENABLED(CONFIG_PAGE_POOL) -bool napi_pp_put_page(struct page *page, bool napi_safe) +bool napi_pp_put_page(struct page *page)  { -	bool allow_direct = false; -	struct page_pool *pp; -  	page = compound_head(page);  	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation @@ -1022,39 +1016,18 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)  	if (unlikely(!is_pp_page(page)))  		return false; -	pp = page->pp; - -	/* Allow direct recycle if we have reasons to believe that we are -	 * in the same context as the consumer would run, so there's -	 * no possible race. -	 * __page_pool_put_page() makes sure we're not in hardirq context -	 * and interrupts are enabled prior to accessing the cache. -	 */ -	if (napi_safe || in_softirq()) { -		const struct napi_struct *napi = READ_ONCE(pp->p.napi); -		unsigned int cpuid = smp_processor_id(); - -		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid; -		allow_direct |= READ_ONCE(pp->cpuid) == cpuid; -	} - -	/* Driver set this to memory recycling info. Reset it on recycle. -	 * This will *not* work for NIC using a split-page memory model. -	 * The page will be returned to the pool here regardless of the -	 * 'flipped' fragment being in use or not. 
-	 */ -	page_pool_put_full_page(pp, page, allow_direct); +	page_pool_put_full_page(page->pp, page, false);  	return true;  }  EXPORT_SYMBOL(napi_pp_put_page);  #endif -static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) +static bool skb_pp_recycle(struct sk_buff *skb, void *data)  {  	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)  		return false; -	return napi_pp_put_page(virt_to_page(data), napi_safe); +	return napi_pp_put_page(virt_to_page(data));  }  /** @@ -1096,12 +1069,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)  		kfree(head);  } -static void skb_free_head(struct sk_buff *skb, bool napi_safe) +static void skb_free_head(struct sk_buff *skb)  {  	unsigned char *head = skb->head;  	if (skb->head_frag) { -		if (skb_pp_recycle(skb, head, napi_safe)) +		if (skb_pp_recycle(skb, head))  			return;  		skb_free_frag(head);  	} else { @@ -1109,8 +1082,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe)  	}  } -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, -			     bool napi_safe) +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)  {  	struct skb_shared_info *shinfo = skb_shinfo(skb);  	int i; @@ -1127,13 +1099,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,  	}  	for (i = 0; i < shinfo->nr_frags; i++) -		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe); +		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);  free_head:  	if (shinfo->frag_list)  		kfree_skb_list_reason(shinfo->frag_list, reason); -	skb_free_head(skb, napi_safe); +	skb_free_head(skb);  exit:  	/* When we clone an SKB we copy the reycling bit. The pp_recycle  	 * bit is only set on the head though, so in order to avoid races @@ -1194,12 +1166,11 @@ void skb_release_head_state(struct sk_buff *skb)  }  /* Free everything but the sk_buff shell. 
*/ -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, -			    bool napi_safe) +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)  {  	skb_release_head_state(skb);  	if (likely(skb->head)) -		skb_release_data(skb, reason, napi_safe); +		skb_release_data(skb, reason);  }  /** @@ -1213,7 +1184,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,  void __kfree_skb(struct sk_buff *skb)  { -	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false); +	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);  	kfree_skbmem(skb);  }  EXPORT_SYMBOL(__kfree_skb); @@ -1270,7 +1241,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,  		return;  	} -	skb_release_all(skb, reason, false); +	skb_release_all(skb, reason);  	sa->skb_array[sa->skb_count++] = skb;  	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { @@ -1331,22 +1302,28 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)  	has_trans = skb_transport_header_was_set(skb);  	printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" -	       "mac=(%d,%d) net=(%d,%d) trans=%d\n" +	       "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"  	       "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" -	       "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" -	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", +	       "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n" +	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" +	       "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n" +	       "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",  	       level, skb->len, headroom, skb_headlen(skb), tailroom,  	       has_mac ? skb->mac_header : -1,  	       has_mac ? skb_mac_header_len(skb) : -1, +	       skb->mac_len,  	       skb->network_header,  	       has_trans ? skb_network_header_len(skb) : -1,  	       has_trans ? 
skb->transport_header : -1,  	       sh->tx_flags, sh->nr_frags,  	       sh->gso_size, sh->gso_type, sh->gso_segs, -	       skb->csum, skb->ip_summed, skb->csum_complete_sw, -	       skb->csum_valid, skb->csum_level, +	       skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed, +	       skb->csum_complete_sw, skb->csum_valid, skb->csum_level,  	       skb->hash, skb->sw_hash, skb->l4_hash, -	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); +	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif, +	       skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all, +	       skb->encapsulation, skb->inner_protocol, skb->inner_mac_header, +	       skb->inner_network_header, skb->inner_transport_header);  	if (dev)  		printk("%sdev name=%s feat=%pNF\n", @@ -1444,7 +1421,7 @@ EXPORT_SYMBOL(consume_skb);  void __consume_stateless_skb(struct sk_buff *skb)  {  	trace_consume_skb(skb, __builtin_return_address(0)); -	skb_release_data(skb, SKB_CONSUMED, false); +	skb_release_data(skb, SKB_CONSUMED);  	kfree_skbmem(skb);  } @@ -1471,7 +1448,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)  void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)  { -	skb_release_all(skb, reason, true); +	skb_release_all(skb, reason);  	napi_skb_cache_put(skb);  } @@ -1509,7 +1486,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)  		return;  	} -	skb_release_all(skb, SKB_CONSUMED, !!budget); +	skb_release_all(skb, SKB_CONSUMED);  	napi_skb_cache_put(skb);  }  EXPORT_SYMBOL(napi_consume_skb); @@ -1640,7 +1617,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);   */  struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)  { -	skb_release_all(dst, SKB_CONSUMED, false); +	skb_release_all(dst, SKB_CONSUMED);  	return __skb_clone(dst, src);  }  EXPORT_SYMBOL_GPL(skb_morph); @@ -1708,7 +1685,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)  		return NULL;  	} -	uarg->ubuf.callback = msg_zerocopy_callback; +	uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;  	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;  	uarg->len = 1;  	uarg->bytelen = size; @@ -1734,7 +1711,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,  		u32 bytelen, next;  		/* there might be non MSG_ZEROCOPY users */ -		if (uarg->callback != msg_zerocopy_callback) +		if (uarg->ops != &msg_zerocopy_ubuf_ops)  			return NULL;  		/* realloc only when socket is locked (TCP, UDP cork), @@ -1845,8 +1822,8 @@ release:  	sock_put(sk);  } -void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, -			   bool success) +static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, +				  bool success)  {  	struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); @@ -1855,7 +1832,6 @@ void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,  	if (refcount_dec_and_test(&uarg->refcnt))  		__msg_zerocopy_callback(uarg_zc);  } -EXPORT_SYMBOL_GPL(msg_zerocopy_callback);  void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)  { @@ -1865,10 +1841,15 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)  	uarg_to_msgzc(uarg)->len--;  	if (have_uref) -		msg_zerocopy_callback(NULL, uarg, true); +		msg_zerocopy_complete(NULL, uarg, true);  }  EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); +const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { +	.complete = msg_zerocopy_complete, +}; +EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); +  int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,  			     struct 
msghdr *msg, int len,  			     struct ubuf_info *uarg) @@ -1876,11 +1857,18 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,  	struct ubuf_info *orig_uarg = skb_zcopy(skb);  	int err, orig_len = skb->len; -	/* An skb can only point to one uarg. This edge case happens when -	 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. -	 */ -	if (orig_uarg && uarg != orig_uarg) -		return -EEXIST; +	if (uarg->ops->link_skb) { +		err = uarg->ops->link_skb(skb, uarg); +		if (err) +			return err; +	} else { +		/* An skb can only point to one uarg. This edge case happens +		 * when TCP appends to an skb, but zerocopy_realloc triggered +		 * a new alloc. +		 */ +		if (orig_uarg && uarg != orig_uarg) +			return -EEXIST; +	}  	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);  	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { @@ -1894,7 +1882,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,  		return err;  	} -	skb_zcopy_set(skb, uarg, NULL); +	if (!uarg->ops->link_skb) +		skb_zcopy_set(skb, uarg, NULL);  	return skb->len - orig_len;  }  EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); @@ -2278,9 +2267,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,  		if (skb_has_frag_list(skb))  			skb_clone_fraglist(skb); -		skb_release_data(skb, SKB_CONSUMED, false); +		skb_release_data(skb, SKB_CONSUMED);  	} else { -		skb_free_head(skb, false); +		skb_free_head(skb);  	}  	off = (data + nhead) - skb->head; @@ -6586,12 +6575,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,  			skb_frag_ref(skb, i);  		if (skb_has_frag_list(skb))  			skb_clone_fraglist(skb); -		skb_release_data(skb, SKB_CONSUMED, false); +		skb_release_data(skb, SKB_CONSUMED);  	} else {  		/* we can reuse existing recount- all we did was  		 * relocate values  		 */ -		skb_free_head(skb, false); +		skb_free_head(skb);  	}  	skb->head = data; @@ -6726,7 +6715,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,  		skb_kfree_head(data, size);  		return -ENOMEM;  	} -	skb_release_data(skb, SKB_CONSUMED, false); +	skb_release_data(skb, SKB_CONSUMED);  	skb->head = data;  	skb->head_frag = 0; @@ -7006,6 +6995,19 @@ free_now:  EXPORT_SYMBOL(__skb_ext_put);  #endif /* CONFIG_SKB_EXTENSIONS */ +static void kfree_skb_napi_cache(struct sk_buff *skb) +{ +	/* if SKB is a clone, don't handle this case */ +	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { +		__kfree_skb(skb); +		return; +	} + +	local_bh_disable(); +	__napi_kfree_skb(skb, SKB_CONSUMED); +	local_bh_enable(); +} +  /**   * skb_attempt_defer_free - queue skb for remote freeing   * @skb: buffer @@ -7021,10 +7023,10 @@ void skb_attempt_defer_free(struct sk_buff *skb)  	unsigned int defer_max;  	bool kick; -	if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || -	    !cpu_online(cpu) || -	    cpu == raw_smp_processor_id()) { -nodefer:	__kfree_skb(skb); +	if (cpu == raw_smp_processor_id() || +	    WARN_ON_ONCE(cpu >= nr_cpu_ids) || +	    !cpu_online(cpu)) { +nodefer:	kfree_skb_napi_cache(skb);  		return;  	} @@ -7032,7 +7034,7 @@ nodefer:	__kfree_skb(skb);  	DEBUG_NET_WARN_ON_ONCE(skb->destructor);  	sd = &per_cpu(softnet_data, cpu); -	defer_max = READ_ONCE(sysctl_skb_defer_max); +	defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);  	if (READ_ONCE(sd->defer_count) >= defer_max)  		goto nodefer; @@ -7050,8 +7052,8 @@ nodefer:	__kfree_skb(skb);  	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU  	 * if we are unlucky enough (this seems very 
unlikely).  	 */ -	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) -		smp_call_function_single_async(cpu, &sd->defer_csd); +	if (unlikely(kick)) +		kick_defer_list_purge(sd, cpu);  }  static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, @@ -7084,7 +7086,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,  ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,  			     ssize_t maxsize, gfp_t gfp)  { -	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); +	size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);  	struct page *pages[8], **ppages = pages;  	ssize_t spliced = 0, ret = 0;  	unsigned int i; diff --git a/net/core/sock.c b/net/core/sock.c index 0963689a5950..8629f9aecf91 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -127,6 +127,7 @@  #include <net/net_namespace.h>  #include <net/request_sock.h>  #include <net/sock.h> +#include <net/proto_memory.h>  #include <linux/net_tstamp.h>  #include <net/xfrm.h>  #include <linux/ipsec.h> @@ -283,7 +284,6 @@ __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;  EXPORT_SYMBOL(sysctl_rmem_max);  __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;  __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE;  int sysctl_tstamp_allow_data __read_mostly = 1; @@ -2526,13 +2526,12 @@ EXPORT_SYMBOL(skb_set_owner_w);  static bool can_skb_orphan_partial(const struct sk_buff *skb)  { -#ifdef CONFIG_TLS_DEVICE  	/* Drivers depend on in-order delivery for crypto offload,  	 * partial orphan breaks out-of-order-OK logic.  	 */ -	if (skb->decrypted) +	if (skb_is_decrypted(skb))  		return false; -#endif +  	return (skb->destructor == sock_wfree ||  		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));  } @@ -3242,8 +3241,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)  }  EXPORT_SYMBOL(sock_no_socketpair); -int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, -		   bool kern) +int sock_no_accept(struct socket *sock, struct socket *newsock, +		   struct proto_accept_arg *arg)  {  	return -EOPNOTSUPP;  } @@ -3338,7 +3337,7 @@ static void sock_def_error_report(struct sock *sk)  	wq = rcu_dereference(sk->sk_wq);  	if (skwq_has_sleeper(wq))  		wake_up_interruptible_poll(&wq->wait, EPOLLERR); -	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); +	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);  	rcu_read_unlock();  } @@ -3353,7 +3352,7 @@ void sock_def_readable(struct sock *sk)  	if (skwq_has_sleeper(wq))  		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |  						EPOLLRDNORM | EPOLLRDBAND); -	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); +	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);  	rcu_read_unlock();  } @@ -3373,7 +3372,7 @@ static void sock_def_write_space(struct sock *sk)  						EPOLLWRNORM | EPOLLWRBAND);  		/* Should agree with poll, otherwise some programs break */ -		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); +		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);  	}  	rcu_read_unlock(); @@ -3398,7 +3397,7 @@ static void sock_def_write_space_wfree(struct sock *sk)  						EPOLLWRNORM | EPOLLWRBAND);  		/* Should agree with poll, otherwise some programs break */ -		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); +		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);  	}  } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 8598466a3805..9402889840bf 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,8 +24,16 @@ struct 
bpf_stab {  #define SOCK_CREATE_FLAG_MASK				\  	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +/* This mutex is used to + *  - protect race between prog/link attach/detach and link prog update, and + *  - protect race between releasing and accessing map in bpf_link. + * A single global mutex lock is used since it is expected contention is low. + */ +static DEFINE_MUTEX(sockmap_mutex); +  static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, -				struct bpf_prog *old, u32 which); +				struct bpf_prog *old, struct bpf_link *link, +				u32 which);  static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);  static struct bpf_map *sock_map_alloc(union bpf_attr *attr) @@ -71,7 +79,9 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)  	map = __bpf_map_get(f);  	if (IS_ERR(map))  		return PTR_ERR(map); -	ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); +	mutex_lock(&sockmap_mutex); +	ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); +	mutex_unlock(&sockmap_mutex);  	fdput(f);  	return ret;  } @@ -103,7 +113,9 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)  		goto put_prog;  	} -	ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); +	mutex_lock(&sockmap_mutex); +	ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); +	mutex_unlock(&sockmap_mutex);  put_prog:  	bpf_prog_put(prog);  put_map: @@ -1460,55 +1472,84 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)  	return NULL;  } -static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, -				u32 which) +static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, +				     struct bpf_link ***plink, u32 which)  {  	struct sk_psock_progs *progs = sock_map_progs(map); +	struct bpf_prog **cur_pprog; +	struct bpf_link **cur_plink;  	if (!progs)  		return -EOPNOTSUPP;  	switch (which) {  	case BPF_SK_MSG_VERDICT: -		*pprog = &progs->msg_parser; +		cur_pprog = &progs->msg_parser; +		cur_plink = &progs->msg_parser_link;  		break;  #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)  	case BPF_SK_SKB_STREAM_PARSER: -		*pprog = &progs->stream_parser; +		cur_pprog = &progs->stream_parser; +		cur_plink = &progs->stream_parser_link;  		break;  #endif  	case BPF_SK_SKB_STREAM_VERDICT:  		if (progs->skb_verdict)  			return -EBUSY; -		*pprog = &progs->stream_verdict; +		cur_pprog = &progs->stream_verdict; +		cur_plink = &progs->stream_verdict_link;  		break;  	case BPF_SK_SKB_VERDICT:  		if (progs->stream_verdict)  			return -EBUSY; -		*pprog = &progs->skb_verdict; +		cur_pprog = &progs->skb_verdict; +		cur_plink = &progs->skb_verdict_link;  		break;  	default:  		return -EOPNOTSUPP;  	} +	*pprog = cur_pprog; +	if (plink) +		*plink = cur_plink;  	return 0;  } +/* Handle the following four cases: + * prog_attach: prog != NULL, old == NULL, link == NULL + * prog_detach: prog == NULL, old != NULL, link == NULL + * link_attach: prog != NULL, old == NULL, link != NULL + * link_detach: prog == NULL, old != NULL, link != NULL + */  static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, -				struct bpf_prog *old, u32 which) +				struct bpf_prog *old, struct bpf_link *link, +				u32 which)  {  	struct bpf_prog **pprog; +	struct bpf_link **plink;  	int ret; -	ret = sock_map_prog_lookup(map, &pprog, which); +	ret = sock_map_prog_link_lookup(map, &pprog, &plink, which);  	if (ret)  		return ret; -	if (old) -		return psock_replace_prog(pprog, prog, 
old); +	/* for prog_attach/prog_detach/link_attach, return error if a bpf_link +	 * exists for that prog. +	 */ +	if ((!link || prog) && *plink) +		return -EBUSY; -	psock_set_prog(pprog, prog); -	return 0; +	if (old) { +		ret = psock_replace_prog(pprog, prog, old); +		if (!ret) +			*plink = NULL; +	} else { +		psock_set_prog(pprog, prog); +		if (link) +			*plink = link; +	} + +	return ret;  }  int sock_map_bpf_prog_query(const union bpf_attr *attr, @@ -1533,7 +1574,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr,  	rcu_read_lock(); -	ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); +	ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type);  	if (ret)  		goto end; @@ -1663,6 +1704,196 @@ void sock_map_close(struct sock *sk, long timeout)  }  EXPORT_SYMBOL_GPL(sock_map_close); +struct sockmap_link { +	struct bpf_link link; +	struct bpf_map *map; +	enum bpf_attach_type attach_type; +}; + +static void sock_map_link_release(struct bpf_link *link) +{ +	struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + +	mutex_lock(&sockmap_mutex); +	if (!sockmap_link->map) +		goto out; + +	WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, +					  sockmap_link->attach_type)); + +	bpf_map_put_with_uref(sockmap_link->map); +	sockmap_link->map = NULL; +out: +	mutex_unlock(&sockmap_mutex); +} + +static int sock_map_link_detach(struct bpf_link *link) +{ +	sock_map_link_release(link); +	return 0; +} + +static void sock_map_link_dealloc(struct bpf_link *link) +{ +	kfree(link); +} + +/* Handle the following two cases: + * case 1: link != NULL, prog != NULL, old != NULL + * case 2: link != NULL, prog != NULL, old == NULL + */ +static int sock_map_link_update_prog(struct bpf_link *link, +				     struct bpf_prog *prog, +				     struct bpf_prog *old) +{ +	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); +	struct bpf_prog **pprog, *old_link_prog; +	struct bpf_link **plink; +	int ret = 0; + +	mutex_lock(&sockmap_mutex); + +	/* If old prog is not NULL, ensure old prog is the same as link->prog. */ +	if (old && link->prog != old) { +		ret = -EPERM; +		goto out; +	} +	/* Ensure link->prog has the same type/attach_type as the new prog. */ +	if (link->prog->type != prog->type || +	    link->prog->expected_attach_type != prog->expected_attach_type) { +		ret = -EINVAL; +		goto out; +	} + +	ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, +					sockmap_link->attach_type); +	if (ret) +		goto out; + +	/* return error if the stored bpf_link does not match the incoming bpf_link. 
*/ +	if (link != *plink) { +		ret = -EBUSY; +		goto out; +	} + +	if (old) { +		ret = psock_replace_prog(pprog, prog, old); +		if (ret) +			goto out; +	} else { +		psock_set_prog(pprog, prog); +	} + +	bpf_prog_inc(prog); +	old_link_prog = xchg(&link->prog, prog); +	bpf_prog_put(old_link_prog); + +out: +	mutex_unlock(&sockmap_mutex); +	return ret; +} + +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) +{ +	u32 map_id = 0; + +	mutex_lock(&sockmap_mutex); +	if (sockmap_link->map) +		map_id = sockmap_link->map->id; +	mutex_unlock(&sockmap_mutex); +	return map_id; +} + +static int sock_map_link_fill_info(const struct bpf_link *link, +				   struct bpf_link_info *info) +{ +	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); +	u32 map_id = sock_map_link_get_map_id(sockmap_link); + +	info->sockmap.map_id = map_id; +	info->sockmap.attach_type = sockmap_link->attach_type; +	return 0; +} + +static void sock_map_link_show_fdinfo(const struct bpf_link *link, +				      struct seq_file *seq) +{ +	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); +	u32 map_id = sock_map_link_get_map_id(sockmap_link); + +	seq_printf(seq, "map_id:\t%u\n", map_id); +	seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); +} + +static const struct bpf_link_ops sock_map_link_ops = { +	.release = sock_map_link_release, +	.dealloc = sock_map_link_dealloc, +	.detach = sock_map_link_detach, +	.update_prog = sock_map_link_update_prog, +	.fill_link_info = sock_map_link_fill_info, +	.show_fdinfo = sock_map_link_show_fdinfo, +}; + +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ +	struct bpf_link_primer link_primer; +	struct sockmap_link *sockmap_link; +	enum bpf_attach_type attach_type; +	struct bpf_map *map; +	int ret; + +	if (attr->link_create.flags) +		return -EINVAL; + +	map = bpf_map_get_with_uref(attr->link_create.target_fd); +	if (IS_ERR(map)) +		return PTR_ERR(map); +	if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { +		ret = -EINVAL; +		goto out; +	} + +	sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); +	if (!sockmap_link) { +		ret = -ENOMEM; +		goto out; +	} + +	attach_type = attr->link_create.attach_type; +	bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); +	sockmap_link->map = map; +	sockmap_link->attach_type = attach_type; + +	ret = bpf_link_prime(&sockmap_link->link, &link_primer); +	if (ret) { +		kfree(sockmap_link); +		goto out; +	} + +	mutex_lock(&sockmap_mutex); +	ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); +	mutex_unlock(&sockmap_mutex); +	if (ret) { +		bpf_link_cleanup(&link_primer); +		goto out; +	} + +	/* Increase refcnt for the prog since when old prog is replaced with +	 * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. +	 * +	 * Actually, we do not need to increase refcnt for the prog since bpf_link +	 * will hold a reference. But in order to have less complexity w.r.t. +	 * replacing/setting prog, let us increase the refcnt to make things simpler. 
+	 */ +	bpf_prog_inc(prog); + +	return bpf_link_settle(&link_primer); + +out: +	bpf_map_put_with_uref(map); +	return ret; +} +  static int sock_map_iter_attach_target(struct bpf_prog *prog,  				       union bpf_iter_link_info *linfo,  				       struct bpf_iter_aux_info *aux) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 6973dda3abda..c9fb9ad87485 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@  #include <net/busy_poll.h>  #include <net/pkt_sched.h>  #include <net/hotdata.h> +#include <net/proto_memory.h>  #include <net/rps.h>  #include "dev.h" @@ -415,7 +416,7 @@ static struct ctl_table net_core_table[] = {  	},  	{  		.procname	= "mem_pcpu_rsv", -		.data		= &sysctl_mem_pcpu_rsv, +		.data		= &net_hotdata.sysctl_mem_pcpu_rsv,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_minmax, @@ -595,7 +596,7 @@ static struct ctl_table net_core_table[] = {  	},  	{  		.procname	= "max_skb_frags", -		.data		= &sysctl_max_skb_frags, +		.data		= &net_hotdata.sysctl_max_skb_frags,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_minmax, @@ -654,13 +655,12 @@ static struct ctl_table net_core_table[] = {  	},  	{  		.procname	= "skb_defer_max", -		.data		= &sysctl_skb_defer_max, +		.data		= &net_hotdata.sysctl_skb_defer_max,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_minmax,  		.extra1		= SYSCTL_ZERO,  	}, -	{ }  };  static struct ctl_table netns_core_table[] = { @@ -697,7 +697,6 @@ static struct ctl_table netns_core_table[] = {  		.extra2		= SYSCTL_ONE,  		.proc_handler	= proc_dou8vec_minmax,  	}, -	{ }  };  static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) @@ -715,20 +714,21 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);  static __net_init int sysctl_core_net_init(struct net *net)  { -	struct ctl_table *tbl, *tmp; +	size_t table_size = ARRAY_SIZE(netns_core_table); +	struct ctl_table *tbl;  	tbl = netns_core_table;  	if (!net_eq(net, &init_net)) { +		int i;  		tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);  		if (tbl == NULL)  			goto err_dup; -		for (tmp = tbl; tmp->procname; tmp++) -			tmp->data += (char *)net - (char *)&init_net; +		for (i = 0; i < table_size; ++i) +			tbl[i].data += (char *)net - (char *)&init_net;  	} -	net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, -						      ARRAY_SIZE(netns_core_table)); +	net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size);  	if (net->core.sysctl_hdr == NULL)  		goto err_reg; @@ -743,7 +743,7 @@ err_dup:  static __net_exit void sysctl_core_net_exit(struct net *net)  { -	struct ctl_table *tbl; +	const struct ctl_table *tbl;  	tbl = net->core.sysctl_hdr->ctl_table_arg;  	unregister_net_sysctl_table(net->core.sysctl_hdr);  |
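
The sock_map.c hunks above introduce BPF_LINK_TYPE_SOCKMAP, so an sk_msg or sk_skb program can be attached to a sockmap/sockhash through a bpf_link rather than BPF_PROG_ATTACH, with sockmap_mutex serializing attach/detach against link updates. A minimal userspace sketch of how such a link might be created, assuming a libbpf with bpf_link_create() support for this link type; prog_fd and map_fd are hypothetical descriptors for an already-loaded sk_msg verdict program and a BPF_MAP_TYPE_SOCKMAP map, not part of the patch:

#include <bpf/bpf.h>
#include <stdio.h>

/* Hypothetical helper (assumption, not from the patch): attach an sk_msg
 * verdict program to a sockmap via a bpf_link instead of BPF_PROG_ATTACH.
 */
static int sockmap_msg_verdict_link(int prog_fd, int map_fd)
{
	/* target_fd is the sockmap fd; per the diff, the kernel side takes
	 * sockmap_mutex and fails with -EBUSY if a link already owns the
	 * msg_parser attach slot.
	 */
	int link_fd = bpf_link_create(prog_fd, map_fd, BPF_SK_MSG_VERDICT, NULL);

	if (link_fd < 0)
		perror("bpf_link_create");
	return link_fd;
}

Closing the returned link fd drops the last reference; sock_map_link_release() then clears the stored program via sock_map_prog_update() and releases the map uref, which is the same path exercised by sock_map_link_detach() in the diff.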