Diffstat (limited to 'net/core')
| -rw-r--r-- | net/core/datagram.c | 104 |
| -rw-r--r-- | net/core/dev.c | 285 |
| -rw-r--r-- | net/core/dev_ioctl.c | 20 |
| -rw-r--r-- | net/core/devlink.c | 8 |
| -rw-r--r-- | net/core/dst.c | 300 |
| -rw-r--r-- | net/core/ethtool.c | 20 |
| -rw-r--r-- | net/core/fib_rules.c | 27 |
| -rw-r--r-- | net/core/filter.c | 685 |
| -rw-r--r-- | net/core/flow_dissector.c | 69 |
| -rw-r--r-- | net/core/lwt_bpf.c | 5 |
| -rw-r--r-- | net/core/lwtunnel.c | 38 |
| -rw-r--r-- | net/core/neighbour.c | 94 |
| -rw-r--r-- | net/core/net-procfs.c | 13 |
| -rw-r--r-- | net/core/net-sysfs.c | 16 |
| -rw-r--r-- | net/core/net_namespace.c | 83 |
| -rw-r--r-- | net/core/netpoll.c | 14 |
| -rw-r--r-- | net/core/pktgen.c | 58 |
| -rw-r--r-- | net/core/rtnetlink.c | 199 |
| -rw-r--r-- | net/core/secure_seq.c | 36 |
| -rw-r--r-- | net/core/skbuff.c | 193 |
| -rw-r--r-- | net/core/sock.c | 122 |
| -rw-r--r-- | net/core/sysctl_net_core.c | 2 |
22 files changed, 1620 insertions(+), 771 deletions(-)
| diff --git a/net/core/datagram.c b/net/core/datagram.c index db1866f2ffcf..6877c43cc92d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -68,7 +68,7 @@ static inline int connection_based(struct sock *sk)  	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;  } -static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync, +static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,  				  void *key)  {  	unsigned long bits = (unsigned long)key; @@ -161,6 +161,45 @@ done:  	return skb;  } +struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, +					  struct sk_buff_head *queue, +					  unsigned int flags, +					  void (*destructor)(struct sock *sk, +							   struct sk_buff *skb), +					  int *peeked, int *off, int *err, +					  struct sk_buff **last) +{ +	struct sk_buff *skb; +	int _off = *off; + +	*last = queue->prev; +	skb_queue_walk(queue, skb) { +		if (flags & MSG_PEEK) { +			if (_off >= skb->len && (skb->len || _off || +						 skb->peeked)) { +				_off -= skb->len; +				continue; +			} +			if (!skb->len) { +				skb = skb_set_peeked(skb); +				if (unlikely(IS_ERR(skb))) { +					*err = PTR_ERR(skb); +					return NULL; +				} +			} +			*peeked = 1; +			refcount_inc(&skb->users); +		} else { +			__skb_unlink(skb, queue); +			if (destructor) +				destructor(sk, skb); +		} +		*off = _off; +		return skb; +	} +	return NULL; +} +  /**   *	__skb_try_recv_datagram - Receive a datagram skbuff   *	@sk: socket @@ -181,7 +220,7 @@ done:   *   *	This function will lock the socket if a skb is returned, so   *	the caller needs to unlock the socket in that case (usually by - *	calling skb_free_datagram). Returns NULL with *err set to + *	calling skb_free_datagram). Returns NULL with @err set to   *	-EAGAIN if no data was available or to some other value if an   *	error was detected.   * @@ -222,40 +261,14 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,  		 * Look at current nfs client by the way...  		 * However, this function was correct in any case. 
8)  		 */ -		int _off = *off; - -		*last = (struct sk_buff *)queue;  		spin_lock_irqsave(&queue->lock, cpu_flags); -		skb_queue_walk(queue, skb) { -			*last = skb; -			if (flags & MSG_PEEK) { -				if (_off >= skb->len && (skb->len || _off || -							 skb->peeked)) { -					_off -= skb->len; -					continue; -				} -				if (!skb->len) { -					skb = skb_set_peeked(skb); -					if (IS_ERR(skb)) { -						error = PTR_ERR(skb); -						spin_unlock_irqrestore(&queue->lock, -								       cpu_flags); -						goto no_packet; -					} -				} -				*peeked = 1; -				atomic_inc(&skb->users); -			} else { -				__skb_unlink(skb, queue); -				if (destructor) -					destructor(sk, skb); -			} -			spin_unlock_irqrestore(&queue->lock, cpu_flags); -			*off = _off; -			return skb; -		} - +		skb = __skb_try_recv_from_queue(sk, queue, flags, destructor, +						peeked, off, &error, last);  		spin_unlock_irqrestore(&queue->lock, cpu_flags); +		if (error) +			goto no_packet; +		if (skb) +			return skb;  		if (!sk_can_busy_loop(sk))  			break; @@ -317,9 +330,7 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)  {  	bool slow; -	if (likely(atomic_read(&skb->users) == 1)) -		smp_rmb(); -	else if (likely(!atomic_dec_and_test(&skb->users))) { +	if (!skb_unref(skb)) {  		sk_peek_offset_bwd(sk, len);  		return;  	} @@ -335,8 +346,8 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)  }  EXPORT_SYMBOL(__skb_free_datagram_locked); -int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, -			unsigned int flags, +int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, +			struct sk_buff *skb, unsigned int flags,  			void (*destructor)(struct sock *sk,  					   struct sk_buff *skb))  { @@ -344,15 +355,15 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,  	if (flags & MSG_PEEK) {  		err = -ENOENT; -		spin_lock_bh(&sk->sk_receive_queue.lock); -		if (skb == skb_peek(&sk->sk_receive_queue)) { -			__skb_unlink(skb, &sk->sk_receive_queue); -			atomic_dec(&skb->users); +		spin_lock_bh(&sk_queue->lock); +		if (skb == skb_peek(sk_queue)) { +			__skb_unlink(skb, sk_queue); +			refcount_dec(&skb->users);  			if (destructor)  				destructor(sk, skb);  			err = 0;  		} -		spin_unlock_bh(&sk->sk_receive_queue.lock); +		spin_unlock_bh(&sk_queue->lock);  	}  	atomic_inc(&sk->sk_drops); @@ -383,7 +394,8 @@ EXPORT_SYMBOL(__sk_queue_drop_skb);  int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)  { -	int err = __sk_queue_drop_skb(sk, skb, flags, NULL); +	int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags, +				      NULL);  	kfree_skb(skb);  	sk_mem_reclaim_partial(sk); @@ -602,7 +614,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)  		skb->data_len += copied;  		skb->len += copied;  		skb->truesize += truesize; -		atomic_add(truesize, &skb->sk->sk_wmem_alloc); +		refcount_add(truesize, &skb->sk->sk_wmem_alloc);  		while (copied) {  			int size = min_t(int, copied, PAGE_SIZE - start);  			skb_fill_page_desc(skb, frag++, pages[n], start, size); diff --git a/net/core/dev.c b/net/core/dev.c index d07aa5ffb511..7098fba52be1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -81,6 +81,7 @@  #include <linux/hash.h>  #include <linux/slab.h>  #include <linux/sched.h> +#include <linux/sched/mm.h>  #include <linux/mutex.h>  #include <linux/string.h>  #include <linux/mm.h> @@ -104,6 +105,7 @@  #include <net/dst.h>  #include <net/dst_metadata.h>  #include <net/pkt_sched.h> +#include 
<net/pkt_cls.h>  #include <net/checksum.h>  #include <net/xfrm.h>  #include <linux/highmem.h> @@ -141,6 +143,7 @@  #include <linux/hrtimer.h>  #include <linux/netfilter_ingress.h>  #include <linux/crash_dump.h> +#include <linux/sctp.h>  #include "net-sysfs.h" @@ -160,6 +163,7 @@ static int netif_rx_internal(struct sk_buff *skb);  static int call_netdevice_notifiers_info(unsigned long val,  					 struct net_device *dev,  					 struct netdev_notifier_info *info); +static struct napi_struct *napi_by_id(unsigned int napi_id);  /*   * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -864,6 +868,31 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)  EXPORT_SYMBOL(dev_get_by_index);  /** + *	dev_get_by_napi_id - find a device by napi_id + *	@napi_id: ID of the NAPI struct + * + *	Search for an interface by NAPI ID. Returns %NULL if the device + *	is not found or a pointer to the device. The device has not had + *	its reference counter increased so the caller must be careful + *	about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_napi_id(unsigned int napi_id) +{ +	struct napi_struct *napi; + +	WARN_ON_ONCE(!rcu_read_lock_held()); + +	if (napi_id < MIN_NAPI_ID) +		return NULL; + +	napi = napi_by_id(napi_id); + +	return napi ? napi->dev : NULL; +} +EXPORT_SYMBOL(dev_get_by_napi_id); + +/**   *	netdev_get_name - get a netdevice name, knowing its ifindex.   *	@net: network namespace   *	@name: a pointer to the buffer where the name will be stored. @@ -1252,8 +1281,9 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)  	if (!new_ifalias)  		return -ENOMEM;  	dev->ifalias = new_ifalias; +	memcpy(dev->ifalias, alias, len); +	dev->ifalias[len] = 0; -	strlcpy(dev->ifalias, alias, len+1);  	return len;  } @@ -1832,7 +1862,7 @@ static inline int deliver_skb(struct sk_buff *skb,  {  	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))  		return -ENOMEM; -	atomic_inc(&skb->users); +	refcount_inc(&skb->users);  	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);  } @@ -2454,10 +2484,10 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)  	if (unlikely(!skb))  		return; -	if (likely(atomic_read(&skb->users) == 1)) { +	if (likely(refcount_read(&skb->users) == 1)) {  		smp_rmb(); -		atomic_set(&skb->users, 0); -	} else if (likely(!atomic_dec_and_test(&skb->users))) { +		refcount_set(&skb->users, 0); +	} else if (likely(!refcount_dec_and_test(&skb->users))) {  		return;  	}  	get_kfree_skb_cb(skb)->reason = reason; @@ -2610,6 +2640,47 @@ out:  }  EXPORT_SYMBOL(skb_checksum_help); +int skb_crc32c_csum_help(struct sk_buff *skb) +{ +	__le32 crc32c_csum; +	int ret = 0, offset, start; + +	if (skb->ip_summed != CHECKSUM_PARTIAL) +		goto out; + +	if (unlikely(skb_is_gso(skb))) +		goto out; + +	/* Before computing a checksum, we should make sure no frag could +	 * be modified by an external entity : checksum could be wrong. 
+	 */ +	if (unlikely(skb_has_shared_frag(skb))) { +		ret = __skb_linearize(skb); +		if (ret) +			goto out; +	} +	start = skb_checksum_start_offset(skb); +	offset = start + offsetof(struct sctphdr, checksum); +	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { +		ret = -EINVAL; +		goto out; +	} +	if (skb_cloned(skb) && +	    !skb_clone_writable(skb, offset + sizeof(__le32))) { +		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +		if (ret) +			goto out; +	} +	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, +						  skb->len - start, ~(__u32)0, +						  crc32c_csum_stub)); +	*(__le32 *)(skb->data + offset) = crc32c_csum; +	skb->ip_summed = CHECKSUM_NONE; +	skb->csum_not_inet = 0; +out: +	return ret; +} +  __be16 skb_network_protocol(struct sk_buff *skb, int *depth)  {  	__be16 type = skb->protocol; @@ -2952,6 +3023,17 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,  	return skb;  } +int skb_csum_hwoffload_help(struct sk_buff *skb, +			    const netdev_features_t features) +{ +	if (unlikely(skb->csum_not_inet)) +		return !!(features & NETIF_F_SCTP_CRC) ? 0 : +			skb_crc32c_csum_help(skb); + +	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb); +} +EXPORT_SYMBOL(skb_csum_hwoffload_help); +  static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)  {  	netdev_features_t features; @@ -2990,8 +3072,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device  			else  				skb_set_transport_header(skb,  							 skb_checksum_start_offset(skb)); -			if (!(features & NETIF_F_CSUM_MASK) && -			    skb_checksum_help(skb)) +			if (skb_csum_hwoffload_help(skb, features))  				goto out_kfree_skb;  		}  	} @@ -3177,7 +3258,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)  	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. 
*/  	qdisc_bstats_cpu_update(cl->q, skb); -	switch (tc_classify(skb, cl, &cl_res, false)) { +	switch (tcf_classify(skb, cl, &cl_res, false)) {  	case TC_ACT_OK:  	case TC_ACT_RECLASSIFY:  		skb->tc_index = TC_H_MIN(cl_res.classid); @@ -3189,6 +3270,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)  		return NULL;  	case TC_ACT_STOLEN:  	case TC_ACT_QUEUED: +	case TC_ACT_TRAP:  		*ret = NET_XMIT_SUCCESS;  		consume_skb(skb);  		return NULL; @@ -3873,7 +3955,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)  			clist = clist->next; -			WARN_ON(atomic_read(&skb->users)); +			WARN_ON(refcount_read(&skb->users));  			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))  				trace_consume_skb(skb);  			else @@ -3947,7 +4029,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,  	skb->tc_at_ingress = 1;  	qdisc_bstats_cpu_update(cl->q, skb); -	switch (tc_classify(skb, cl, &cl_res, false)) { +	switch (tcf_classify(skb, cl, &cl_res, false)) {  	case TC_ACT_OK:  	case TC_ACT_RECLASSIFY:  		skb->tc_index = TC_H_MIN(cl_res.classid); @@ -3958,6 +4040,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,  		return NULL;  	case TC_ACT_STOLEN:  	case TC_ACT_QUEUED: +	case TC_ACT_TRAP:  		consume_skb(skb);  		return NULL;  	case TC_ACT_REDIRECT: @@ -4235,7 +4318,7 @@ static int __netif_receive_skb(struct sk_buff *skb)  	int ret;  	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { -		unsigned long pflags = current->flags; +		unsigned int noreclaim_flag;  		/*  		 * PFMEMALLOC skbs are special, they should @@ -4246,9 +4329,9 @@ static int __netif_receive_skb(struct sk_buff *skb)  		 * Use PF_MEMALLOC as this saves us from propagating the allocation  		 * context down to all allocation sites.  		 */ -		current->flags |= PF_MEMALLOC; +		noreclaim_flag = memalloc_noreclaim_save();  		ret = __netif_receive_skb_core(skb, true); -		current_restore_flags(pflags, PF_MEMALLOC); +		memalloc_noreclaim_restore(noreclaim_flag);  	} else  		ret = __netif_receive_skb_core(skb, false); @@ -4259,13 +4342,12 @@ static struct static_key generic_xdp_needed __read_mostly;  static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp)  { +	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);  	struct bpf_prog *new = xdp->prog;  	int ret = 0;  	switch (xdp->command) { -	case XDP_SETUP_PROG: { -		struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); - +	case XDP_SETUP_PROG:  		rcu_assign_pointer(dev->xdp_prog, new);  		if (old)  			bpf_prog_put(old); @@ -4277,10 +4359,10 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp)  			dev_disable_lro(dev);  		}  		break; -	}  	case XDP_QUERY_PROG: -		xdp->prog_attached = !!rcu_access_pointer(dev->xdp_prog); +		xdp->prog_attached = !!old; +		xdp->prog_id = old ? 
old->aux->id : 0;  		break;  	default: @@ -4635,9 +4717,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff  	if (netif_elide_gro(skb->dev))  		goto normal; -	if (skb->csum_bad) -		goto normal; -  	gro_list_prepare(napi, skb);  	rcu_read_lock(); @@ -4765,6 +4844,13 @@ struct packet_offload *gro_find_complete_by_type(__be16 type)  }  EXPORT_SYMBOL(gro_find_complete_by_type); +static void napi_skb_free_stolen_head(struct sk_buff *skb) +{ +	skb_dst_drop(skb); +	secpath_reset(skb); +	kmem_cache_free(skbuff_head_cache, skb); +} +  static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)  {  	switch (ret) { @@ -4778,13 +4864,10 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)  		break;  	case GRO_MERGED_FREE: -		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { -			skb_dst_drop(skb); -			secpath_reset(skb); -			kmem_cache_free(skbuff_head_cache, skb); -		} else { +		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) +			napi_skb_free_stolen_head(skb); +		else  			__kfree_skb(skb); -		}  		break;  	case GRO_HELD: @@ -4856,10 +4939,16 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,  		break;  	case GRO_DROP: -	case GRO_MERGED_FREE:  		napi_reuse_skb(napi, skb);  		break; +	case GRO_MERGED_FREE: +		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) +			napi_skb_free_stolen_head(skb); +		else +			napi_reuse_skb(napi, skb); +		break; +  	case GRO_MERGED:  	case GRO_CONSUMED:  		break; @@ -4947,6 +5036,19 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)  }  EXPORT_SYMBOL(__skb_gro_checksum_complete); +static void net_rps_send_ipi(struct softnet_data *remsd) +{ +#ifdef CONFIG_RPS +	while (remsd) { +		struct softnet_data *next = remsd->rps_ipi_next; + +		if (cpu_online(remsd->cpu)) +			smp_call_function_single_async(remsd->cpu, &remsd->csd); +		remsd = next; +	} +#endif +} +  /*   * net_rps_action_and_irq_enable sends any pending IPI's for rps.   * Note: called with local irq disabled, but exits with local irq enabled. @@ -4962,14 +5064,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)  		local_irq_enable();  		/* Send pending IPI's to kick RPS processing on remote cpus. */ -		while (remsd) { -			struct softnet_data *next = remsd->rps_ipi_next; - -			if (cpu_online(remsd->cpu)) -				smp_call_function_single_async(remsd->cpu, -							   &remsd->csd); -			remsd = next; -		} +		net_rps_send_ipi(remsd);  	} else  #endif  		local_irq_enable(); @@ -5198,8 +5293,6 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)  	if (rc == BUSY_POLL_BUDGET)  		__napi_schedule(napi);  	local_bh_enable(); -	if (local_softirq_pending()) -		do_softirq();  }  void napi_busy_loop(unsigned int napi_id, @@ -6851,6 +6944,39 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)  }  EXPORT_SYMBOL(dev_change_proto_down); +u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id) +{ +	struct netdev_xdp xdp; + +	memset(&xdp, 0, sizeof(xdp)); +	xdp.command = XDP_QUERY_PROG; + +	/* Query must always succeed. 
*/ +	WARN_ON(xdp_op(dev, &xdp) < 0); +	if (prog_id) +		*prog_id = xdp.prog_id; + +	return xdp.prog_attached; +} + +static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, +			   struct netlink_ext_ack *extack, u32 flags, +			   struct bpf_prog *prog) +{ +	struct netdev_xdp xdp; + +	memset(&xdp, 0, sizeof(xdp)); +	if (flags & XDP_FLAGS_HW_MODE) +		xdp.command = XDP_SETUP_PROG_HW; +	else +		xdp.command = XDP_SETUP_PROG; +	xdp.extack = extack; +	xdp.flags = flags; +	xdp.prog = prog; + +	return xdp_op(dev, &xdp); +} +  /**   *	dev_change_xdp_fd - set or clear a bpf program for a device rx path   *	@dev: device @@ -6863,41 +6989,34 @@ EXPORT_SYMBOL(dev_change_proto_down);  int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,  		      int fd, u32 flags)  { -	int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp);  	const struct net_device_ops *ops = dev->netdev_ops;  	struct bpf_prog *prog = NULL; -	struct netdev_xdp xdp; +	xdp_op_t xdp_op, xdp_chk;  	int err;  	ASSERT_RTNL(); -	xdp_op = ops->ndo_xdp; +	xdp_op = xdp_chk = ops->ndo_xdp; +	if (!xdp_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) +		return -EOPNOTSUPP;  	if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE))  		xdp_op = generic_xdp_install; +	if (xdp_op == xdp_chk) +		xdp_chk = generic_xdp_install;  	if (fd >= 0) { -		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { -			memset(&xdp, 0, sizeof(xdp)); -			xdp.command = XDP_QUERY_PROG; - -			err = xdp_op(dev, &xdp); -			if (err < 0) -				return err; -			if (xdp.prog_attached) -				return -EBUSY; -		} +		if (xdp_chk && __dev_xdp_attached(dev, xdp_chk, NULL)) +			return -EEXIST; +		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && +		    __dev_xdp_attached(dev, xdp_op, NULL)) +			return -EBUSY;  		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);  		if (IS_ERR(prog))  			return PTR_ERR(prog);  	} -	memset(&xdp, 0, sizeof(xdp)); -	xdp.command = XDP_SETUP_PROG; -	xdp.extack = extack; -	xdp.prog = prog; - -	err = xdp_op(dev, &xdp); +	err = dev_xdp_install(dev, xdp_op, extack, flags, prog);  	if (err < 0 && prog)  		bpf_prog_put(prog); @@ -6988,7 +7107,7 @@ static void rollback_registered_many(struct list_head *head)  		if (!dev->rtnl_link_ops ||  		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED) -			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, +			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,  						     GFP_KERNEL);  		/* @@ -7264,12 +7383,10 @@ static int netif_alloc_rx_queues(struct net_device *dev)  	BUG_ON(count < 1); -	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); -	if (!rx) { -		rx = vzalloc(sz); -		if (!rx) -			return -ENOMEM; -	} +	rx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); +	if (!rx) +		return -ENOMEM; +  	dev->_rx = rx;  	for (i = 0; i < count; i++) @@ -7306,12 +7423,10 @@ static int netif_alloc_netdev_queues(struct net_device *dev)  	if (count < 1 || count > 0xffff)  		return -EINVAL; -	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); -	if (!tx) { -		tx = vzalloc(sz); -		if (!tx) -			return -ENOMEM; -	} +	tx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); +	if (!tx) +		return -ENOMEM; +  	dev->_tx = tx;  	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); @@ -7485,6 +7600,8 @@ out:  err_uninit:  	if (dev->netdev_ops->ndo_uninit)  		dev->netdev_ops->ndo_uninit(dev); +	if (dev->priv_destructor) +		dev->priv_destructor(dev);  	goto out;  }  EXPORT_SYMBOL(register_netdevice); @@ -7692,8 +7809,10 @@ void netdev_run_todo(void)  		WARN_ON(rcu_access_pointer(dev->ip6_ptr));  		WARN_ON(dev->dn_ptr); -		if 
(dev->destructor) -			dev->destructor(dev); +		if (dev->priv_destructor) +			dev->priv_destructor(dev); +		if (dev->needs_free_netdev) +			free_netdev(dev);  		/* Report a network device has been unregistered */  		rtnl_lock(); @@ -7716,7 +7835,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,  {  #if BITS_PER_LONG == 64  	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); -	memcpy(stats64, netdev_stats, sizeof(*stats64)); +	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));  	/* zero out counters that only exist in rtnl_link_stats64 */  	memset((char *)stats64 + sizeof(*netdev_stats), 0,  	       sizeof(*stats64) - sizeof(*netdev_stats)); @@ -7758,9 +7877,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,  	} else {  		netdev_stats_to_stats64(storage, &dev->stats);  	} -	storage->rx_dropped += atomic_long_read(&dev->rx_dropped); -	storage->tx_dropped += atomic_long_read(&dev->tx_dropped); -	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler); +	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped); +	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped); +	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);  	return storage;  }  EXPORT_SYMBOL(dev_get_stats); @@ -7845,9 +7964,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	/* ensure 32-byte alignment of whole construct */  	alloc_size += NETDEV_ALIGN - 1; -	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); -	if (!p) -		p = vzalloc(alloc_size); +	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_REPEAT);  	if (!p)  		return NULL; @@ -8178,7 +8295,7 @@ static int dev_cpu_dead(unsigned int oldcpu)  	struct sk_buff **list_skb;  	struct sk_buff *skb;  	unsigned int cpu; -	struct softnet_data *sd, *oldsd; +	struct softnet_data *sd, *oldsd, *remsd = NULL;  	local_irq_disable();  	cpu = smp_processor_id(); @@ -8219,6 +8336,13 @@ static int dev_cpu_dead(unsigned int oldcpu)  	raise_softirq_irqoff(NET_TX_SOFTIRQ);  	local_irq_enable(); +#ifdef CONFIG_RPS +	remsd = oldsd->rps_ipi_list; +	oldsd->rps_ipi_list = NULL; +#endif +	/* send out pending IPI's on offline CPU */ +	net_rps_send_ipi(remsd); +  	/* Process offline CPU's input_pkt_queue */  	while ((skb = __skb_dequeue(&oldsd->process_queue))) {  		netif_rx_ni(skb); @@ -8568,7 +8692,6 @@ static int __init net_dev_init(void)  	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",  				       NULL, dev_cpu_dead);  	WARN_ON(rc < 0); -	dst_subsys_init();  	rc = 0;  out:  	return rc; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index b94b1d293506..82fd4c9c4a1b 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -225,6 +225,7 @@ static int net_hwtstamp_validate(struct ifreq *ifr)  	case HWTSTAMP_FILTER_PTP_V2_EVENT:  	case HWTSTAMP_FILTER_PTP_V2_SYNC:  	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: +	case HWTSTAMP_FILTER_NTP_ALL:  		rx_filter_valid = 1;  		break;  	} @@ -410,6 +411,22 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)  	if (cmd == SIOCGIFNAME)  		return dev_ifname(net, (struct ifreq __user *)arg); +	/* +	 * Take care of Wireless Extensions. Unfortunately struct iwreq +	 * isn't a proper subset of struct ifreq (it's 8 byte shorter) +	 * so we need to treat it specially, otherwise applications may +	 * fault if the struct they're passing happens to land at the +	 * end of a mapped page. 
+	 */ +	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { +		struct iwreq iwr; + +		if (copy_from_user(&iwr, arg, sizeof(iwr))) +			return -EFAULT; + +		return wext_handle_ioctl(net, &iwr, cmd, arg); +	} +  	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))  		return -EFAULT; @@ -559,9 +576,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)  				ret = -EFAULT;  			return ret;  		} -		/* Take care of Wireless Extensions */ -		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) -			return wext_handle_ioctl(net, &ifr, cmd, arg);  		return -ENOTTY;  	}  } diff --git a/net/core/devlink.c b/net/core/devlink.c index b0b87a292e7c..a0adfc31a3fe 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1680,8 +1680,10 @@ start_again:  	hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,  			  &devlink_nl_family, NLM_F_MULTI, cmd); -	if (!hdr) +	if (!hdr) { +		nlmsg_free(skb);  		return -EMSGSIZE; +	}  	if (devlink_nl_put_handle(skb, devlink))  		goto nla_put_failure; @@ -2098,8 +2100,10 @@ start_again:  	hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,  			  &devlink_nl_family, NLM_F_MULTI, cmd); -	if (!hdr) +	if (!hdr) { +		nlmsg_free(skb);  		return -EMSGSIZE; +	}  	if (devlink_nl_put_handle(skb, devlink))  		goto nla_put_failure; diff --git a/net/core/dst.c b/net/core/dst.c index 960e503b5a52..00aa972ad1a1 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -42,108 +42,6 @@   * to dirty as few cache lines as possible in __dst_free().   * As this is not a very strong hint, we dont force an alignment on SMP.   */ -static struct { -	spinlock_t		lock; -	struct dst_entry	*list; -	unsigned long		timer_inc; -	unsigned long		timer_expires; -} dst_garbage = { -	.lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), -	.timer_inc = DST_GC_MAX, -}; -static void dst_gc_task(struct work_struct *work); -static void ___dst_free(struct dst_entry *dst); - -static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); - -static DEFINE_MUTEX(dst_gc_mutex); -/* - * long lived entries are maintained in this list, guarded by dst_gc_mutex - */ -static struct dst_entry         *dst_busy_list; - -static void dst_gc_task(struct work_struct *work) -{ -	int    delayed = 0; -	int    work_performed = 0; -	unsigned long expires = ~0L; -	struct dst_entry *dst, *next, head; -	struct dst_entry *last = &head; - -	mutex_lock(&dst_gc_mutex); -	next = dst_busy_list; - -loop: -	while ((dst = next) != NULL) { -		next = dst->next; -		prefetch(&next->next); -		cond_resched(); -		if (likely(atomic_read(&dst->__refcnt))) { -			last->next = dst; -			last = dst; -			delayed++; -			continue; -		} -		work_performed++; - -		dst = dst_destroy(dst); -		if (dst) { -			/* NOHASH and still referenced. Unless it is already -			 * on gc list, invalidate it and add to gc list. -			 * -			 * Note: this is temporary. Actually, NOHASH dst's -			 * must be obsoleted when parent is obsoleted. -			 * But we do not have state "obsoleted, but -			 * referenced by parent", so it is right. -			 */ -			if (dst->obsolete > 0) -				continue; - -			___dst_free(dst); -			dst->next = next; -			next = dst; -		} -	} - -	spin_lock_bh(&dst_garbage.lock); -	next = dst_garbage.list; -	if (next) { -		dst_garbage.list = NULL; -		spin_unlock_bh(&dst_garbage.lock); -		goto loop; -	} -	last->next = NULL; -	dst_busy_list = head.next; -	if (!dst_busy_list) -		dst_garbage.timer_inc = DST_GC_MAX; -	else { -		/* -		 * if we freed less than 1/10 of delayed entries, -		 * we can sleep longer. 
-		 */ -		if (work_performed <= delayed/10) { -			dst_garbage.timer_expires += dst_garbage.timer_inc; -			if (dst_garbage.timer_expires > DST_GC_MAX) -				dst_garbage.timer_expires = DST_GC_MAX; -			dst_garbage.timer_inc += DST_GC_INC; -		} else { -			dst_garbage.timer_inc = DST_GC_INC; -			dst_garbage.timer_expires = DST_GC_MIN; -		} -		expires = dst_garbage.timer_expires; -		/* -		 * if the next desired timer is more than 4 seconds in the -		 * future then round the timer to whole seconds -		 */ -		if (expires > 4*HZ) -			expires = round_jiffies_relative(expires); -		schedule_delayed_work(&dst_gc_work, expires); -	} - -	spin_unlock_bh(&dst_garbage.lock); -	mutex_unlock(&dst_gc_mutex); -} -  int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)  {  	kfree_skb(skb); @@ -151,13 +49,13 @@ int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)  }  EXPORT_SYMBOL(dst_discard_out); -const u32 dst_default_metrics[RTAX_MAX + 1] = { +const struct dst_metrics dst_default_metrics = {  	/* This initializer is needed to force linker to place this variable  	 * into const section. Otherwise it might end into bss section.  	 * We really want to avoid false sharing on this variable, and catch  	 * any writes on it.  	 */ -	[RTAX_MAX] = 0xdeadbeef, +	.refcnt = ATOMIC_INIT(1),  };  void dst_init(struct dst_entry *dst, struct dst_ops *ops, @@ -169,7 +67,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,  	if (dev)  		dev_hold(dev);  	dst->ops = ops; -	dst_init_metrics(dst, dst_default_metrics, true); +	dst_init_metrics(dst, dst_default_metrics.metrics, true);  	dst->expires = 0UL;  	dst->path = dst;  	dst->from = NULL; @@ -216,41 +114,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,  }  EXPORT_SYMBOL(dst_alloc); -static void ___dst_free(struct dst_entry *dst) -{ -	/* The first case (dev==NULL) is required, when -	   protocol module is unloaded. -	 */ -	if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { -		dst->input = dst_discard; -		dst->output = dst_discard_out; -	} -	dst->obsolete = DST_OBSOLETE_DEAD; -} - -void __dst_free(struct dst_entry *dst) -{ -	spin_lock_bh(&dst_garbage.lock); -	___dst_free(dst); -	dst->next = dst_garbage.list; -	dst_garbage.list = dst; -	if (dst_garbage.timer_inc > DST_GC_INC) { -		dst_garbage.timer_inc = DST_GC_INC; -		dst_garbage.timer_expires = DST_GC_MIN; -		mod_delayed_work(system_wq, &dst_gc_work, -				 dst_garbage.timer_expires); -	} -	spin_unlock_bh(&dst_garbage.lock); -} -EXPORT_SYMBOL(__dst_free); -  struct dst_entry *dst_destroy(struct dst_entry * dst)  {  	struct dst_entry *child;  	smp_rmb(); -again:  	child = dst->child;  	if (!(dst->flags & DST_NOCOUNT)) @@ -269,20 +138,8 @@ again:  		kmem_cache_free(dst->ops->kmem_cachep, dst);  	dst = child; -	if (dst) { -		int nohash = dst->flags & DST_NOHASH; - -		if (atomic_dec_and_test(&dst->__refcnt)) { -			/* We were real parent of this dst, so kill child. */ -			if (nohash) -				goto again; -		} else { -			/* Child is still referenced, return it for freeing. */ -			if (nohash) -				return dst; -			/* Child is still in his hash table */ -		} -	} +	if (dst) +		dst_release_immediate(dst);  	return NULL;  }  EXPORT_SYMBOL(dst_destroy); @@ -292,47 +149,88 @@ static void dst_destroy_rcu(struct rcu_head *head)  	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);  	dst = dst_destroy(dst); -	if (dst) -		__dst_free(dst);  } +/* Operations to mark dst as DEAD and clean up the net device referenced + * by dst: + * 1. 
put the dst under loopback interface and discard all tx/rx packets + *    on this route. + * 2. release the net_device + * This function should be called when removing routes from the fib tree + * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to + * make the next dst_ops->check() fail. + */ +void dst_dev_put(struct dst_entry *dst) +{ +	struct net_device *dev = dst->dev; + +	dst->obsolete = DST_OBSOLETE_DEAD; +	if (dst->ops->ifdown) +		dst->ops->ifdown(dst, dev, true); +	dst->input = dst_discard; +	dst->output = dst_discard_out; +	dst->dev = dev_net(dst->dev)->loopback_dev; +	dev_hold(dst->dev); +	dev_put(dev); +} +EXPORT_SYMBOL(dst_dev_put); +  void dst_release(struct dst_entry *dst)  {  	if (dst) {  		int newrefcnt; -		unsigned short nocache = dst->flags & DST_NOCACHE;  		newrefcnt = atomic_dec_return(&dst->__refcnt);  		if (unlikely(newrefcnt < 0))  			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",  					     __func__, dst, newrefcnt); -		if (!newrefcnt && unlikely(nocache)) +		if (!newrefcnt)  			call_rcu(&dst->rcu_head, dst_destroy_rcu);  	}  }  EXPORT_SYMBOL(dst_release); +void dst_release_immediate(struct dst_entry *dst) +{ +	if (dst) { +		int newrefcnt; + +		newrefcnt = atomic_dec_return(&dst->__refcnt); +		if (unlikely(newrefcnt < 0)) +			net_warn_ratelimited("%s: dst:%p refcnt:%d\n", +					     __func__, dst, newrefcnt); +		if (!newrefcnt) +			dst_destroy(dst); +	} +} +EXPORT_SYMBOL(dst_release_immediate); +  u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)  { -	u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); +	struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);  	if (p) { -		u32 *old_p = __DST_METRICS_PTR(old); +		struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);  		unsigned long prev, new; -		memcpy(p, old_p, sizeof(u32) * RTAX_MAX); +		atomic_set(&p->refcnt, 1); +		memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));  		new = (unsigned long) p;  		prev = cmpxchg(&dst->_metrics, old, new);  		if (prev != old) {  			kfree(p); -			p = __DST_METRICS_PTR(prev); +			p = (struct dst_metrics *)__DST_METRICS_PTR(prev);  			if (prev & DST_METRICS_READ_ONLY)  				p = NULL; +		} else if (prev & DST_METRICS_REFCOUNTED) { +			if (atomic_dec_and_test(&old_p->refcnt)) +				kfree(old_p);  		}  	} -	return p; +	BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); +	return (u32 *)p;  }  EXPORT_SYMBOL(dst_cow_metrics_generic); @@ -341,7 +239,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)  {  	unsigned long prev, new; -	new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; +	new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY;  	prev = cmpxchg(&dst->_metrics, old, new);  	if (prev == old)  		kfree(__DST_METRICS_PTR(old)); @@ -366,21 +264,25 @@ static int dst_md_discard(struct sk_buff *skb)  	return 0;  } -static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) +static void __metadata_dst_init(struct metadata_dst *md_dst, +				enum metadata_type type, u8 optslen) +  {  	struct dst_entry *dst;  	dst = &md_dst->dst;  	dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, -		 DST_METADATA | DST_NOCACHE | DST_NOCOUNT); +		 DST_METADATA | DST_NOCOUNT);  	dst->input = dst_md_discard;  	dst->output = dst_md_discard_out;  	memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); +	md_dst->type = type;  } -struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +struct metadata_dst *metadata_dst_alloc(u8 optslen, enum 
metadata_type type, +					gfp_t flags)  {  	struct metadata_dst *md_dst; @@ -388,7 +290,7 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)  	if (!md_dst)  		return NULL; -	__metadata_dst_init(md_dst, optslen); +	__metadata_dst_init(md_dst, type, optslen);  	return md_dst;  } @@ -402,7 +304,8 @@ void metadata_dst_free(struct metadata_dst *md_dst)  	kfree(md_dst);  } -struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) +struct metadata_dst __percpu * +metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)  {  	int cpu;  	struct metadata_dst __percpu *md_dst; @@ -413,77 +316,8 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)  		return NULL;  	for_each_possible_cpu(cpu) -		__metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen); +		__metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);  	return md_dst;  }  EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); - -/* Dirty hack. We did it in 2.2 (in __dst_free), - * we have _very_ good reasons not to repeat - * this mistake in 2.3, but we have no choice - * now. _It_ _is_ _explicit_ _deliberate_ - * _race_ _condition_. - * - * Commented and originally written by Alexey. - */ -static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, -		       int unregister) -{ -	if (dst->ops->ifdown) -		dst->ops->ifdown(dst, dev, unregister); - -	if (dev != dst->dev) -		return; - -	if (!unregister) { -		dst->input = dst_discard; -		dst->output = dst_discard_out; -	} else { -		dst->dev = dev_net(dst->dev)->loopback_dev; -		dev_hold(dst->dev); -		dev_put(dev); -	} -} - -static int dst_dev_event(struct notifier_block *this, unsigned long event, -			 void *ptr) -{ -	struct net_device *dev = netdev_notifier_info_to_dev(ptr); -	struct dst_entry *dst, *last = NULL; - -	switch (event) { -	case NETDEV_UNREGISTER_FINAL: -	case NETDEV_DOWN: -		mutex_lock(&dst_gc_mutex); -		for (dst = dst_busy_list; dst; dst = dst->next) { -			last = dst; -			dst_ifdown(dst, dev, event != NETDEV_DOWN); -		} - -		spin_lock_bh(&dst_garbage.lock); -		dst = dst_garbage.list; -		dst_garbage.list = NULL; -		spin_unlock_bh(&dst_garbage.lock); - -		if (last) -			last->next = dst; -		else -			dst_busy_list = dst; -		for (; dst; dst = dst->next) -			dst_ifdown(dst, dev, event != NETDEV_DOWN); -		mutex_unlock(&dst_gc_mutex); -		break; -	} -	return NOTIFY_DONE; -} - -static struct notifier_block dst_dev_notifier = { -	.notifier_call	= dst_dev_event, -	.priority = -10, /* must be called after other network notifiers */ -}; - -void __init dst_subsys_init(void) -{ -	register_netdevice_notifier(&dst_dev_notifier); -} diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 03111a2d6653..674b6c9cec18 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -2322,16 +2322,12 @@ static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr)  	ret = ethtool_tunable_valid(&tuna);  	if (ret)  		return ret; -	data = kmalloc(tuna.len, GFP_USER); -	if (!data) -		return -ENOMEM;  	useraddr += sizeof(tuna); -	ret = -EFAULT; -	if (copy_from_user(data, useraddr, tuna.len)) -		goto out; +	data = memdup_user(useraddr, tuna.len); +	if (IS_ERR(data)) +		return PTR_ERR(data);  	ret = ops->set_tunable(dev, &tuna, data); -out:  	kfree(data);  	return ret;  } @@ -2507,18 +2503,14 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr)  	ret = ethtool_phy_tunable_valid(&tuna);  	if (ret)  		return ret; -	data = kmalloc(tuna.len, GFP_USER); -	if (!data) -		return -ENOMEM;  	
useraddr += sizeof(tuna); -	ret = -EFAULT; -	if (copy_from_user(data, useraddr, tuna.len)) -		goto out; +	data = memdup_user(useraddr, tuna.len); +	if (IS_ERR(data)) +		return PTR_ERR(data);  	mutex_lock(&phydev->lock);  	ret = phydev->drv->set_tunable(phydev, &tuna, data);  	mutex_unlock(&phydev->lock); -out:  	kfree(data);  	return ret;  } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index f21c4d3aeae0..a0093e1b0235 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -46,7 +46,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,  	if (r == NULL)  		return -ENOMEM; -	atomic_set(&r->refcnt, 1); +	refcount_set(&r->refcnt, 1);  	r->action = FR_ACT_TO_TBL;  	r->pref = pref;  	r->table = table; @@ -283,7 +283,7 @@ jumped:  		if (err != -EAGAIN) {  			if ((arg->flags & FIB_LOOKUP_NOREF) || -			    likely(atomic_inc_not_zero(&rule->refcnt))) { +			    likely(refcount_inc_not_zero(&rule->refcnt))) {  				arg->rule = rule;  				goto out;  			} @@ -517,7 +517,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,  		last = r;  	} -	fib_rule_get(rule); +	refcount_set(&rule->refcnt, 1);  	if (last)  		list_add_rcu(&rule->list, &last->list); @@ -568,7 +568,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,  	struct net *net = sock_net(skb->sk);  	struct fib_rule_hdr *frh = nlmsg_data(nlh);  	struct fib_rules_ops *ops = NULL; -	struct fib_rule *rule, *tmp; +	struct fib_rule *rule, *r;  	struct nlattr *tb[FRA_MAX+1];  	struct fib_kuid_range range;  	int err = -EINVAL; @@ -668,16 +668,23 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,  		/*  		 * Check if this rule is a target to any of them. If so, +		 * adjust to the next one with the same preference or  		 * disable them. As this operation is eventually very -		 * expensive, it is only performed if goto rules have -		 * actually been added. +		 * expensive, it is only performed if goto rules, except +		 * current if it is goto rule, have actually been added.  		 
*/  		if (ops->nr_goto_rules > 0) { -			list_for_each_entry(tmp, &ops->rules_list, list) { -				if (rtnl_dereference(tmp->ctarget) == rule) { -					RCU_INIT_POINTER(tmp->ctarget, NULL); +			struct fib_rule *n; + +			n = list_next_entry(rule, list); +			if (&n->list == &ops->rules_list || n->pref != rule->pref) +				n = NULL; +			list_for_each_entry(r, &ops->rules_list, list) { +				if (rtnl_dereference(r->ctarget) != rule) +					continue; +				rcu_assign_pointer(r->ctarget, n); +				if (!n)  					ops->unresolved_rules++; -				}  			}  		} diff --git a/net/core/filter.c b/net/core/filter.c index a253a6197e6b..c7f737058d89 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -54,6 +54,7 @@  #include <net/dst.h>  #include <net/sock_reuseport.h>  #include <net/busy_poll.h> +#include <net/tcp.h>  /**   *	sk_filter_trim_cap - run a packet through a socket filter @@ -352,7 +353,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,   *	bpf_convert_filter - convert filter program   *	@prog: the user passed filter program   *	@len: the length of the user passed filter program - *	@new_prog: buffer where converted program will be stored + *	@new_prog: allocated 'struct bpf_prog' or NULL   *	@new_len: pointer to store length of converted program   *   * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' @@ -364,14 +365,13 @@ static bool convert_bpf_extensions(struct sock_filter *fp,   *   * 2) 2nd pass to remap in two passes: 1st pass finds new   *    jump offsets, 2nd pass remapping: - *   new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);   *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len);   */  static int bpf_convert_filter(struct sock_filter *prog, int len, -			      struct bpf_insn *new_prog, int *new_len) +			      struct bpf_prog *new_prog, int *new_len)  { -	int new_flen = 0, pass = 0, target, i; -	struct bpf_insn *new_insn; +	int new_flen = 0, pass = 0, target, i, stack_off; +	struct bpf_insn *new_insn, *first_insn = NULL;  	struct sock_filter *fp;  	int *addrs = NULL;  	u8 bpf_src; @@ -383,6 +383,7 @@ static int bpf_convert_filter(struct sock_filter *prog, int len,  		return -EINVAL;  	if (new_prog) { +		first_insn = new_prog->insnsi;  		addrs = kcalloc(len, sizeof(*addrs),  				GFP_KERNEL | __GFP_NOWARN);  		if (!addrs) @@ -390,11 +391,11 @@ static int bpf_convert_filter(struct sock_filter *prog, int len,  	}  do_pass: -	new_insn = new_prog; +	new_insn = first_insn;  	fp = prog;  	/* Classic BPF related prologue emission. */ -	if (new_insn) { +	if (new_prog) {  		/* Classic BPF expects A and X to be reset first. These need  		 * to be guaranteed to be the first two instructions.  		 */ @@ -415,7 +416,7 @@ do_pass:  		struct bpf_insn *insn = tmp_insns;  		if (addrs) -			addrs[i] = new_insn - new_prog; +			addrs[i] = new_insn - first_insn;  		switch (fp->code) {  		/* All arithmetic insns and skb loads map as-is. */ @@ -561,17 +562,25 @@ do_pass:  		/* Store to stack. */  		case BPF_ST:  		case BPF_STX: +			stack_off = fp->k * 4  + 4;  			*insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==  					    BPF_ST ? BPF_REG_A : BPF_REG_X, -					    -(BPF_MEMWORDS - fp->k) * 4); +					    -stack_off); +			/* check_load_and_stores() verifies that classic BPF can +			 * load from stack only after write, so tracking +			 * stack_depth for ST|STX insns is enough +			 */ +			if (new_prog && new_prog->aux->stack_depth < stack_off) +				new_prog->aux->stack_depth = stack_off;  			break;  		/* Load from stack. 
*/  		case BPF_LD | BPF_MEM:  		case BPF_LDX | BPF_MEM: +			stack_off = fp->k * 4  + 4;  			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ?  					    BPF_REG_A : BPF_REG_X, BPF_REG_FP, -					    -(BPF_MEMWORDS - fp->k) * 4); +					    -stack_off);  			break;  		/* A = K or X = K */ @@ -619,13 +628,13 @@ do_pass:  	if (!new_prog) {  		/* Only calculating new length. */ -		*new_len = new_insn - new_prog; +		*new_len = new_insn - first_insn;  		return 0;  	}  	pass++; -	if (new_flen != new_insn - new_prog) { -		new_flen = new_insn - new_prog; +	if (new_flen != new_insn - first_insn) { +		new_flen = new_insn - first_insn;  		if (pass > 2)  			goto err;  		goto do_pass; @@ -1017,7 +1026,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)  	fp->len = new_len;  	/* 2nd pass: remap sock_filter insns into bpf_insn insns. */ -	err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); +	err = bpf_convert_filter(old_prog, old_len, fp, &new_len);  	if (err)  		/* 2nd bpf_convert_filter() can fail only if it fails  		 * to allocate memory, remapping must succeed. Note, @@ -1866,6 +1875,24 @@ static const struct bpf_func_proto bpf_set_hash_invalid_proto = {  	.arg1_type	= ARG_PTR_TO_CTX,  }; +BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) +{ +	/* Set user specified hash as L4(+), so that it gets returned +	 * on skb_get_hash() call unless BPF prog later on triggers a +	 * skb_clear_hash(). +	 */ +	__skb_set_sw_hash(skb, hash, true); +	return 0; +} + +static const struct bpf_func_proto bpf_set_hash_proto = { +	.func		= bpf_set_hash, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_ANYTHING, +}; +  BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,  	   u16, vlan_tci)  { @@ -1985,7 +2012,7 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)  static int bpf_skb_proto_4_to_6(struct sk_buff *skb)  {  	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); -	u32 off = skb->network_header - skb->mac_header; +	u32 off = skb_mac_header_len(skb);  	int ret;  	ret = skb_cow(skb, len_diff); @@ -2021,7 +2048,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)  static int bpf_skb_proto_6_to_4(struct sk_buff *skb)  {  	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); -	u32 off = skb->network_header - skb->mac_header; +	u32 off = skb_mac_header_len(skb);  	int ret;  	ret = skb_unclone(skb, GFP_ATOMIC); @@ -2127,6 +2154,124 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = {  	.arg2_type	= ARG_ANYTHING,  }; +static u32 bpf_skb_net_base_len(const struct sk_buff *skb) +{ +	switch (skb->protocol) { +	case htons(ETH_P_IP): +		return sizeof(struct iphdr); +	case htons(ETH_P_IPV6): +		return sizeof(struct ipv6hdr); +	default: +		return ~0U; +	} +} + +static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) +{ +	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); +	int ret; + +	ret = skb_cow(skb, len_diff); +	if (unlikely(ret < 0)) +		return ret; + +	ret = bpf_skb_net_hdr_push(skb, off, len_diff); +	if (unlikely(ret < 0)) +		return ret; + +	if (skb_is_gso(skb)) { +		/* Due to header grow, MSS needs to be downgraded. */ +		skb_shinfo(skb)->gso_size -= len_diff; +		/* Header must be checked, and gso_segs recomputed. 
*/ +		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; +		skb_shinfo(skb)->gso_segs = 0; +	} + +	return 0; +} + +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) +{ +	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); +	int ret; + +	ret = skb_unclone(skb, GFP_ATOMIC); +	if (unlikely(ret < 0)) +		return ret; + +	ret = bpf_skb_net_hdr_pop(skb, off, len_diff); +	if (unlikely(ret < 0)) +		return ret; + +	if (skb_is_gso(skb)) { +		/* Due to header shrink, MSS can be upgraded. */ +		skb_shinfo(skb)->gso_size += len_diff; +		/* Header must be checked, and gso_segs recomputed. */ +		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; +		skb_shinfo(skb)->gso_segs = 0; +	} + +	return 0; +} + +static u32 __bpf_skb_max_len(const struct sk_buff *skb) +{ +	return skb->dev->mtu + skb->dev->hard_header_len; +} + +static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) +{ +	bool trans_same = skb->transport_header == skb->network_header; +	u32 len_cur, len_diff_abs = abs(len_diff); +	u32 len_min = bpf_skb_net_base_len(skb); +	u32 len_max = __bpf_skb_max_len(skb); +	__be16 proto = skb->protocol; +	bool shrink = len_diff < 0; +	int ret; + +	if (unlikely(len_diff_abs > 0xfffU)) +		return -EFAULT; +	if (unlikely(proto != htons(ETH_P_IP) && +		     proto != htons(ETH_P_IPV6))) +		return -ENOTSUPP; + +	len_cur = skb->len - skb_network_offset(skb); +	if (skb_transport_header_was_set(skb) && !trans_same) +		len_cur = skb_network_header_len(skb); +	if ((shrink && (len_diff_abs >= len_cur || +			len_cur - len_diff_abs < len_min)) || +	    (!shrink && (skb->len + len_diff_abs > len_max && +			 !skb_is_gso(skb)))) +		return -ENOTSUPP; + +	ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) : +		       bpf_skb_net_grow(skb, len_diff_abs); + +	bpf_compute_data_end(skb); +	return 0; +} + +BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, +	   u32, mode, u64, flags) +{ +	if (unlikely(flags)) +		return -EINVAL; +	if (likely(mode == BPF_ADJ_ROOM_NET)) +		return bpf_skb_adjust_net(skb, len_diff); + +	return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_skb_adjust_room_proto = { +	.func		= bpf_skb_adjust_room, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_ANYTHING, +	.arg3_type	= ARG_ANYTHING, +	.arg4_type	= ARG_ANYTHING, +}; +  static u32 __bpf_skb_min_len(const struct sk_buff *skb)  {  	u32 min_len = skb_network_offset(skb); @@ -2139,11 +2284,6 @@ static u32 __bpf_skb_min_len(const struct sk_buff *skb)  	return min_len;  } -static u32 __bpf_skb_max_len(const struct sk_buff *skb) -{ -	return skb->dev->mtu + skb->dev->hard_header_len; -} -  static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)  {  	unsigned int old_len = skb->len; @@ -2280,7 +2420,9 @@ bool bpf_helper_changes_pkt_data(void *func)  	    func == bpf_skb_change_proto ||  	    func == bpf_skb_change_head ||  	    func == bpf_skb_change_tail || +	    func == bpf_skb_adjust_room ||  	    func == bpf_skb_pull_data || +	    func == bpf_clone_redirect ||  	    func == bpf_l3_csum_replace ||  	    func == bpf_l4_csum_replace ||  	    func == bpf_xdp_adjust_head) @@ -2538,6 +2680,7 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)  		 * that is holding verifier mutex.  		 
*/  		md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, +						   METADATA_IP_TUNNEL,  						   GFP_KERNEL);  		if (!md_dst)  			return NULL; @@ -2644,6 +2787,110 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {  	.arg1_type      = ARG_PTR_TO_CTX,  }; +BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, +	   int, level, int, optname, char *, optval, int, optlen) +{ +	struct sock *sk = bpf_sock->sk; +	int ret = 0; +	int val; + +	if (!sk_fullsock(sk)) +		return -EINVAL; + +	if (level == SOL_SOCKET) { +		if (optlen != sizeof(int)) +			return -EINVAL; +		val = *((int *)optval); + +		/* Only some socketops are supported */ +		switch (optname) { +		case SO_RCVBUF: +			sk->sk_userlocks |= SOCK_RCVBUF_LOCK; +			sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); +			break; +		case SO_SNDBUF: +			sk->sk_userlocks |= SOCK_SNDBUF_LOCK; +			sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); +			break; +		case SO_MAX_PACING_RATE: +			sk->sk_max_pacing_rate = val; +			sk->sk_pacing_rate = min(sk->sk_pacing_rate, +						 sk->sk_max_pacing_rate); +			break; +		case SO_PRIORITY: +			sk->sk_priority = val; +			break; +		case SO_RCVLOWAT: +			if (val < 0) +				val = INT_MAX; +			sk->sk_rcvlowat = val ? : 1; +			break; +		case SO_MARK: +			sk->sk_mark = val; +			break; +		default: +			ret = -EINVAL; +		} +#ifdef CONFIG_INET +	} else if (level == SOL_TCP && +		   sk->sk_prot->setsockopt == tcp_setsockopt) { +		if (optname == TCP_CONGESTION) { +			char name[TCP_CA_NAME_MAX]; + +			strncpy(name, optval, min_t(long, optlen, +						    TCP_CA_NAME_MAX-1)); +			name[TCP_CA_NAME_MAX-1] = 0; +			ret = tcp_set_congestion_control(sk, name, false); +			if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) +				/* replacing an existing ca */ +				tcp_reinit_congestion_control(sk, +					inet_csk(sk)->icsk_ca_ops); +		} else { +			struct tcp_sock *tp = tcp_sk(sk); + +			if (optlen != sizeof(int)) +				return -EINVAL; + +			val = *((int *)optval); +			/* Only some options are supported */ +			switch (optname) { +			case TCP_BPF_IW: +				if (val <= 0 || tp->data_segs_out > 0) +					ret = -EINVAL; +				else +					tp->snd_cwnd = val; +				break; +			case TCP_BPF_SNDCWND_CLAMP: +				if (val <= 0) { +					ret = -EINVAL; +				} else { +					tp->snd_cwnd_clamp = val; +					tp->snd_ssthresh = val; +				} +				break; +			default: +				ret = -EINVAL; +			} +		} +		ret = -EINVAL; +#endif +	} else { +		ret = -EINVAL; +	} +	return ret; +} + +static const struct bpf_func_proto bpf_setsockopt_proto = { +	.func		= bpf_setsockopt, +	.gpl_only	= true, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_ANYTHING, +	.arg3_type	= ARG_ANYTHING, +	.arg4_type	= ARG_PTR_TO_MEM, +	.arg5_type	= ARG_CONST_SIZE, +}; +  static const struct bpf_func_proto *  bpf_base_func_proto(enum bpf_func_id func_id)  { @@ -2717,6 +2964,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)  		return &bpf_skb_change_proto_proto;  	case BPF_FUNC_skb_change_type:  		return &bpf_skb_change_type_proto; +	case BPF_FUNC_skb_adjust_room: +		return &bpf_skb_adjust_room_proto;  	case BPF_FUNC_skb_change_tail:  		return &bpf_skb_change_tail_proto;  	case BPF_FUNC_skb_get_tunnel_key: @@ -2735,6 +2984,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)  		return &bpf_get_hash_recalc_proto;  	case BPF_FUNC_set_hash_invalid:  		return &bpf_set_hash_invalid_proto; +	case BPF_FUNC_set_hash: +		return &bpf_set_hash_proto;  	case BPF_FUNC_perf_event_output:  		return &bpf_skb_event_output_proto;  	case 
BPF_FUNC_get_smp_processor_id: @@ -2766,12 +3017,6 @@ xdp_func_proto(enum bpf_func_id func_id)  }  static const struct bpf_func_proto * -cg_skb_func_proto(enum bpf_func_id func_id) -{ -	return sk_filter_func_proto(func_id); -} - -static const struct bpf_func_proto *  lwt_inout_func_proto(enum bpf_func_id func_id)  {  	switch (func_id) { @@ -2799,6 +3044,17 @@ lwt_inout_func_proto(enum bpf_func_id func_id)  }  static const struct bpf_func_proto * +	sock_ops_func_proto(enum bpf_func_id func_id) +{ +	switch (func_id) { +	case BPF_FUNC_setsockopt: +		return &bpf_setsockopt_proto; +	default: +		return bpf_base_func_proto(func_id); +	} +} + +static const struct bpf_func_proto *  lwt_xmit_func_proto(enum bpf_func_id func_id)  {  	switch (func_id) { @@ -2833,8 +3089,11 @@ lwt_xmit_func_proto(enum bpf_func_id func_id)  	}  } -static bool __is_valid_access(int off, int size) +static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, +				    struct bpf_insn_access_aux *info)  { +	const int size_default = sizeof(__u32); +  	if (off < 0 || off >= sizeof(struct __sk_buff))  		return false; @@ -2843,15 +3102,25 @@ static bool __is_valid_access(int off, int size)  		return false;  	switch (off) { -	case offsetof(struct __sk_buff, cb[0]) ... -	     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: -		if (off + size > -		    offsetof(struct __sk_buff, cb[4]) + sizeof(__u32)) +	case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): +		if (off + size > offsetofend(struct __sk_buff, cb[4]))  			return false;  		break; -	default: -		if (size != sizeof(__u32)) +	case bpf_ctx_range(struct __sk_buff, data): +	case bpf_ctx_range(struct __sk_buff, data_end): +		if (size != size_default)  			return false; +		break; +	default: +		/* Only narrow read access allowed for now. */ +		if (type == BPF_WRITE) { +			if (size != size_default) +				return false; +		} else { +			bpf_ctx_record_field_size(info, size_default); +			if (!bpf_ctx_narrow_access_ok(off, size, size_default)) +				return false; +		}  	}  	return true; @@ -2859,43 +3128,41 @@ static bool __is_valid_access(int off, int size)  static bool sk_filter_is_valid_access(int off, int size,  				      enum bpf_access_type type, -				      enum bpf_reg_type *reg_type) +				      struct bpf_insn_access_aux *info)  {  	switch (off) { -	case offsetof(struct __sk_buff, tc_classid): -	case offsetof(struct __sk_buff, data): -	case offsetof(struct __sk_buff, data_end): +	case bpf_ctx_range(struct __sk_buff, tc_classid): +	case bpf_ctx_range(struct __sk_buff, data): +	case bpf_ctx_range(struct __sk_buff, data_end):  		return false;  	}  	if (type == BPF_WRITE) {  		switch (off) { -		case offsetof(struct __sk_buff, cb[0]) ... -		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: +		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):  			break;  		default:  			return false;  		}  	} -	return __is_valid_access(off, size); +	return bpf_skb_is_valid_access(off, size, type, info);  }  static bool lwt_is_valid_access(int off, int size,  				enum bpf_access_type type, -				enum bpf_reg_type *reg_type) +				struct bpf_insn_access_aux *info)  {  	switch (off) { -	case offsetof(struct __sk_buff, tc_classid): +	case bpf_ctx_range(struct __sk_buff, tc_classid):  		return false;  	}  	if (type == BPF_WRITE) {  		switch (off) { -		case offsetof(struct __sk_buff, mark): -		case offsetof(struct __sk_buff, priority): -		case offsetof(struct __sk_buff, cb[0]) ... 
-		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: +		case bpf_ctx_range(struct __sk_buff, mark): +		case bpf_ctx_range(struct __sk_buff, priority): +		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):  			break;  		default:  			return false; @@ -2903,20 +3170,20 @@ static bool lwt_is_valid_access(int off, int size,  	}  	switch (off) { -	case offsetof(struct __sk_buff, data): -		*reg_type = PTR_TO_PACKET; +	case bpf_ctx_range(struct __sk_buff, data): +		info->reg_type = PTR_TO_PACKET;  		break; -	case offsetof(struct __sk_buff, data_end): -		*reg_type = PTR_TO_PACKET_END; +	case bpf_ctx_range(struct __sk_buff, data_end): +		info->reg_type = PTR_TO_PACKET_END;  		break;  	} -	return __is_valid_access(off, size); +	return bpf_skb_is_valid_access(off, size, type, info);  }  static bool sock_filter_is_valid_access(int off, int size,  					enum bpf_access_type type, -					enum bpf_reg_type *reg_type) +					struct bpf_insn_access_aux *info)  {  	if (type == BPF_WRITE) {  		switch (off) { @@ -2979,16 +3246,15 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,  static bool tc_cls_act_is_valid_access(int off, int size,  				       enum bpf_access_type type, -				       enum bpf_reg_type *reg_type) +				       struct bpf_insn_access_aux *info)  {  	if (type == BPF_WRITE) {  		switch (off) { -		case offsetof(struct __sk_buff, mark): -		case offsetof(struct __sk_buff, tc_index): -		case offsetof(struct __sk_buff, priority): -		case offsetof(struct __sk_buff, cb[0]) ... -		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: -		case offsetof(struct __sk_buff, tc_classid): +		case bpf_ctx_range(struct __sk_buff, mark): +		case bpf_ctx_range(struct __sk_buff, tc_index): +		case bpf_ctx_range(struct __sk_buff, priority): +		case bpf_ctx_range(struct __sk_buff, tc_classid): +		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):  			break;  		default:  			return false; @@ -2996,15 +3262,15 @@ static bool tc_cls_act_is_valid_access(int off, int size,  	}  	switch (off) { -	case offsetof(struct __sk_buff, data): -		*reg_type = PTR_TO_PACKET; +	case bpf_ctx_range(struct __sk_buff, data): +		info->reg_type = PTR_TO_PACKET;  		break; -	case offsetof(struct __sk_buff, data_end): -		*reg_type = PTR_TO_PACKET_END; +	case bpf_ctx_range(struct __sk_buff, data_end): +		info->reg_type = PTR_TO_PACKET_END;  		break;  	} -	return __is_valid_access(off, size); +	return bpf_skb_is_valid_access(off, size, type, info);  }  static bool __is_valid_xdp_access(int off, int size) @@ -3021,17 +3287,17 @@ static bool __is_valid_xdp_access(int off, int size)  static bool xdp_is_valid_access(int off, int size,  				enum bpf_access_type type, -				enum bpf_reg_type *reg_type) +				struct bpf_insn_access_aux *info)  {  	if (type == BPF_WRITE)  		return false;  	switch (off) {  	case offsetof(struct xdp_md, data): -		*reg_type = PTR_TO_PACKET; +		info->reg_type = PTR_TO_PACKET;  		break;  	case offsetof(struct xdp_md, data_end): -		*reg_type = PTR_TO_PACKET_END; +		info->reg_type = PTR_TO_PACKET_END;  		break;  	} @@ -3044,101 +3310,141 @@ void bpf_warn_invalid_xdp_action(u32 act)  }  EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static bool __is_valid_sock_ops_access(int off, int size) +{ +	if (off < 0 || off >= sizeof(struct bpf_sock_ops)) +		return false; +	/* The verifier guarantees that size > 0. 
*/ +	if (off % size != 0) +		return false; +	if (size != sizeof(__u32)) +		return false; + +	return true; +} + +static bool sock_ops_is_valid_access(int off, int size, +				     enum bpf_access_type type, +				     struct bpf_insn_access_aux *info) +{ +	if (type == BPF_WRITE) { +		switch (off) { +		case offsetof(struct bpf_sock_ops, op) ... +		     offsetof(struct bpf_sock_ops, replylong[3]): +			break; +		default: +			return false; +		} +	} + +	return __is_valid_sock_ops_access(off, size); +} +  static u32 bpf_convert_ctx_access(enum bpf_access_type type,  				  const struct bpf_insn *si,  				  struct bpf_insn *insn_buf, -				  struct bpf_prog *prog) +				  struct bpf_prog *prog, u32 *target_size)  {  	struct bpf_insn *insn = insn_buf;  	int off;  	switch (si->off) {  	case offsetof(struct __sk_buff, len): -		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); -  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, -				      offsetof(struct sk_buff, len)); +				      bpf_target_off(struct sk_buff, len, 4, +						     target_size));  		break;  	case offsetof(struct __sk_buff, protocol): -		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); -  		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, -				      offsetof(struct sk_buff, protocol)); +				      bpf_target_off(struct sk_buff, protocol, 2, +						     target_size));  		break;  	case offsetof(struct __sk_buff, vlan_proto): -		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); -  		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, -				      offsetof(struct sk_buff, vlan_proto)); +				      bpf_target_off(struct sk_buff, vlan_proto, 2, +						     target_size));  		break;  	case offsetof(struct __sk_buff, priority): -		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); -  		if (type == BPF_WRITE)  			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, -					      offsetof(struct sk_buff, priority)); +					      bpf_target_off(struct sk_buff, priority, 4, +							     target_size));  		else  			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, -					      offsetof(struct sk_buff, priority)); +					      bpf_target_off(struct sk_buff, priority, 4, +							     target_size));  		break;  	case offsetof(struct __sk_buff, ingress_ifindex): -		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); -  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, -				      offsetof(struct sk_buff, skb_iif)); +				      bpf_target_off(struct sk_buff, skb_iif, 4, +						     target_size));  		break;  	case offsetof(struct __sk_buff, ifindex): -		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); -  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),  				      si->dst_reg, si->src_reg,  				      offsetof(struct sk_buff, dev));  		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, -				      offsetof(struct net_device, ifindex)); +				      bpf_target_off(struct net_device, ifindex, 4, +						     target_size));  		break;  	case offsetof(struct __sk_buff, hash): -		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); -  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, -				      offsetof(struct sk_buff, hash)); +				      bpf_target_off(struct sk_buff, hash, 4, +						     target_size));  		break;  	case offsetof(struct __sk_buff, mark): -		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); -  		if (type == BPF_WRITE)  			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, -					      offsetof(struct 
sk_buff, mark)); +					      bpf_target_off(struct sk_buff, mark, 4, +							     target_size));  		else  			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, -					      offsetof(struct sk_buff, mark)); +					      bpf_target_off(struct sk_buff, mark, 4, +							     target_size));  		break;  	case offsetof(struct __sk_buff, pkt_type): -		return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg, -					  si->src_reg, insn); +		*target_size = 1; +		*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, +				      PKT_TYPE_OFFSET()); +		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD +		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); +#endif +		break;  	case offsetof(struct __sk_buff, queue_mapping): -		return convert_skb_access(SKF_AD_QUEUE, si->dst_reg, -					  si->src_reg, insn); +		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, +				      bpf_target_off(struct sk_buff, queue_mapping, 2, +						     target_size)); +		break;  	case offsetof(struct __sk_buff, vlan_present): -		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, -					  si->dst_reg, si->src_reg, insn); -  	case offsetof(struct __sk_buff, vlan_tci): -		return convert_skb_access(SKF_AD_VLAN_TAG, -					  si->dst_reg, si->src_reg, insn); +		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + +		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, +				      bpf_target_off(struct sk_buff, vlan_tci, 2, +						     target_size)); +		if (si->off == offsetof(struct __sk_buff, vlan_tci)) { +			*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, +						~VLAN_TAG_PRESENT); +		} else { +			*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); +			*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); +		} +		break;  	case offsetof(struct __sk_buff, cb[0]) ... -	     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: +	     offsetofend(struct __sk_buff, cb[4]) - 1:  		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);  		BUILD_BUG_ON((offsetof(struct sk_buff, cb) +  			      offsetof(struct qdisc_skb_cb, data)) % @@ -3164,6 +3470,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,  		off -= offsetof(struct __sk_buff, tc_classid);  		off += offsetof(struct sk_buff, cb);  		off += offsetof(struct qdisc_skb_cb, tc_classid); +		*target_size = 2;  		if (type == BPF_WRITE)  			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,  					      si->src_reg, off); @@ -3189,14 +3496,14 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,  	case offsetof(struct __sk_buff, tc_index):  #ifdef CONFIG_NET_SCHED -		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); -  		if (type == BPF_WRITE)  			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, -					      offsetof(struct sk_buff, tc_index)); +					      bpf_target_off(struct sk_buff, tc_index, 2, +							     target_size));  		else  			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, -					      offsetof(struct sk_buff, tc_index)); +					      bpf_target_off(struct sk_buff, tc_index, 2, +							     target_size));  #else  		if (type == BPF_WRITE)  			*insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); @@ -3207,10 +3514,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,  	case offsetof(struct __sk_buff, napi_id):  #if defined(CONFIG_NET_RX_BUSY_POLL) -		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, napi_id) != 4); -  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, -				      offsetof(struct sk_buff, napi_id)); +				      bpf_target_off(struct sk_buff, napi_id, 4, +						     target_size));  		*insn++ = 
BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);  		*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);  #else @@ -3225,7 +3531,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,  static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,  					  const struct bpf_insn *si,  					  struct bpf_insn *insn_buf, -					  struct bpf_prog *prog) +					  struct bpf_prog *prog, u32 *target_size)  {  	struct bpf_insn *insn = insn_buf; @@ -3269,22 +3575,22 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,  static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,  					 const struct bpf_insn *si,  					 struct bpf_insn *insn_buf, -					 struct bpf_prog *prog) +					 struct bpf_prog *prog, u32 *target_size)  {  	struct bpf_insn *insn = insn_buf;  	switch (si->off) {  	case offsetof(struct __sk_buff, ifindex): -		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); -  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),  				      si->dst_reg, si->src_reg,  				      offsetof(struct sk_buff, dev));  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, -				      offsetof(struct net_device, ifindex)); +				      bpf_target_off(struct net_device, ifindex, 4, +						     target_size));  		break;  	default: -		return bpf_convert_ctx_access(type, si, insn_buf, prog); +		return bpf_convert_ctx_access(type, si, insn_buf, prog, +					      target_size);  	}  	return insn - insn_buf; @@ -3293,7 +3599,7 @@ static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,  static u32 xdp_convert_ctx_access(enum bpf_access_type type,  				  const struct bpf_insn *si,  				  struct bpf_insn *insn_buf, -				  struct bpf_prog *prog) +				  struct bpf_prog *prog, u32 *target_size)  {  	struct bpf_insn *insn = insn_buf; @@ -3313,6 +3619,139 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,  	return insn - insn_buf;  } +static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, +				       const struct bpf_insn *si, +				       struct bpf_insn *insn_buf, +				       struct bpf_prog *prog, +				       u32 *target_size) +{ +	struct bpf_insn *insn = insn_buf; +	int off; + +	switch (si->off) { +	case offsetof(struct bpf_sock_ops, op) ... 
+	     offsetof(struct bpf_sock_ops, replylong[3]): +		BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) != +			     FIELD_SIZEOF(struct bpf_sock_ops_kern, op)); +		BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) != +			     FIELD_SIZEOF(struct bpf_sock_ops_kern, reply)); +		BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) != +			     FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong)); +		off = si->off; +		off -= offsetof(struct bpf_sock_ops, op); +		off += offsetof(struct bpf_sock_ops_kern, op); +		if (type == BPF_WRITE) +			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, +					      off); +		else +			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, +					      off); +		break; + +	case offsetof(struct bpf_sock_ops, family): +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +					      struct bpf_sock_ops_kern, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct bpf_sock_ops_kern, sk)); +		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, skc_family)); +		break; + +	case offsetof(struct bpf_sock_ops, remote_ip4): +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +						struct bpf_sock_ops_kern, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct bpf_sock_ops_kern, sk)); +		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, skc_daddr)); +		break; + +	case offsetof(struct bpf_sock_ops, local_ip4): +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); + +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +					      struct bpf_sock_ops_kern, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct bpf_sock_ops_kern, sk)); +		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, +					       skc_rcv_saddr)); +		break; + +	case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... +	     offsetof(struct bpf_sock_ops, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, +					  skc_v6_daddr.s6_addr32[0]) != 4); + +		off = si->off; +		off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +						struct bpf_sock_ops_kern, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct bpf_sock_ops_kern, sk)); +		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, +					       skc_v6_daddr.s6_addr32[0]) + +				      off); +#else +		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif +		break; + +	case offsetof(struct bpf_sock_ops, local_ip6[0]) ... 
+	     offsetof(struct bpf_sock_ops, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, +					  skc_v6_rcv_saddr.s6_addr32[0]) != 4); + +		off = si->off; +		off -= offsetof(struct bpf_sock_ops, local_ip6[0]); +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +						struct bpf_sock_ops_kern, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct bpf_sock_ops_kern, sk)); +		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, +					       skc_v6_rcv_saddr.s6_addr32[0]) + +				      off); +#else +		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif +		break; + +	case offsetof(struct bpf_sock_ops, remote_port): +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +						struct bpf_sock_ops_kern, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct bpf_sock_ops_kern, sk)); +		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD +		*insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif +		break; + +	case offsetof(struct bpf_sock_ops, local_port): +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +						struct bpf_sock_ops_kern, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct bpf_sock_ops_kern, sk)); +		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, skc_num)); +		break; +	} +	return insn - insn_buf; +} +  const struct bpf_verifier_ops sk_filter_prog_ops = {  	.get_func_proto		= sk_filter_func_proto,  	.is_valid_access	= sk_filter_is_valid_access, @@ -3335,7 +3774,7 @@ const struct bpf_verifier_ops xdp_prog_ops = {  };  const struct bpf_verifier_ops cg_skb_prog_ops = { -	.get_func_proto		= cg_skb_func_proto, +	.get_func_proto		= sk_filter_func_proto,  	.is_valid_access	= sk_filter_is_valid_access,  	.convert_ctx_access	= bpf_convert_ctx_access,  	.test_run		= bpf_prog_test_run_skb, @@ -3362,6 +3801,12 @@ const struct bpf_verifier_ops cg_sock_prog_ops = {  	.convert_ctx_access	= sock_filter_convert_ctx_access,  }; +const struct bpf_verifier_ops sock_ops_prog_ops = { +	.get_func_proto		= sock_ops_func_proto, +	.is_valid_access	= sock_ops_is_valid_access, +	.convert_ctx_access	= sock_ops_convert_ctx_access, +}; +  int sk_detach_filter(struct sock *sk)  {  	int ret = -ENOENT; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 28d94bce4df8..fc5fc4594c90 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -18,6 +18,7 @@  #include <linux/stddef.h>  #include <linux/if_ether.h>  #include <linux/mpls.h> +#include <linux/tcp.h>  #include <net/flow_dissector.h>  #include <scsi/fc/fc_fcoe.h> @@ -342,6 +343,64 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,  	return FLOW_DISSECT_RET_OUT_PROTO_AGAIN;  } +static void +__skb_flow_dissect_tcp(const struct sk_buff *skb, +		       struct flow_dissector *flow_dissector, +		       void *target_container, void *data, int thoff, int hlen) +{ +	struct flow_dissector_key_tcp *key_tcp; +	struct tcphdr *th, _th; + +	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP)) +		return; + +	th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th); +	if (!th) +		return; + +	if (unlikely(__tcp_hdrlen(th) < sizeof(_th))) +		return; + +	key_tcp = skb_flow_dissector_target(flow_dissector, +					    FLOW_DISSECTOR_KEY_TCP, +					    
target_container); +	key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); +} + +static void +__skb_flow_dissect_ipv4(const struct sk_buff *skb, +			struct flow_dissector *flow_dissector, +			void *target_container, void *data, const struct iphdr *iph) +{ +	struct flow_dissector_key_ip *key_ip; + +	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) +		return; + +	key_ip = skb_flow_dissector_target(flow_dissector, +					   FLOW_DISSECTOR_KEY_IP, +					   target_container); +	key_ip->tos = iph->tos; +	key_ip->ttl = iph->ttl; +} + +static void +__skb_flow_dissect_ipv6(const struct sk_buff *skb, +			struct flow_dissector *flow_dissector, +			void *target_container, void *data, const struct ipv6hdr *iph) +{ +	struct flow_dissector_key_ip *key_ip; + +	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) +		return; + +	key_ip = skb_flow_dissector_target(flow_dissector, +					   FLOW_DISSECTOR_KEY_IP, +					   target_container); +	key_ip->tos = ipv6_get_dsfield(iph); +	key_ip->ttl = iph->hop_limit; +} +  /**   * __skb_flow_dissect - extract the flow_keys struct and return it   * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -444,6 +503,9 @@ ip:  			}  		} +		__skb_flow_dissect_ipv4(skb, flow_dissector, +					target_container, data, iph); +  		if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)  			goto out_good; @@ -489,6 +551,9 @@ ipv6:  				goto out_good;  		} +		__skb_flow_dissect_ipv6(skb, flow_dissector, +					target_container, data, iph); +  		if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)  			goto out_good; @@ -683,6 +748,10 @@ ip_proto_again:  	case IPPROTO_MPLS:  		proto = htons(ETH_P_MPLS_UC);  		goto mpls; +	case IPPROTO_TCP: +		__skb_flow_dissect_tcp(skb, flow_dissector, target_container, +				       data, nhoff, hlen); +		break;  	default:  		break;  	} diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index b3bc0a31af9f..1307731ddfe4 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -240,7 +240,8 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {  static int bpf_build_state(struct nlattr *nla,  			   unsigned int family, const void *cfg, -			   struct lwtunnel_state **ts) +			   struct lwtunnel_state **ts, +			   struct netlink_ext_ack *extack)  {  	struct nlattr *tb[LWT_BPF_MAX + 1];  	struct lwtunnel_state *newts; @@ -250,7 +251,7 @@ static int bpf_build_state(struct nlattr *nla,  	if (family != AF_INET && family != AF_INET6)  		return -EAFNOSUPPORT; -	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, NULL); +	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);  	if (ret < 0)  		return ret; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index cfae3d5fe11f..d9cb3532f1dd 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -103,37 +103,53 @@ EXPORT_SYMBOL(lwtunnel_encap_del_ops);  int lwtunnel_build_state(u16 encap_type,  			 struct nlattr *encap, unsigned int family, -			 const void *cfg, struct lwtunnel_state **lws) +			 const void *cfg, struct lwtunnel_state **lws, +			 struct netlink_ext_ack *extack)  {  	const struct lwtunnel_encap_ops *ops; +	bool found = false;  	int ret = -EINVAL;  	if (encap_type == LWTUNNEL_ENCAP_NONE || -	    encap_type > LWTUNNEL_ENCAP_MAX) +	    encap_type > LWTUNNEL_ENCAP_MAX) { +		NL_SET_ERR_MSG_ATTR(extack, encap, +				    "Unknown LWT encapsulation type");  		return ret; +	}  	ret = -EOPNOTSUPP;  	rcu_read_lock();  	ops = rcu_dereference(lwtun_encaps[encap_type]);  	if (likely(ops && ops->build_state && 
try_module_get(ops->owner))) { -		ret = ops->build_state(encap, family, cfg, lws); +		found = true; +		ret = ops->build_state(encap, family, cfg, lws, extack);  		if (ret)  			module_put(ops->owner);  	}  	rcu_read_unlock(); +	/* don't rely on -EOPNOTSUPP to detect match as build_state +	 * handlers could return it +	 */ +	if (!found) { +		NL_SET_ERR_MSG_ATTR(extack, encap, +				    "LWT encapsulation type not supported"); +	} +  	return ret;  }  EXPORT_SYMBOL(lwtunnel_build_state); -int lwtunnel_valid_encap_type(u16 encap_type) +int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack)  {  	const struct lwtunnel_encap_ops *ops;  	int ret = -EINVAL;  	if (encap_type == LWTUNNEL_ENCAP_NONE || -	    encap_type > LWTUNNEL_ENCAP_MAX) +	    encap_type > LWTUNNEL_ENCAP_MAX) { +		NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type");  		return ret; +	}  	rcu_read_lock();  	ops = rcu_dereference(lwtun_encaps[encap_type]); @@ -153,11 +169,16 @@ int lwtunnel_valid_encap_type(u16 encap_type)  		}  	}  #endif -	return ops ? 0 : -EOPNOTSUPP; +	ret = ops ? 0 : -EOPNOTSUPP; +	if (ret < 0) +		NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported"); + +	return ret;  }  EXPORT_SYMBOL(lwtunnel_valid_encap_type); -int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining) +int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, +				   struct netlink_ext_ack *extack)  {  	struct rtnexthop *rtnh = (struct rtnexthop *)attr;  	struct nlattr *nla_entype; @@ -174,7 +195,8 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining)  			if (nla_entype) {  				encap_type = nla_get_u16(nla_entype); -				if (lwtunnel_valid_encap_type(encap_type) != 0) +				if (lwtunnel_valid_encap_type(encap_type, +							      extack) != 0)  					return -EOPNOTSUPP;  			}  		} diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 58b0bcc125b5..e31fc11a8000 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -118,6 +118,50 @@ unsigned long neigh_rand_reach_time(unsigned long base)  EXPORT_SYMBOL(neigh_rand_reach_time); +static bool neigh_del(struct neighbour *n, __u8 state, +		      struct neighbour __rcu **np, struct neigh_table *tbl) +{ +	bool retval = false; + +	write_lock(&n->lock); +	if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state)) { +		struct neighbour *neigh; + +		neigh = rcu_dereference_protected(n->next, +						  lockdep_is_held(&tbl->lock)); +		rcu_assign_pointer(*np, neigh); +		n->dead = 1; +		retval = true; +	} +	write_unlock(&n->lock); +	if (retval) +		neigh_cleanup_and_release(n); +	return retval; +} + +bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) +{ +	struct neigh_hash_table *nht; +	void *pkey = ndel->primary_key; +	u32 hash_val; +	struct neighbour *n; +	struct neighbour __rcu **np; + +	nht = rcu_dereference_protected(tbl->nht, +					lockdep_is_held(&tbl->lock)); +	hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd); +	hash_val = hash_val >> (32 - nht->hash_shift); + +	np = &nht->hash_buckets[hash_val]; +	while ((n = rcu_dereference_protected(*np, +					      lockdep_is_held(&tbl->lock)))) { +		if (n == ndel) +			return neigh_del(n, 0, np, tbl); +		np = &n->next; +	} +	return false; +} +  static int neigh_forced_gc(struct neigh_table *tbl)  {  	int shrunk = 0; @@ -140,19 +184,10 @@ static int neigh_forced_gc(struct neigh_table *tbl)  			 * - nobody refers to it.  			 
* - it is not permanent  			 */ -			write_lock(&n->lock); -			if (atomic_read(&n->refcnt) == 1 && -			    !(n->nud_state & NUD_PERMANENT)) { -				rcu_assign_pointer(*np, -					rcu_dereference_protected(n->next, -						  lockdep_is_held(&tbl->lock))); -				n->dead = 1; -				shrunk	= 1; -				write_unlock(&n->lock); -				neigh_cleanup_and_release(n); +			if (neigh_del(n, NUD_PERMANENT, np, tbl)) { +				shrunk = 1;  				continue;  			} -			write_unlock(&n->lock);  			np = &n->next;  		}  	} @@ -219,7 +254,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)  			neigh_del_timer(n);  			n->dead = 1; -			if (atomic_read(&n->refcnt) != 1) { +			if (refcount_read(&n->refcnt) != 1) {  				/* The most unpleasant situation.  				   We must destroy neighbour entry,  				   but someone still uses it. @@ -300,7 +335,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device  	NEIGH_CACHE_STAT_INC(tbl, allocs);  	n->tbl		  = tbl; -	atomic_set(&n->refcnt, 1); +	refcount_set(&n->refcnt, 1);  	n->dead		  = 1;  out:  	return n; @@ -409,7 +444,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,  	rcu_read_lock_bh();  	n = __neigh_lookup_noref(tbl, pkey, dev);  	if (n) { -		if (!atomic_inc_not_zero(&n->refcnt)) +		if (!refcount_inc_not_zero(&n->refcnt))  			n = NULL;  		NEIGH_CACHE_STAT_INC(tbl, hits);  	} @@ -438,7 +473,7 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,  	     n = rcu_dereference_bh(n->next)) {  		if (!memcmp(n->primary_key, pkey, key_len) &&  		    net_eq(dev_net(n->dev), net)) { -			if (!atomic_inc_not_zero(&n->refcnt)) +			if (!refcount_inc_not_zero(&n->refcnt))  				n = NULL;  			NEIGH_CACHE_STAT_INC(tbl, hits);  			break; @@ -674,7 +709,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms);  static inline void neigh_parms_put(struct neigh_parms *parms)  { -	if (atomic_dec_and_test(&parms->refcnt)) +	if (refcount_dec_and_test(&parms->refcnt))  		neigh_parms_destroy(parms);  } @@ -786,7 +821,7 @@ static void neigh_periodic_work(struct work_struct *work)  			if (time_before(n->used, n->confirmed))  				n->used = n->confirmed; -			if (atomic_read(&n->refcnt) == 1 && +			if (refcount_read(&n->refcnt) == 1 &&  			    (state == NUD_FAILED ||  			     time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {  				*np = n->next; @@ -1132,10 +1167,6 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,  		lladdr = neigh->ha;  	} -	if (new & NUD_CONNECTED) -		neigh->confirmed = jiffies; -	neigh->updated = jiffies; -  	/* If entry was valid and address is not changed,  	   do not change entry state, if new one is STALE.  	 */ @@ -1157,6 +1188,16 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,  		}  	} +	/* Update timestamps only once we know we will make a change to the +	 * neighbour entry. Otherwise we risk to move the locktime window with +	 * noop updates and ignore relevant ARP updates. 
+	 */ +	if (new != old || lladdr != neigh->ha) { +		if (new & NUD_CONNECTED) +			neigh->confirmed = jiffies; +		neigh->updated = jiffies; +	} +  	if (new != old) {  		neigh_del_timer(neigh);  		if (new & NUD_PROBE) @@ -1438,7 +1479,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,  	p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);  	if (p) {  		p->tbl		  = tbl; -		atomic_set(&p->refcnt, 1); +		refcount_set(&p->refcnt, 1);  		p->reachable_time =  				neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));  		dev_hold(dev); @@ -1501,7 +1542,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)  	INIT_LIST_HEAD(&tbl->parms_list);  	list_add(&tbl->parms.list, &tbl->parms_list);  	write_pnet(&tbl->parms.net, &init_net); -	atomic_set(&tbl->parms.refcnt, 1); +	refcount_set(&tbl->parms.refcnt, 1);  	tbl->parms.reachable_time =  			  neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); @@ -1643,7 +1684,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,  			   NEIGH_UPDATE_F_OVERRIDE |  			   NEIGH_UPDATE_F_ADMIN,  			   NETLINK_CB(skb).portid); +	write_lock_bh(&tbl->lock);  	neigh_release(neigh); +	neigh_remove_one(neigh, tbl); +	write_unlock_bh(&tbl->lock);  out:  	return err; @@ -1752,7 +1796,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)  	if ((parms->dev &&  	     nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || -	    nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) || +	    nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) ||  	    nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,  			NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||  	    /* approximative value for deprecated QUEUE_LEN (in packets) */ @@ -2190,7 +2234,7 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,  	ci.ndm_used	 = jiffies_to_clock_t(now - neigh->used);  	ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);  	ci.ndm_updated	 = jiffies_to_clock_t(now - neigh->updated); -	ci.ndm_refcnt	 = atomic_read(&neigh->refcnt) - 1; +	ci.ndm_refcnt	 = refcount_read(&neigh->refcnt) - 1;  	read_unlock_bh(&neigh->lock);  	if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 14d09345f00d..4847964931df 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -363,15 +363,10 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)  	netif_addr_lock_bh(dev);  	netdev_for_each_mc_addr(ha, dev) { -		int i; - -		seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, -			   dev->name, ha->refcount, ha->global_use); - -		for (i = 0; i < dev->addr_len; i++) -			seq_printf(seq, "%02x", ha->addr[i]); - -		seq_putc(seq, '\n'); +		seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n", +			   dev->ifindex, dev->name, +			   ha->refcount, ha->global_use, +			   (int)dev->addr_len, ha->addr);  	}  	netif_addr_unlock_bh(dev);  	return 0; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 65ea0ff4017c..b4f9922b6f23 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -323,7 +323,11 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);  static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)  { -	int res, orig_len = dev->tx_queue_len; +	unsigned int orig_len = dev->tx_queue_len; +	int res; + +	if (new_len != (unsigned int)new_len) +		return -ERANGE;  	if (new_len != orig_len) {  		dev->tx_queue_len = new_len; @@ -349,7 +353,7 @@ static ssize_t tx_queue_len_store(struct device *dev,  	return 
netdev_store(dev, attr, buf, len, change_tx_queue_len);  } -NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); +NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);  static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)  { @@ -622,7 +626,7 @@ static struct attribute *netstat_attrs[] = {  }; -static struct attribute_group netstat_group = { +static const struct attribute_group netstat_group = {  	.name  = "statistics",  	.attrs  = netstat_attrs,  }; @@ -632,7 +636,7 @@ static struct attribute *wireless_attrs[] = {  	NULL  }; -static struct attribute_group wireless_group = { +static const struct attribute_group wireless_group = {  	.name = "wireless",  	.attrs = wireless_attrs,  }; @@ -1200,7 +1204,7 @@ static struct attribute *dql_attrs[] = {  	NULL  }; -static struct attribute_group dql_group = { +static const struct attribute_group dql_group = {  	.name  = "byte_queue_limits",  	.attrs  = dql_attrs,  }; @@ -1444,7 +1448,7 @@ static void *net_grab_current_ns(void)  	struct net *ns = current->nsproxy->net_ns;  #ifdef CONFIG_NET_NS  	if (ns) -		atomic_inc(&ns->passive); +		refcount_inc(&ns->passive);  #endif  	return ns;  } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1934efd4a9d4..8726d051f31d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -284,7 +284,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)  	LIST_HEAD(net_exit_list);  	atomic_set(&net->count, 1); -	atomic_set(&net->passive, 1); +	refcount_set(&net->passive, 1);  	net->dev_base_seq = 1;  	net->user_ns = user_ns;  	idr_init(&net->netns_ids); @@ -315,6 +315,25 @@ out_undo:  	goto out;  } +static int __net_init net_defaults_init_net(struct net *net) +{ +	net->core.sysctl_somaxconn = SOMAXCONN; +	return 0; +} + +static struct pernet_operations net_defaults_ops = { +	.init = net_defaults_init_net, +}; + +static __init int net_defaults_init(void) +{ +	if (register_pernet_subsys(&net_defaults_ops)) +		panic("Cannot initialize net default settings"); + +	return 0; +} + +core_initcall(net_defaults_init);  #ifdef CONFIG_NET_NS  static struct ucounts *inc_net_namespaces(struct user_namespace *ns) @@ -361,7 +380,7 @@ static void net_free(struct net *net)  void net_drop_ns(void *p)  {  	struct net *ns = p; -	if (ns && atomic_dec_and_test(&ns->passive)) +	if (ns && refcount_dec_and_test(&ns->passive))  		net_free(ns);  } @@ -482,6 +501,23 @@ static void cleanup_net(struct work_struct *work)  		net_drop_ns(net);  	}  } + +/** + * net_ns_barrier - wait until concurrent net_cleanup_work is done + * + * cleanup_net runs from work queue and will first remove namespaces + * from the global list, then run net exit functions. + * + * Call this in module exit path to make sure that all netns + * ->exit ops have been invoked before the function is removed. 
+ */ +void net_ns_barrier(void) +{ +	mutex_lock(&net_mutex); +	mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL(net_ns_barrier); +  static DECLARE_WORK(net_cleanup_work, cleanup_net);  void __put_net(struct net *net) @@ -577,6 +613,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,  {  	struct net *net = sock_net(skb->sk);  	struct nlattr *tb[NETNSA_MAX + 1]; +	struct nlattr *nla;  	struct net *peer;  	int nsid, err; @@ -584,23 +621,35 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,  			  rtnl_net_policy, extack);  	if (err < 0)  		return err; -	if (!tb[NETNSA_NSID]) +	if (!tb[NETNSA_NSID]) { +		NL_SET_ERR_MSG(extack, "nsid is missing");  		return -EINVAL; +	}  	nsid = nla_get_s32(tb[NETNSA_NSID]); -	if (tb[NETNSA_PID]) +	if (tb[NETNSA_PID]) {  		peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); -	else if (tb[NETNSA_FD]) +		nla = tb[NETNSA_PID]; +	} else if (tb[NETNSA_FD]) {  		peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); -	else +		nla = tb[NETNSA_FD]; +	} else { +		NL_SET_ERR_MSG(extack, "Peer netns reference is missing");  		return -EINVAL; -	if (IS_ERR(peer)) +	} +	if (IS_ERR(peer)) { +		NL_SET_BAD_ATTR(extack, nla); +		NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");  		return PTR_ERR(peer); +	}  	spin_lock_bh(&net->nsid_lock);  	if (__peernet2id(net, peer) >= 0) {  		spin_unlock_bh(&net->nsid_lock);  		err = -EEXIST; +		NL_SET_BAD_ATTR(extack, nla); +		NL_SET_ERR_MSG(extack, +			       "Peer netns already has a nsid assigned");  		goto out;  	} @@ -609,6 +658,10 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,  	if (err >= 0) {  		rtnl_net_notifyid(net, RTM_NEWNSID, err);  		err = 0; +	} else if (err == -ENOSPC && nsid >= 0) { +		err = -EEXIST; +		NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]); +		NL_SET_ERR_MSG(extack, "The specified nsid is already used");  	}  out:  	put_net(peer); @@ -651,6 +704,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,  {  	struct net *net = sock_net(skb->sk);  	struct nlattr *tb[NETNSA_MAX + 1]; +	struct nlattr *nla;  	struct sk_buff *msg;  	struct net *peer;  	int err, id; @@ -659,15 +713,22 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,  			  rtnl_net_policy, extack);  	if (err < 0)  		return err; -	if (tb[NETNSA_PID]) +	if (tb[NETNSA_PID]) {  		peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); -	else if (tb[NETNSA_FD]) +		nla = tb[NETNSA_PID]; +	} else if (tb[NETNSA_FD]) {  		peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); -	else +		nla = tb[NETNSA_FD]; +	} else { +		NL_SET_ERR_MSG(extack, "Peer netns reference is missing");  		return -EINVAL; +	} -	if (IS_ERR(peer)) +	if (IS_ERR(peer)) { +		NL_SET_BAD_ATTR(extack, nla); +		NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");  		return PTR_ERR(peer); +	}  	msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);  	if (!msg) { diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 29be2466970c..d3408a693166 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -277,7 +277,7 @@ static void zap_completion_queue(void)  			struct sk_buff *skb = clist;  			clist = clist->next;  			if (!skb_irq_freeable(skb)) { -				atomic_inc(&skb->users); +				refcount_inc(&skb->users);  				dev_kfree_skb_any(skb); /* put this one back */  			} else {  				__kfree_skb(skb); @@ -309,7 +309,7 @@ repeat:  		return NULL;  	} -	atomic_set(&skb->users, 1); +	refcount_set(&skb->users, 1);  	skb_reserve(skb, reserve);  	return skb;  } @@ -441,7 +441,7 @@ void netpoll_send_udp(struct 
netpoll *np, const char *msg, int len)  		ip6h->saddr = np->local_ip.in6;  		ip6h->daddr = np->remote_ip.in6; -		eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); +		eth = skb_push(skb, ETH_HLEN);  		skb_reset_mac_header(skb);  		skb->protocol = eth->h_proto = htons(ETH_P_IPV6);  	} else { @@ -470,7 +470,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)  		put_unaligned(np->remote_ip.ip, &(iph->daddr));  		iph->check    = ip_fast_csum((unsigned char *)iph, iph->ihl); -		eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); +		eth = skb_push(skb, ETH_HLEN);  		skb_reset_mac_header(skb);  		skb->protocol = eth->h_proto = htons(ETH_P_IP);  	} @@ -632,7 +632,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)  		skb_queue_head_init(&npinfo->txq);  		INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); -		atomic_set(&npinfo->refcnt, 1); +		refcount_set(&npinfo->refcnt, 1);  		ops = np->dev->netdev_ops;  		if (ops->ndo_netpoll_setup) { @@ -642,7 +642,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)  		}  	} else {  		npinfo = rtnl_dereference(ndev->npinfo); -		atomic_inc(&npinfo->refcnt); +		refcount_inc(&npinfo->refcnt);  	}  	npinfo->netpoll = np; @@ -821,7 +821,7 @@ void __netpoll_cleanup(struct netpoll *np)  	synchronize_srcu(&netpoll_srcu); -	if (atomic_dec_and_test(&npinfo->refcnt)) { +	if (refcount_dec_and_test(&npinfo->refcnt)) {  		const struct net_device_ops *ops;  		ops = np->dev->netdev_ops; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 96947f5d41e4..6e1e10ff433a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2675,7 +2675,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,  				goto err;  			}  			/* restore ll */ -			eth = (struct ethhdr *)skb_push(skb, ETH_HLEN); +			eth = skb_push(skb, ETH_HLEN);  			memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN);  			eth->h_proto = protocol; @@ -2714,11 +2714,11 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,  	struct timeval timestamp;  	struct pktgen_hdr *pgh; -	pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh)); +	pgh = skb_put(skb, sizeof(*pgh));  	datalen -= sizeof(*pgh);  	if (pkt_dev->nfrags <= 0) { -		memset(skb_put(skb, datalen), 0, datalen); +		skb_put_zero(skb, datalen);  	} else {  		int frags = pkt_dev->nfrags;  		int i, len; @@ -2729,7 +2729,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,  			frags = MAX_SKB_FRAGS;  		len = datalen - frags * PAGE_SIZE;  		if (len > 0) { -			memset(skb_put(skb, len), 0, len); +			skb_put_zero(skb, len);  			datalen = frags * PAGE_SIZE;  		} @@ -2844,34 +2844,35 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,  	skb_reserve(skb, 16);  	/*  Reserve for ethernet and IP header  */ -	eth = (__u8 *) skb_push(skb, 14); -	mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32)); +	eth = skb_push(skb, 14); +	mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32));  	if (pkt_dev->nr_labels)  		mpls_push(mpls, pkt_dev);  	if (pkt_dev->vlan_id != 0xffff) {  		if (pkt_dev->svlan_id != 0xffff) { -			svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); +			svlan_tci = skb_put(skb, sizeof(__be16));  			*svlan_tci = build_tci(pkt_dev->svlan_id,  					       pkt_dev->svlan_cfi,  					       pkt_dev->svlan_p); -			svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); +			svlan_encapsulated_proto = skb_put(skb, +							   sizeof(__be16));  			*svlan_encapsulated_proto = htons(ETH_P_8021Q);  		} -		vlan_tci = (__be16 *)skb_put(skb, 
sizeof(__be16)); +		vlan_tci = skb_put(skb, sizeof(__be16));  		*vlan_tci = build_tci(pkt_dev->vlan_id,  				      pkt_dev->vlan_cfi,  				      pkt_dev->vlan_p); -		vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); +		vlan_encapsulated_proto = skb_put(skb, sizeof(__be16));  		*vlan_encapsulated_proto = htons(ETH_P_IP);  	}  	skb_reset_mac_header(skb);  	skb_set_network_header(skb, skb->len); -	iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr)); +	iph = skb_put(skb, sizeof(struct iphdr));  	skb_set_transport_header(skb, skb->len); -	udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr)); +	udph = skb_put(skb, sizeof(struct udphdr));  	skb_set_queue_mapping(skb, queue_map);  	skb->priority = pkt_dev->skb_priority; @@ -2971,34 +2972,35 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,  	skb_reserve(skb, 16);  	/*  Reserve for ethernet and IP header  */ -	eth = (__u8 *) skb_push(skb, 14); -	mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32)); +	eth = skb_push(skb, 14); +	mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32));  	if (pkt_dev->nr_labels)  		mpls_push(mpls, pkt_dev);  	if (pkt_dev->vlan_id != 0xffff) {  		if (pkt_dev->svlan_id != 0xffff) { -			svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); +			svlan_tci = skb_put(skb, sizeof(__be16));  			*svlan_tci = build_tci(pkt_dev->svlan_id,  					       pkt_dev->svlan_cfi,  					       pkt_dev->svlan_p); -			svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); +			svlan_encapsulated_proto = skb_put(skb, +							   sizeof(__be16));  			*svlan_encapsulated_proto = htons(ETH_P_8021Q);  		} -		vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); +		vlan_tci = skb_put(skb, sizeof(__be16));  		*vlan_tci = build_tci(pkt_dev->vlan_id,  				      pkt_dev->vlan_cfi,  				      pkt_dev->vlan_p); -		vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); +		vlan_encapsulated_proto = skb_put(skb, sizeof(__be16));  		*vlan_encapsulated_proto = htons(ETH_P_IPV6);  	}  	skb_reset_mac_header(skb);  	skb_set_network_header(skb, skb->len); -	iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); +	iph = skb_put(skb, sizeof(struct ipv6hdr));  	skb_set_transport_header(skb, skb->len); -	udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr)); +	udph = skb_put(skb, sizeof(struct udphdr));  	skb_set_queue_mapping(skb, queue_map);  	skb->priority = pkt_dev->skb_priority; @@ -3361,7 +3363,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)  {  	ktime_t idle_start = ktime_get(); -	while (atomic_read(&(pkt_dev->skb->users)) != 1) { +	while (refcount_read(&(pkt_dev->skb->users)) != 1) {  		if (signal_pending(current))  			break; @@ -3418,7 +3420,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  	if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {  		skb = pkt_dev->skb;  		skb->protocol = eth_type_trans(skb, skb->dev); -		atomic_add(burst, &skb->users); +		refcount_add(burst, &skb->users);  		local_bh_disable();  		do {  			ret = netif_receive_skb(skb); @@ -3426,11 +3428,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  				pkt_dev->errors++;  			pkt_dev->sofar++;  			pkt_dev->seq_num++; -			if (atomic_read(&skb->users) != burst) { +			if (refcount_read(&skb->users) != burst) {  				/* skb was queued by rps/rfs or taps,  				 * so cannot reuse this skb  				 */ -				atomic_sub(burst - 1, &skb->users); +				WARN_ON(refcount_sub_and_test(burst - 1, &skb->users));  				/* get out of the loop and wait  				 * until skb is consumed  				 */ @@ -3444,7 
+3446,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  		goto out; /* Skips xmit_mode M_START_XMIT */  	} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {  		local_bh_disable(); -		atomic_inc(&pkt_dev->skb->users); +		refcount_inc(&pkt_dev->skb->users);  		ret = dev_queue_xmit(pkt_dev->skb);  		switch (ret) { @@ -3485,7 +3487,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  		pkt_dev->last_ok = 0;  		goto unlock;  	} -	atomic_add(burst, &pkt_dev->skb->users); +	refcount_add(burst, &pkt_dev->skb->users);  xmit_more:  	ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0); @@ -3511,11 +3513,11 @@ xmit_more:  		/* fallthru */  	case NETDEV_TX_BUSY:  		/* Retry it next time */ -		atomic_dec(&(pkt_dev->skb->users)); +		refcount_dec(&(pkt_dev->skb->users));  		pkt_dev->last_ok = 0;  	}  	if (unlikely(burst)) -		atomic_sub(burst, &pkt_dev->skb->users); +		WARN_ON(refcount_sub_and_test(burst, &pkt_dev->skb->users));  unlock:  	HARD_TX_UNLOCK(odev, txq); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bcb0f610ee42..d1ba90980be1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -16,6 +16,7 @@   *	Vitaly E. Lavrov		RTA_OK arithmetics was wrong.   */ +#include <linux/bitops.h>  #include <linux/errno.h>  #include <linux/module.h>  #include <linux/types.h> @@ -39,6 +40,7 @@  #include <linux/if_vlan.h>  #include <linux/pci.h>  #include <linux/etherdevice.h> +#include <linux/bpf.h>  #include <linux/uaccess.h> @@ -647,7 +649,7 @@ int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int g  	NETLINK_CB(skb).dst_group = group;  	if (echo) -		atomic_inc(&skb->users); +		refcount_inc(&skb->users);  	netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);  	if (echo)  		err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); @@ -900,7 +902,7 @@ static size_t rtnl_xdp_size(void)  {  	size_t xdp_size = nla_total_size(0) +	/* nest IFLA_XDP */  			  nla_total_size(1) +	/* XDP_ATTACHED */ -			  nla_total_size(4);	/* XDP_FLAGS */ +			  nla_total_size(4);	/* XDP_PROG_ID */  	return xdp_size;  } @@ -932,6 +934,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,  	       + nla_total_size(1) /* IFLA_LINKMODE */  	       + nla_total_size(4) /* IFLA_CARRIER_CHANGES */  	       + nla_total_size(4) /* IFLA_LINK_NETNSID */ +	       + nla_total_size(4) /* IFLA_GROUP */  	       + nla_total_size(ext_filter_mask  			        & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */  	       + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ @@ -942,6 +945,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,  	       + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */  	       + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */  	       + rtnl_xdp_size() /* IFLA_XDP */ +	       + nla_total_size(4)  /* IFLA_EVENT */  	       + nla_total_size(1); /* IFLA_PROTO_DOWN */  } @@ -1125,6 +1129,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,  	struct ifla_vf_mac vf_mac;  	struct ifla_vf_info ivi; +	memset(&ivi, 0, sizeof(ivi)); +  	/* Not all SR-IOV capable drivers support the  	 * spoofcheck and "RSS query enable" query.  
Preset to  	 * -1 so the user space tool can detect that the driver @@ -1133,7 +1139,6 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,  	ivi.spoofchk = -1;  	ivi.rss_query_en = -1;  	ivi.trusted = -1; -	memset(ivi.mac, 0, sizeof(ivi.mac));  	/* The default value for VF link state is "auto"  	 * IFLA_VF_LINK_STATE_AUTO which equals zero  	 */ @@ -1247,37 +1252,46 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)  	return 0;  } +static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) +{ +	const struct net_device_ops *ops = dev->netdev_ops; +	const struct bpf_prog *generic_xdp_prog; + +	ASSERT_RTNL(); + +	*prog_id = 0; +	generic_xdp_prog = rtnl_dereference(dev->xdp_prog); +	if (generic_xdp_prog) { +		*prog_id = generic_xdp_prog->aux->id; +		return XDP_ATTACHED_SKB; +	} +	if (!ops->ndo_xdp) +		return XDP_ATTACHED_NONE; + +	return __dev_xdp_attached(dev, ops->ndo_xdp, prog_id); +} +  static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)  {  	struct nlattr *xdp; -	u32 xdp_flags = 0; -	u8 val = 0; +	u32 prog_id;  	int err;  	xdp = nla_nest_start(skb, IFLA_XDP);  	if (!xdp)  		return -EMSGSIZE; -	if (rcu_access_pointer(dev->xdp_prog)) { -		xdp_flags = XDP_FLAGS_SKB_MODE; -		val = 1; -	} else if (dev->netdev_ops->ndo_xdp) { -		struct netdev_xdp xdp_op = {}; - -		xdp_op.command = XDP_QUERY_PROG; -		err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); -		if (err) -			goto err_cancel; -		val = xdp_op.prog_attached; -	} -	err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val); + +	err = nla_put_u8(skb, IFLA_XDP_ATTACHED, +			 rtnl_xdp_attached_mode(dev, &prog_id));  	if (err)  		goto err_cancel; -	if (xdp_flags) { -		err = nla_put_u32(skb, IFLA_XDP_FLAGS, xdp_flags); +	if (prog_id) { +		err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);  		if (err)  			goto err_cancel;  	} +  	nla_nest_end(skb, xdp);  	return 0; @@ -1286,9 +1300,40 @@ err_cancel:  	return err;  } +static u32 rtnl_get_event(unsigned long event) +{ +	u32 rtnl_event_type = IFLA_EVENT_NONE; + +	switch (event) { +	case NETDEV_REBOOT: +		rtnl_event_type = IFLA_EVENT_REBOOT; +		break; +	case NETDEV_FEAT_CHANGE: +		rtnl_event_type = IFLA_EVENT_FEATURES; +		break; +	case NETDEV_BONDING_FAILOVER: +		rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; +		break; +	case NETDEV_NOTIFY_PEERS: +		rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; +		break; +	case NETDEV_RESEND_IGMP: +		rtnl_event_type = IFLA_EVENT_IGMP_RESEND; +		break; +	case NETDEV_CHANGEINFODATA: +		rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; +		break; +	default: +		break; +	} + +	return rtnl_event_type; +} +  static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  			    int type, u32 pid, u32 seq, u32 change, -			    unsigned int flags, u32 ext_filter_mask) +			    unsigned int flags, u32 ext_filter_mask, +			    u32 event)  {  	struct ifinfomsg *ifm;  	struct nlmsghdr *nlh; @@ -1337,6 +1382,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))  		goto nla_put_failure; +	if (event != IFLA_EVENT_NONE) { +		if (nla_put_u32(skb, IFLA_EVENT, event)) +			goto nla_put_failure; +	} +  	if (rtnl_fill_link_ifmap(skb, dev))  		goto nla_put_failure; @@ -1471,6 +1521,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {  	[IFLA_LINK_NETNSID]	= { .type = NLA_S32 },  	[IFLA_PROTO_DOWN]	= { .type = NLA_U8 },  	[IFLA_XDP]		= { .type = NLA_NESTED }, +	[IFLA_EVENT]		= { .type = NLA_U32 }, +	[IFLA_GROUP]		= { .type = NLA_U32 },  };  
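/*
 * Editor's note: illustrative userspace sketch, not part of the patch above.
 * It shows how a listener on RTNLGRP_LINK could decode the two new link
 * attributes introduced in this hunk, IFLA_EVENT and the nested
 * IFLA_XDP_PROG_ID, from an RTM_NEWLINK message.  It assumes uapi headers
 * recent enough to define both attributes; everything else (function name,
 * printing) is invented for the example.
 */
#include <stdio.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>

static void dump_link_event(struct nlmsghdr *nlh)
{
	struct ifinfomsg *ifi = NLMSG_DATA(nlh);
	struct rtattr *rta = IFLA_RTA(ifi);
	int len = IFLA_PAYLOAD(nlh);

	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		if (rta->rta_type == IFLA_EVENT) {
			/* IFLA_EVENT carries one of the IFLA_EVENT_* codes
			 * filled in by rtnl_get_event() above.
			 */
			printf("ifindex %d event %u\n", ifi->ifi_index,
			       *(unsigned int *)RTA_DATA(rta));
		} else if (rta->rta_type == IFLA_XDP) {
			/* Walk the IFLA_XDP nest for the program id that
			 * rtnl_xdp_fill() now reports when a program is
			 * attached.
			 */
			struct rtattr *xdp = RTA_DATA(rta);
			int xlen = RTA_PAYLOAD(rta);

			for (; RTA_OK(xdp, xlen); xdp = RTA_NEXT(xdp, xlen))
				if (xdp->rta_type == IFLA_XDP_PROG_ID)
					printf("xdp prog id %u\n",
					       *(unsigned int *)RTA_DATA(xdp));
		}
	}
}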
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1518,6 +1570,7 @@ static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {  	[IFLA_XDP_FD]		= { .type = NLA_S32 },  	[IFLA_XDP_ATTACHED]	= { .type = NLA_U8 },  	[IFLA_XDP_FLAGS]	= { .type = NLA_U32 }, +	[IFLA_XDP_PROG_ID]	= { .type = NLA_U32 },  };  static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) @@ -1630,14 +1683,14 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)  					       NETLINK_CB(cb->skb).portid,  					       cb->nlh->nlmsg_seq, 0,  					       flags, -					       ext_filter_mask); -			/* If we ran out of room on the first message, -			 * we're in trouble -			 */ -			WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); +					       ext_filter_mask, 0); -			if (err < 0) -				goto out; +			if (err < 0) { +				if (likely(skb->len)) +					goto out; + +				goto out_err; +			}  			nl_dump_check_consistent(cb, nlmsg_hdr(skb));  cont: @@ -1645,10 +1698,12 @@ cont:  		}  	}  out: +	err = skb->len; +out_err:  	cb->args[1] = idx;  	cb->args[0] = h; -	return skb->len; +	return err;  }  int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, @@ -2050,8 +2105,8 @@ static int do_setlink(const struct sk_buff *skb,  	}  	if (tb[IFLA_TXQLEN]) { -		unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); -		unsigned long orig_len = dev->tx_queue_len; +		unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); +		unsigned int orig_len = dev->tx_queue_len;  		if (dev->tx_queue_len ^ value) {  			dev->tx_queue_len = value; @@ -2188,7 +2243,7 @@ static int do_setlink(const struct sk_buff *skb,  		if (err < 0)  			goto errout; -		if (xdp[IFLA_XDP_ATTACHED]) { +		if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) {  			err = -EINVAL;  			goto errout;  		} @@ -2199,6 +2254,10 @@ static int do_setlink(const struct sk_buff *skb,  				err = -EINVAL;  				goto errout;  			} +			if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) { +				err = -EINVAL; +				goto errout; +			}  		}  		if (xdp[IFLA_XDP_FD]) { @@ -2523,7 +2582,7 @@ replay:  				data = attr;  			}  			if (ops->validate) { -				err = ops->validate(tb, data); +				err = ops->validate(tb, data, extack);  				if (err < 0)  					return err;  			} @@ -2542,7 +2601,8 @@ replay:  				slave_data = slave_attr;  			}  			if (m_ops->slave_validate) { -				err = m_ops->slave_validate(tb, slave_data); +				err = m_ops->slave_validate(tb, slave_data, +							    extack);  				if (err < 0)  					return err;  			} @@ -2561,7 +2621,7 @@ replay:  				    !ops->changelink)  					return -EOPNOTSUPP; -				err = ops->changelink(dev, tb, data); +				err = ops->changelink(dev, tb, data, extack);  				if (err < 0)  					return err;  				status |= DO_SETLINK_NOTIFY; @@ -2572,7 +2632,8 @@ replay:  					return -EOPNOTSUPP;  				err = m_ops->slave_changelink(master_dev, dev, -							      tb, slave_data); +							      tb, slave_data, +							      extack);  				if (err < 0)  					return err;  				status |= DO_SETLINK_NOTIFY; @@ -2646,7 +2707,8 @@ replay:  		dev->ifindex = ifm->ifi_index;  		if (ops->newlink) { -			err = ops->newlink(link_net ? : net, dev, tb, data); +			err = ops->newlink(link_net ? : net, dev, tb, data, +					   extack);  			/* Drivers should call free_netdev() in ->destructor  			 * and unregister it on failure after registration  			 * so that device could be finally freed in rtnl_unlock. 
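/*
 * Editor's note: illustrative sketch, not part of the patch.  With the
 * netlink_ext_ack pointer now threaded into ->validate() and ->newlink()
 * (see the rtnl_newlink() changes above), a link driver can return a precise
 * reason string to user space.  The ops signatures below follow the calls in
 * this hunk; the "foo" driver, its checks, and its messages are invented for
 * the example, and setup/priv_size are omitted for brevity.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <net/netlink.h>
#include <net/rtnetlink.h>

static int foo_validate(struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS] && nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
		NL_SET_ERR_MSG(extack, "Invalid link layer address length");
		return -EINVAL;
	}
	return 0;
}

static int foo_newlink(struct net *src_net, struct net_device *dev,
		       struct nlattr *tb[], struct nlattr *data[],
		       struct netlink_ext_ack *extack)
{
	if (!tb[IFLA_LINK]) {
		NL_SET_ERR_MSG(extack, "A lower device (IFLA_LINK) is required");
		return -EINVAL;
	}
	return register_netdevice(dev);
}

static struct rtnl_link_ops foo_link_ops __read_mostly = {
	.kind		= "foo",
	.validate	= foo_validate,
	.newlink	= foo_newlink,
};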
@@ -2733,7 +2795,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,  		return -ENOBUFS;  	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, -			       nlh->nlmsg_seq, 0, 0, ext_filter_mask); +			       nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0);  	if (err < 0) {  		/* -EMSGSIZE implies BUG in if_nlmsg_size */  		WARN_ON(err == -EMSGSIZE); @@ -2805,7 +2867,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)  }  struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, -				       unsigned int change, gfp_t flags) +				       unsigned int change, +				       u32 event, gfp_t flags)  {  	struct net *net = dev_net(dev);  	struct sk_buff *skb; @@ -2816,7 +2879,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,  	if (skb == NULL)  		goto errout; -	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); +	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event);  	if (err < 0) {  		/* -EMSGSIZE implies BUG in if_nlmsg_size() */  		WARN_ON(err == -EMSGSIZE); @@ -2837,18 +2900,25 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)  	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);  } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, -		  gfp_t flags) +static void rtmsg_ifinfo_event(int type, struct net_device *dev, +			       unsigned int change, u32 event, +			       gfp_t flags)  {  	struct sk_buff *skb;  	if (dev->reg_state != NETREG_REGISTERED)  		return; -	skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); +	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags);  	if (skb)  		rtmsg_ifinfo_send(skb, dev, flags);  } + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, +		  gfp_t flags) +{ +	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags); +}  EXPORT_SYMBOL(rtmsg_ifinfo);  static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -3228,8 +3298,11 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)  	int err = 0;  	int fidx = 0; -	if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, -			IFLA_MAX, ifla_policy, NULL) == 0) { +	err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, +			  IFLA_MAX, ifla_policy, NULL); +	if (err < 0) { +		return -EINVAL; +	} else if (err == 0) {  		if (tb[IFLA_MASTER])  			br_idx = nla_get_u32(tb[IFLA_MASTER]);  	} @@ -3452,8 +3525,12 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)  				err = br_dev->netdev_ops->ndo_bridge_getlink(  						skb, portid, seq, dev,  						filter_mask, NLM_F_MULTI); -				if (err < 0 && err != -EOPNOTSUPP) -					break; +				if (err < 0 && err != -EOPNOTSUPP) { +					if (likely(skb->len)) +						break; + +					goto out_err; +				}  			}  			idx++;  		} @@ -3464,16 +3541,22 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)  							      seq, dev,  							      filter_mask,  							      NLM_F_MULTI); -				if (err < 0 && err != -EOPNOTSUPP) -					break; +				if (err < 0 && err != -EOPNOTSUPP) { +					if (likely(skb->len)) +						break; + +					goto out_err; +				}  			}  			idx++;  		}  	} +	err = skb->len; +out_err:  	rcu_read_unlock();  	cb->args[0] = idx; -	return skb->len; +	return err;  }  static inline size_t bridge_nlmsg_size(void) @@ -4140,6 +4223,18 @@ static void rtnetlink_rcv(struct sk_buff *skb)  	rtnl_unlock();  } +static int rtnetlink_bind(struct net *net, int group) +{ +	switch (group) { +	case 
RTNLGRP_IPV4_MROUTE_R: +	case RTNLGRP_IPV6_MROUTE_R: +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) +			return -EPERM; +		break; +	} +	return 0; +} +  static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)  {  	struct net_device *dev = netdev_notifier_info_to_dev(ptr); @@ -4152,7 +4247,8 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi  	case NETDEV_NOTIFY_PEERS:  	case NETDEV_RESEND_IGMP:  	case NETDEV_CHANGEINFODATA: -		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); +		rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), +				   GFP_KERNEL);  		break;  	default:  		break; @@ -4173,6 +4269,7 @@ static int __net_init rtnetlink_net_init(struct net *net)  		.input		= rtnetlink_rcv,  		.cb_mutex	= &rtnl_mutex,  		.flags		= NL_CFG_F_NONROOT_RECV, +		.bind		= rtnetlink_bind,  	};  	sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 6bd2f8fb0476..7232274de334 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -24,9 +24,13 @@ static siphash_key_t ts_secret __read_mostly;  static __always_inline void net_secret_init(void)  { -	net_get_random_once(&ts_secret, sizeof(ts_secret));  	net_get_random_once(&net_secret, sizeof(net_secret));  } + +static __always_inline void ts_secret_init(void) +{ +	net_get_random_once(&ts_secret, sizeof(ts_secret)); +}  #endif  #ifdef CONFIG_INET @@ -47,7 +51,8 @@ static u32 seq_scale(u32 seq)  #endif  #if IS_ENABLED(CONFIG_IPV6) -static u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr) +u32 secure_tcpv6_ts_off(const struct net *net, +			const __be32 *saddr, const __be32 *daddr)  {  	const struct {  		struct in6_addr saddr; @@ -57,15 +62,17 @@ static u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)  		.daddr = *(struct in6_addr *)daddr,  	}; -	if (sysctl_tcp_timestamps != 1) +	if (net->ipv4.sysctl_tcp_timestamps != 1)  		return 0; +	ts_secret_init();  	return siphash(&combined, offsetofend(typeof(combined), daddr),  		       &ts_secret);  } +EXPORT_SYMBOL(secure_tcpv6_ts_off); -u32 secure_tcpv6_seq_and_tsoff(const __be32 *saddr, const __be32 *daddr, -			       __be16 sport, __be16 dport, u32 *tsoff) +u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, +		     __be16 sport, __be16 dport)  {  	const struct {  		struct in6_addr saddr; @@ -78,14 +85,14 @@ u32 secure_tcpv6_seq_and_tsoff(const __be32 *saddr, const __be32 *daddr,  		.sport = sport,  		.dport = dport  	}; -	u64 hash; +	u32 hash; +  	net_secret_init();  	hash = siphash(&combined, offsetofend(typeof(combined), dport),  		       &net_secret); -	*tsoff = secure_tcpv6_ts_off(saddr, daddr);  	return seq_scale(hash);  } -EXPORT_SYMBOL(secure_tcpv6_seq_and_tsoff); +EXPORT_SYMBOL(secure_tcpv6_seq);  u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,  			       __be16 dport) @@ -107,11 +114,12 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);  #endif  #ifdef CONFIG_INET -static u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr) +u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)  { -	if (sysctl_tcp_timestamps != 1) +	if (net->ipv4.sysctl_tcp_timestamps != 1)  		return 0; +	ts_secret_init();  	return siphash_2u32((__force u32)saddr, (__force u32)daddr,  			    &ts_secret);  } @@ -121,15 +129,15 @@ static u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr)   * it would be easy enough to have the former function use siphash_4u32, passing   * the arguments as separate u32.   
*/ -u32 secure_tcp_seq_and_tsoff(__be32 saddr, __be32 daddr, -			     __be16 sport, __be16 dport, u32 *tsoff) +u32 secure_tcp_seq(__be32 saddr, __be32 daddr, +		   __be16 sport, __be16 dport)  { -	u64 hash; +	u32 hash; +  	net_secret_init();  	hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,  			    (__force u32)sport << 16 | (__force u32)dport,  			    &net_secret); -	*tsoff = secure_tcp_ts_off(saddr, daddr);  	return seq_scale(hash);  } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 346d3e85dfbc..8b11341ed69a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -176,7 +176,7 @@ struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)  	memset(skb, 0, offsetof(struct sk_buff, tail));  	skb->head = NULL;  	skb->truesize = sizeof(struct sk_buff); -	atomic_set(&skb->users, 1); +	refcount_set(&skb->users, 1);  	skb->mac_header = (typeof(skb->mac_header))~0U;  out: @@ -247,7 +247,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,  	/* Account for allocated memory : skb + skb->head */  	skb->truesize = SKB_TRUESIZE(size);  	skb->pfmemalloc = pfmemalloc; -	atomic_set(&skb->users, 1); +	refcount_set(&skb->users, 1);  	skb->head = data;  	skb->data = data;  	skb_reset_tail_pointer(skb); @@ -268,7 +268,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,  		kmemcheck_annotate_bitfield(&fclones->skb2, flags1);  		skb->fclone = SKB_FCLONE_ORIG; -		atomic_set(&fclones->fclone_ref, 1); +		refcount_set(&fclones->fclone_ref, 1);  		fclones->skb2.fclone = SKB_FCLONE_CLONE;  	} @@ -314,7 +314,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)  	memset(skb, 0, offsetof(struct sk_buff, tail));  	skb->truesize = SKB_TRUESIZE(size); -	atomic_set(&skb->users, 1); +	refcount_set(&skb->users, 1);  	skb->head = data;  	skb->data = data;  	skb_reset_tail_pointer(skb); @@ -629,7 +629,7 @@ static void kfree_skbmem(struct sk_buff *skb)  		 * This test would have no chance to be true for the clone,  		 * while here, branch prediction will be good.  		 
*/ -		if (atomic_read(&fclones->fclone_ref) == 1) +		if (refcount_read(&fclones->fclone_ref) == 1)  			goto fastpath;  		break; @@ -637,18 +637,16 @@ static void kfree_skbmem(struct sk_buff *skb)  		fclones = container_of(skb, struct sk_buff_fclones, skb2);  		break;  	} -	if (!atomic_dec_and_test(&fclones->fclone_ref)) +	if (!refcount_dec_and_test(&fclones->fclone_ref))  		return;  fastpath:  	kmem_cache_free(skbuff_fclone_cache, fclones);  } -static void skb_release_head_state(struct sk_buff *skb) +void skb_release_head_state(struct sk_buff *skb)  {  	skb_dst_drop(skb); -#ifdef CONFIG_XFRM -	secpath_put(skb->sp); -#endif +	secpath_reset(skb);  	if (skb->destructor) {  		WARN_ON(in_irq());  		skb->destructor(skb); @@ -694,12 +692,9 @@ EXPORT_SYMBOL(__kfree_skb);   */  void kfree_skb(struct sk_buff *skb)  { -	if (unlikely(!skb)) -		return; -	if (likely(atomic_read(&skb->users) == 1)) -		smp_rmb(); -	else if (likely(!atomic_dec_and_test(&skb->users))) +	if (!skb_unref(skb))  		return; +  	trace_kfree_skb(skb, __builtin_return_address(0));  	__kfree_skb(skb);  } @@ -746,17 +741,32 @@ EXPORT_SYMBOL(skb_tx_error);   */  void consume_skb(struct sk_buff *skb)  { -	if (unlikely(!skb)) -		return; -	if (likely(atomic_read(&skb->users) == 1)) -		smp_rmb(); -	else if (likely(!atomic_dec_and_test(&skb->users))) +	if (!skb_unref(skb))  		return; +  	trace_consume_skb(skb);  	__kfree_skb(skb);  }  EXPORT_SYMBOL(consume_skb); +/** + *	consume_stateless_skb - free an skbuff, assuming it is stateless + *	@skb: buffer to free + * + *	Works like consume_skb(), but this variant assumes that all the head + *	states have been already dropped. + */ +void consume_stateless_skb(struct sk_buff *skb) +{ +	if (!skb_unref(skb)) +		return; + +	trace_consume_skb(skb); +	if (likely(skb->head)) +		skb_release_data(skb); +	kfree_skbmem(skb); +} +  void __kfree_skb_flush(void)  {  	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -807,10 +817,9 @@ void napi_consume_skb(struct sk_buff *skb, int budget)  		return;  	} -	if (likely(atomic_read(&skb->users) == 1)) -		smp_rmb(); -	else if (likely(!atomic_dec_and_test(&skb->users))) +	if (!skb_unref(skb))  		return; +  	/* if reaching here SKB is ready to free */  	trace_consume_skb(skb); @@ -906,7 +915,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)  	C(head_frag);  	C(data);  	C(truesize); -	atomic_set(&n->users, 1); +	refcount_set(&n->users, 1);  	atomic_inc(&(skb_shinfo(skb)->dataref));  	skb->cloned = 1; @@ -1018,9 +1027,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)  		return NULL;  	if (skb->fclone == SKB_FCLONE_ORIG && -	    atomic_read(&fclones->fclone_ref) == 1) { +	    refcount_read(&fclones->fclone_ref) == 1) {  		n = &fclones->skb2; -		atomic_set(&fclones->fclone_ref, 2); +		refcount_set(&fclones->fclone_ref, 2);  	} else {  		if (skb_pfmemalloc(skb))  			gfp_mask |= __GFP_MEMALLOC; @@ -1412,7 +1421,7 @@ EXPORT_SYMBOL(skb_pad);   *	returned.   */ -unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)  {  	if (tail != skb) {  		skb->data_len += len; @@ -1431,9 +1440,9 @@ EXPORT_SYMBOL_GPL(pskb_put);   *	exceed the total buffer size the kernel will panic. A pointer to the   *	first byte of the extra data is returned.   
*/ -unsigned char *skb_put(struct sk_buff *skb, unsigned int len) +void *skb_put(struct sk_buff *skb, unsigned int len)  { -	unsigned char *tmp = skb_tail_pointer(skb); +	void *tmp = skb_tail_pointer(skb);  	SKB_LINEAR_ASSERT(skb);  	skb->tail += len;  	skb->len  += len; @@ -1452,7 +1461,7 @@ EXPORT_SYMBOL(skb_put);   *	start. If this would exceed the total buffer headroom the kernel will   *	panic. A pointer to the first byte of the extra data is returned.   */ -unsigned char *skb_push(struct sk_buff *skb, unsigned int len) +void *skb_push(struct sk_buff *skb, unsigned int len)  {  	skb->data -= len;  	skb->len  += len; @@ -1472,7 +1481,7 @@ EXPORT_SYMBOL(skb_push);   *	is returned. Once the data has been pulled future pushes will overwrite   *	the old data.   */ -unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) +void *skb_pull(struct sk_buff *skb, unsigned int len)  {  	return skb_pull_inline(skb, len);  } @@ -1607,7 +1616,7 @@ EXPORT_SYMBOL(___pskb_trim);   *   * It is pretty complicated. Luckily, it is called only in exceptional cases.   */ -unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) +void *__pskb_pull_tail(struct sk_buff *skb, int delta)  {  	/* If skb has not enough free space at tail, get new one  	 * plus 128 bytes for future expansions. If we have enough @@ -2243,6 +2252,32 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,  }  EXPORT_SYMBOL(skb_copy_and_csum_bits); +static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) +{ +	net_warn_ratelimited( +		"%s: attempt to compute crc32c without libcrc32c.ko\n", +		__func__); +	return 0; +} + +static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, +				       int offset, int len) +{ +	net_warn_ratelimited( +		"%s: attempt to compute crc32c without libcrc32c.ko\n", +		__func__); +	return 0; +} + +static const struct skb_checksum_ops default_crc32c_ops = { +	.update  = warn_crc32c_csum_update, +	.combine = warn_crc32c_csum_combine, +}; + +const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = +	&default_crc32c_ops; +EXPORT_SYMBOL(crc32c_csum_stub); +   /**   *	skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()   *	@from: source buffer @@ -2620,7 +2655,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)  {  	int pos = skb_headlen(skb); -	skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; +	skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & +				      SKBTX_SHARED_FRAG;  	if (len < pos)	/* Split line is inside header. */  		skb_split_inside_header(skb, skb1, len, pos);  	else		/* Second chunk has no header, nothing to copy. */ @@ -2988,7 +3024,7 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,  		get_page(pfrag->page);  		skb->truesize += copy; -		atomic_add(copy, &sk->sk_wmem_alloc); +		refcount_add(copy, &sk->sk_wmem_alloc);  		skb->len += copy;  		skb->data_len += copy;  		offset += copy; @@ -3029,7 +3065,7 @@ EXPORT_SYMBOL_GPL(skb_append_pagefrags);   *	that the checksum difference is zero (e.g., a valid IP header)   *	or you are setting ip_summed to CHECKSUM_NONE.   
*/ -unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) +void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)  {  	unsigned char *data = skb->data; @@ -3235,8 +3271,8 @@ normal:  		skb_copy_from_linear_data_offset(head_skb, offset,  						 skb_put(nskb, hsize), hsize); -		skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags & -			SKBTX_SHARED_FRAG; +		skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & +					      SKBTX_SHARED_FRAG;  		while (pos < offset + len) {  			if (i >= nfrags) { @@ -3482,24 +3518,18 @@ void __init skb_init(void)  						NULL);  } -/** - *	skb_to_sgvec - Fill a scatter-gather list from a socket buffer - *	@skb: Socket buffer containing the buffers to be mapped - *	@sg: The scatter-gather list to map into - *	@offset: The offset into the buffer's contents to start mapping - *	@len: Length of buffer space to be mapped - * - *	Fill the specified scatter-gather list with mappings/pointers into a - *	region of the buffer space attached to a socket buffer. - */  static int -__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, +	       unsigned int recursion_level)  {  	int start = skb_headlen(skb);  	int i, copy = start - offset;  	struct sk_buff *frag_iter;  	int elt = 0; +	if (unlikely(recursion_level >= 24)) +		return -EMSGSIZE; +  	if (copy > 0) {  		if (copy > len)  			copy = len; @@ -3518,6 +3548,8 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)  		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);  		if ((copy = end - offset) > 0) {  			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; +			if (unlikely(elt && sg_is_last(&sg[elt - 1]))) +				return -EMSGSIZE;  			if (copy > len)  				copy = len; @@ -3532,16 +3564,22 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)  	}  	skb_walk_frags(skb, frag_iter) { -		int end; +		int end, ret;  		WARN_ON(start > offset + len);  		end = start + frag_iter->len;  		if ((copy = end - offset) > 0) { +			if (unlikely(elt && sg_is_last(&sg[elt - 1]))) +				return -EMSGSIZE; +  			if (copy > len)  				copy = len; -			elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start, -					      copy); +			ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, +					      copy, recursion_level + 1); +			if (unlikely(ret < 0)) +				return ret; +			elt += ret;  			if ((len -= copy) == 0)  				return elt;  			offset += copy; @@ -3552,6 +3590,31 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)  	return elt;  } +/** + *	skb_to_sgvec - Fill a scatter-gather list from a socket buffer + *	@skb: Socket buffer containing the buffers to be mapped + *	@sg: The scatter-gather list to map into + *	@offset: The offset into the buffer's contents to start mapping + *	@len: Length of buffer space to be mapped + * + *	Fill the specified scatter-gather list with mappings/pointers into a + *	region of the buffer space attached to a socket buffer. Returns either + *	the number of scatterlist items used, or -EMSGSIZE if the contents + *	could not fit. 
+ */ +int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ +	int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); + +	if (nsg <= 0) +		return nsg; + +	sg_mark_end(&sg[nsg - 1]); + +	return nsg; +} +EXPORT_SYMBOL_GPL(skb_to_sgvec); +  /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given   * sglist without mark the sg which contain last skb data as the end.   * So the caller can mannipulate sg list as will when padding new data after @@ -3574,19 +3637,11 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)  int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,  			int offset, int len)  { -	return __skb_to_sgvec(skb, sg, offset, len); +	return __skb_to_sgvec(skb, sg, offset, len, 0);  }  EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); -int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) -{ -	int nsg = __skb_to_sgvec(skb, sg, offset, len); -	sg_mark_end(&sg[nsg - 1]); - -	return nsg; -} -EXPORT_SYMBOL_GPL(skb_to_sgvec);  /**   *	skb_cow_data - Check that a socket buffer's data buffers are writable @@ -3754,8 +3809,11 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)  	spin_lock_irqsave(&q->lock, flags);  	skb = __skb_dequeue(q); -	if (skb && (skb_next = skb_peek(q))) +	if (skb && (skb_next = skb_peek(q))) {  		icmp_next = is_icmp_err_skb(skb_next); +		if (icmp_next) +			sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin; +	}  	spin_unlock_irqrestore(&q->lock, flags);  	if (is_icmp_err_skb(skb) && !icmp_next) @@ -3786,7 +3844,7 @@ struct sk_buff *skb_clone_sk(struct sk_buff *skb)  	struct sock *sk = skb->sk;  	struct sk_buff *clone; -	if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt)) +	if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))  		return NULL;  	clone = skb_clone(skb, GFP_ATOMIC); @@ -3857,7 +3915,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,  	/* Take a reference to prevent skb_orphan() from freeing the socket,  	 * but only if the socket refcount is not zero.  	 */ -	if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { +	if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {  		*skb_hwtstamps(skb) = *hwtstamps;  		__skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);  		sock_put(sk); @@ -3875,6 +3933,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,  	if (!sk)  		return; +	if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && +	    skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) +		return; +  	tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;  	if (!skb_may_tx_timestamp(sk, tsonly))  		return; @@ -3896,7 +3958,8 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,  		return;  	if (tsonly) { -		skb_shinfo(skb)->tx_flags = skb_shinfo(orig_skb)->tx_flags; +		skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & +					     SKBTX_ANY_TSTAMP;  		skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;  	} @@ -3934,7 +3997,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)  	/* Take a reference to prevent skb_orphan() from freeing the socket,  	 * but only if the socket refcount is not zero.  	 
*/ -	if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { +	if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {  		err = sock_queue_err_skb(sk, skb);  		sock_put(sk);  	} diff --git a/net/core/sock.c b/net/core/sock.c index b5baeb9cb0fb..ac2a404c73eb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -102,6 +102,7 @@  #include <linux/proc_fs.h>  #include <linux/seq_file.h>  #include <linux/sched.h> +#include <linux/sched/mm.h>  #include <linux/timer.h>  #include <linux/string.h>  #include <linux/sockios.h> @@ -138,10 +139,7 @@  #include <trace/events/sock.h> -#ifdef CONFIG_INET  #include <net/tcp.h> -#endif -  #include <net/busy_poll.h>  static DEFINE_MUTEX(proto_list_mutex); @@ -372,14 +370,14 @@ EXPORT_SYMBOL_GPL(sk_clear_memalloc);  int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)  {  	int ret; -	unsigned long pflags = current->flags; +	unsigned int noreclaim_flag;  	/* these should have been dropped before queueing */  	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); -	current->flags |= PF_MEMALLOC; +	noreclaim_flag = memalloc_noreclaim_save();  	ret = sk->sk_backlog_rcv(sk, skb); -	current_restore_flags(pflags, PF_MEMALLOC); +	memalloc_noreclaim_restore(noreclaim_flag);  	return ret;  } @@ -1040,6 +1038,10 @@ set_rcvbuf:  #endif  	case SO_MAX_PACING_RATE: +		if (val != ~0U) +			cmpxchg(&sk->sk_pacing_status, +				SK_PACING_NONE, +				SK_PACING_NEEDED);  		sk->sk_max_pacing_rate = val;  		sk->sk_pacing_rate = min(sk->sk_pacing_rate,  					 sk->sk_max_pacing_rate); @@ -1076,6 +1078,18 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred,  	}  } +static int groups_to_user(gid_t __user *dst, const struct group_info *src) +{ +	struct user_namespace *user_ns = current_user_ns(); +	int i; + +	for (i = 0; i < src->ngroups; i++) +		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) +			return -EFAULT; + +	return 0; +} +  int sock_getsockopt(struct socket *sock, int level, int optname,  		    char __user *optval, int __user *optlen)  { @@ -1229,6 +1243,27 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  		goto lenout;  	} +	case SO_PEERGROUPS: +	{ +		int ret, n; + +		if (!sk->sk_peer_cred) +			return -ENODATA; + +		n = sk->sk_peer_cred->group_info->ngroups; +		if (len < n * sizeof(gid_t)) { +			len = n * sizeof(gid_t); +			return put_user(len, optlen) ? -EFAULT : -ERANGE; +		} +		len = n * sizeof(gid_t); + +		ret = groups_to_user((gid_t __user *)optval, +				     sk->sk_peer_cred->group_info); +		if (ret) +			return ret; +		goto lenout; +	} +  	case SO_PEERNAME:  	{  		char address[128]; @@ -1493,7 +1528,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,  		if (likely(sk->sk_net_refcnt))  			get_net(net);  		sock_net_set(sk, net); -		atomic_set(&sk->sk_wmem_alloc, 1); +		refcount_set(&sk->sk_wmem_alloc, 1);  		mem_cgroup_sk_alloc(sk);  		cgroup_sk_alloc(&sk->sk_cgrp_data); @@ -1517,7 +1552,7 @@ static void __sk_destruct(struct rcu_head *head)  		sk->sk_destruct(sk);  	filter = rcu_dereference_check(sk->sk_filter, -				       atomic_read(&sk->sk_wmem_alloc) == 0); +				       refcount_read(&sk->sk_wmem_alloc) == 0);  	if (filter) {  		sk_filter_uncharge(sk, filter);  		RCU_INIT_POINTER(sk->sk_filter, NULL); @@ -1567,7 +1602,7 @@ void sk_free(struct sock *sk)  	 * some packets are still in some tx queue.  	 
* If not null, sock_wfree() will call __sk_free(sk) later  	 */ -	if (atomic_dec_and_test(&sk->sk_wmem_alloc)) +	if (refcount_dec_and_test(&sk->sk_wmem_alloc))  		__sk_free(sk);  }  EXPORT_SYMBOL(sk_free); @@ -1624,7 +1659,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)  		/*  		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())  		 */ -		atomic_set(&newsk->sk_wmem_alloc, 1); +		refcount_set(&newsk->sk_wmem_alloc, 1);  		atomic_set(&newsk->sk_omem_alloc, 0);  		sk_init_common(newsk); @@ -1673,7 +1708,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)  		 * (Documentation/RCU/rculist_nulls.txt for details)  		 */  		smp_wmb(); -		atomic_set(&newsk->sk_refcnt, 2); +		refcount_set(&newsk->sk_refcnt, 2);  		/*  		 * Increment the counter in the same struct proto as the master @@ -1752,7 +1787,7 @@ void sock_wfree(struct sk_buff *skb)  		 * Keep a reference on sk_wmem_alloc, this will be released  		 * after sk_write_space() call  		 */ -		atomic_sub(len - 1, &sk->sk_wmem_alloc); +		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));  		sk->sk_write_space(sk);  		len = 1;  	} @@ -1760,7 +1795,7 @@ void sock_wfree(struct sk_buff *skb)  	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()  	 * could not do because of in-flight packets  	 */ -	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc)) +	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))  		__sk_free(sk);  }  EXPORT_SYMBOL(sock_wfree); @@ -1772,7 +1807,7 @@ void __sock_wfree(struct sk_buff *skb)  {  	struct sock *sk = skb->sk; -	if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) +	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))  		__sk_free(sk);  } @@ -1794,7 +1829,7 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)  	 * is enough to guarantee sk_free() wont free this sock until  	 * all in-flight packets are completed  	 */ -	atomic_add(skb->truesize, &sk->sk_wmem_alloc); +	refcount_add(skb->truesize, &sk->sk_wmem_alloc);  }  EXPORT_SYMBOL(skb_set_owner_w); @@ -1802,28 +1837,24 @@ EXPORT_SYMBOL(skb_set_owner_w);   * delay queue. We want to allow the owner socket to send more   * packets, as if they were already TX completed by a typical driver.   * But we also want to keep skb->sk set because some packet schedulers - * rely on it (sch_fq for example). So we set skb->truesize to a small - * amount (1) and decrease sk_wmem_alloc accordingly. + * rely on it (sch_fq for example).   */  void skb_orphan_partial(struct sk_buff *skb)  { -	/* If this skb is a TCP pure ACK or already went here, -	 * we have nothing to do. 2 is already a very small truesize. -	 */ -	if (skb->truesize <= 2) +	if (skb_is_tcp_pure_ack(skb))  		return; -	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, -	 * so we do not completely orphan skb, but transfert all -	 * accounted bytes but one, to avoid unexpected reorders. 
-	 */  	if (skb->destructor == sock_wfree  #ifdef CONFIG_INET  	    || skb->destructor == tcp_wfree  #endif  		) { -		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); -		skb->truesize = 1; +		struct sock *sk = skb->sk; + +		if (refcount_inc_not_zero(&sk->sk_refcnt)) { +			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)); +			skb->destructor = sock_efree; +		}  	} else {  		skb_orphan(skb);  	} @@ -1881,7 +1912,7 @@ EXPORT_SYMBOL(sock_i_ino);  struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,  			     gfp_t priority)  { -	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { +	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {  		struct sk_buff *skb = alloc_skb(size, priority);  		if (skb) {  			skb_set_owner_w(skb, sk); @@ -1956,7 +1987,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)  			break;  		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);  		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); -		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) +		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)  			break;  		if (sk->sk_shutdown & SEND_SHUTDOWN)  			break; @@ -2078,6 +2109,26 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,  }  EXPORT_SYMBOL(sock_cmsg_send); +static void sk_enter_memory_pressure(struct sock *sk) +{ +	if (!sk->sk_prot->enter_memory_pressure) +		return; + +	sk->sk_prot->enter_memory_pressure(sk); +} + +static void sk_leave_memory_pressure(struct sock *sk) +{ +	if (sk->sk_prot->leave_memory_pressure) { +		sk->sk_prot->leave_memory_pressure(sk); +	} else { +		unsigned long *memory_pressure = sk->sk_prot->memory_pressure; + +		if (memory_pressure && *memory_pressure) +			*memory_pressure = 0; +	} +} +  /* On 32bit arches, an skb frag is limited to 2^15 */  #define SKB_FRAG_PAGE_ORDER	get_order(32768) @@ -2259,7 +2310,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)  		if (sk->sk_type == SOCK_STREAM) {  			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])  				return 1; -		} else if (atomic_read(&sk->sk_wmem_alloc) < +		} else if (refcount_read(&sk->sk_wmem_alloc) <  			   prot->sysctl_wmem[0])  				return 1;  	} @@ -2526,7 +2577,7 @@ static void sock_def_write_space(struct sock *sk)  	/* Do not wake up a writer until he can make "significant"  	 * progress.  
--DaveM  	 */ -	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { +	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {  		wq = rcu_dereference(sk->sk_wq);  		if (skwq_has_sleeper(wq))  			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | @@ -2636,7 +2687,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)  	 * (Documentation/RCU/rculist_nulls.txt for details)  	 */  	smp_wmb(); -	atomic_set(&sk->sk_refcnt, 1); +	refcount_set(&sk->sk_refcnt, 1);  	atomic_set(&sk->sk_drops, 0);  }  EXPORT_SYMBOL(sock_init_data); @@ -2681,9 +2732,12 @@ EXPORT_SYMBOL(release_sock);   * @sk: socket   *   * This version should be used for very small section, where process wont block - * return false if fast path is taken + * return false if fast path is taken: + *   *   sk_lock.slock locked, owned = 0, BH disabled - * return true if slow path is taken + * + * return true if slow path is taken: + *   *   sk_lock.slock unlocked, owned = 1, BH enabled   */  bool lock_sock_fast(struct sock *sk) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index ea23254b2457..b7cd9aafe99e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -479,8 +479,6 @@ static __net_init int sysctl_core_net_init(struct net *net)  {  	struct ctl_table *tbl; -	net->core.sysctl_somaxconn = SOMAXCONN; -  	tbl = netns_core_table;  	if (!net_eq(net, &init_net)) {  		tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); |
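The lock_sock_fast() kerneldoc reworked above spells out the two return cases. As a reminder of the intended caller pattern, here is a minimal sketch (the sk_rcvlowat assignment is only a stand-in for whatever per-socket state a real caller would touch):

	#include <net/sock.h>

	static void example_update_sk(struct sock *sk, int val)
	{
		/* Fast path (returns false): sk_lock.slock held, owned == 0,
		 * BH disabled. Slow path (returns true): lock owned, BH
		 * enabled.
		 */
		bool slow = lock_sock_fast(sk);

		/* keep the critical section short and non-blocking */
		sk->sk_rcvlowat = val;

		unlock_sock_fast(sk, slow);
	}

When the slow path was taken, unlock_sock_fast() falls back to release_sock(), so the section between the two calls should stay small either way.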