Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile           |   2
-rw-r--r--  net/core/dev.c              | 158
-rw-r--r--  net/core/filter.c           | 212
-rw-r--r--  net/core/flow_dissector.c   |  40
-rw-r--r--  net/core/gro.c              | 114
-rw-r--r--  net/core/gso.c              | 273
-rw-r--r--  net/core/net_namespace.c    |   4
-rw-r--r--  net/core/netdev-genl-gen.c  |   2
-rw-r--r--  net/core/netdev-genl-gen.h  |   2
-rw-r--r--  net/core/netpoll.c          |   5
-rw-r--r--  net/core/pktgen.c           |  13
-rw-r--r--  net/core/rtnetlink.c        | 187
-rw-r--r--  net/core/skbuff.c           | 308
-rw-r--r--  net/core/sock.c             | 160
-rw-r--r--  net/core/sock_map.c         |   4
15 files changed, 950 insertions(+), 534 deletions(-)
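Before the raw diff, a quick illustration of the headline API change in net/core/dev.c below: dev_get_by_name() and dev_get_by_index() are marked deprecated for new callers in favour of netdev_get_by_name() and netdev_get_by_index(), which record the acquired reference in a netdevice_tracker and pair with netdev_put() (the netpoll.c hunk further down is converted the same way). A minimal sketch of a migrated caller follows; the foo_* names and context structure are hypothetical, only the netdev_* calls come from this series.

```c
#include <linux/netdevice.h>

/* Hypothetical consumer migrating to the tracker-aware lookup helpers. */
struct foo_ctx {
	struct net_device *dev;
	netdevice_tracker dev_tracker;	/* ties the hold to the later put */
};

static int foo_attach(struct foo_ctx *ctx, struct net *net, int ifindex)
{
	/* Takes a reference and records it against ctx->dev_tracker. */
	ctx->dev = netdev_get_by_index(net, ifindex, &ctx->dev_tracker,
				       GFP_KERNEL);
	if (!ctx->dev)
		return -ENODEV;
	return 0;
}

static void foo_detach(struct foo_ctx *ctx)
{
	/* Releases the tracker entry together with the device reference. */
	netdev_put(ctx->dev, &ctx->dev_tracker);
}
```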
diff --git a/net/core/Makefile b/net/core/Makefile index 8f367813bc68..731db2eaa610 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -13,7 +13,7 @@ obj-y		     += dev.o dev_addr_lists.o dst.o netevent.o \  			neighbour.o rtnetlink.o utils.o link_watch.o filter.o \  			sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \  			fib_notifier.o xdp.o flow_offload.o gro.o \ -			netdev-genl.o netdev-genl-gen.o +			netdev-genl.o netdev-genl-gen.o gso.o  obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o diff --git a/net/core/dev.c b/net/core/dev.c index c29f3e1db3ca..69a3e544676c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -758,29 +758,43 @@ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)  }  EXPORT_SYMBOL(dev_get_by_name_rcu); +/* Deprecated for new users, call netdev_get_by_name() instead */ +struct net_device *dev_get_by_name(struct net *net, const char *name) +{ +	struct net_device *dev; + +	rcu_read_lock(); +	dev = dev_get_by_name_rcu(net, name); +	dev_hold(dev); +	rcu_read_unlock(); +	return dev; +} +EXPORT_SYMBOL(dev_get_by_name); +  /** - *	dev_get_by_name		- find a device by its name + *	netdev_get_by_name() - find a device by its name   *	@net: the applicable net namespace   *	@name: name to find + *	@tracker: tracking object for the acquired reference + *	@gfp: allocation flags for the tracker   *   *	Find an interface by name. This can be called from any   *	context and does its own locking. The returned handle has - *	the usage count incremented and the caller must use dev_put() to + *	the usage count incremented and the caller must use netdev_put() to   *	release it when it is no longer needed. %NULL is returned if no   *	matching device is found.   */ - -struct net_device *dev_get_by_name(struct net *net, const char *name) +struct net_device *netdev_get_by_name(struct net *net, const char *name, +				      netdevice_tracker *tracker, gfp_t gfp)  {  	struct net_device *dev; -	rcu_read_lock(); -	dev = dev_get_by_name_rcu(net, name); -	dev_hold(dev); -	rcu_read_unlock(); +	dev = dev_get_by_name(net, name); +	if (dev) +		netdev_tracker_alloc(dev, tracker, gfp);  	return dev;  } -EXPORT_SYMBOL(dev_get_by_name); +EXPORT_SYMBOL(netdev_get_by_name);  /**   *	__dev_get_by_index - find a device by its ifindex @@ -831,29 +845,42 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)  }  EXPORT_SYMBOL(dev_get_by_index_rcu); +/* Deprecated for new users, call netdev_get_by_index() instead */ +struct net_device *dev_get_by_index(struct net *net, int ifindex) +{ +	struct net_device *dev; + +	rcu_read_lock(); +	dev = dev_get_by_index_rcu(net, ifindex); +	dev_hold(dev); +	rcu_read_unlock(); +	return dev; +} +EXPORT_SYMBOL(dev_get_by_index);  /** - *	dev_get_by_index - find a device by its ifindex + *	netdev_get_by_index() - find a device by its ifindex   *	@net: the applicable net namespace   *	@ifindex: index of device + *	@tracker: tracking object for the acquired reference + *	@gfp: allocation flags for the tracker   *   *	Search for an interface by index. Returns NULL if the device   *	is not found or a pointer to the device. The device returned has   *	had a reference added and the pointer is safe until the user calls - *	dev_put to indicate they have finished with it. + *	netdev_put() to indicate they have finished with it.   
*/ - -struct net_device *dev_get_by_index(struct net *net, int ifindex) +struct net_device *netdev_get_by_index(struct net *net, int ifindex, +				       netdevice_tracker *tracker, gfp_t gfp)  {  	struct net_device *dev; -	rcu_read_lock(); -	dev = dev_get_by_index_rcu(net, ifindex); -	dev_hold(dev); -	rcu_read_unlock(); +	dev = dev_get_by_index(net, ifindex); +	if (dev) +		netdev_tracker_alloc(dev, tracker, gfp);  	return dev;  } -EXPORT_SYMBOL(dev_get_by_index); +EXPORT_SYMBOL(netdev_get_by_index);  /**   *	dev_get_by_napi_id - find a device by napi_id @@ -3209,7 +3236,7 @@ static u16 skb_tx_hash(const struct net_device *dev,  	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;  } -static void skb_warn_bad_offload(const struct sk_buff *skb) +void skb_warn_bad_offload(const struct sk_buff *skb)  {  	static const netdev_features_t null_features;  	struct net_device *dev = skb->dev; @@ -3338,74 +3365,6 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth)  	return vlan_get_protocol_and_depth(skb, type, depth);  } -/* openvswitch calls this on rx path, so we need a different check. - */ -static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) -{ -	if (tx_path) -		return skb->ip_summed != CHECKSUM_PARTIAL && -		       skb->ip_summed != CHECKSUM_UNNECESSARY; - -	return skb->ip_summed == CHECKSUM_NONE; -} - -/** - *	__skb_gso_segment - Perform segmentation on skb. - *	@skb: buffer to segment - *	@features: features for the output path (see dev->features) - *	@tx_path: whether it is called in TX path - * - *	This function segments the given skb and returns a list of segments. - * - *	It may return NULL if the skb requires no segmentation.  This is - *	only possible when GSO is used for verifying header integrity. - * - *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. - */ -struct sk_buff *__skb_gso_segment(struct sk_buff *skb, -				  netdev_features_t features, bool tx_path) -{ -	struct sk_buff *segs; - -	if (unlikely(skb_needs_check(skb, tx_path))) { -		int err; - -		/* We're going to init ->check field in TCP or UDP header */ -		err = skb_cow_head(skb, 0); -		if (err < 0) -			return ERR_PTR(err); -	} - -	/* Only report GSO partial support if it will enable us to -	 * support segmentation on this frame without needing additional -	 * work. -	 */ -	if (features & NETIF_F_GSO_PARTIAL) { -		netdev_features_t partial_features = NETIF_F_GSO_ROBUST; -		struct net_device *dev = skb->dev; - -		partial_features |= dev->features & dev->gso_partial_features; -		if (!skb_gso_ok(skb, features | partial_features)) -			features &= ~NETIF_F_GSO_PARTIAL; -	} - -	BUILD_BUG_ON(SKB_GSO_CB_OFFSET + -		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); - -	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); -	SKB_GSO_CB(skb)->encap_level = 0; - -	skb_reset_mac_header(skb); -	skb_reset_mac_len(skb); - -	segs = skb_mac_gso_segment(skb, features); - -	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) -		skb_warn_bad_offload(skb); - -	return segs; -} -EXPORT_SYMBOL(__skb_gso_segment);  /* Take action when hardware reception checksum errors are detected. 
*/  #ifdef CONFIG_BUG @@ -6199,7 +6158,8 @@ restart:  	if (!napi)  		goto out; -	preempt_disable(); +	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) +		preempt_disable();  	for (;;) {  		int work = 0; @@ -6241,7 +6201,8 @@ count:  		if (unlikely(need_resched())) {  			if (napi_poll)  				busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); -			preempt_enable(); +			if (!IS_ENABLED(CONFIG_PREEMPT_RT)) +				preempt_enable();  			rcu_read_unlock();  			cond_resched();  			if (loop_end(loop_end_arg, start_time)) @@ -6252,7 +6213,8 @@ count:  	}  	if (napi_poll)  		busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); -	preempt_enable(); +	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) +		preempt_enable();  out:  	rcu_read_unlock();  } @@ -8822,9 +8784,11 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,  	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);  	if (err)  		return err; -	err = ops->ndo_set_mac_address(dev, sa); -	if (err) -		return err; +	if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) { +		err = ops->ndo_set_mac_address(dev, sa); +		if (err) +			return err; +	}  	dev->addr_assign_type = NET_ADDR_SET;  	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);  	add_device_randomness(dev->dev_addr, dev->addr_len); @@ -10570,8 +10534,10 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)  {  	WARN_ON(dev->reg_state == NETREG_REGISTERED); -	dev->gro_flush_timeout = 20000; -	dev->napi_defer_hard_irqs = 1; +	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { +		dev->gro_flush_timeout = 20000; +		dev->napi_defer_hard_irqs = 1; +	}  }  EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); @@ -10632,7 +10598,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	dev = PTR_ALIGN(p, NETDEV_ALIGN);  	dev->padded = (char *)dev - (char *)p; -	ref_tracker_dir_init(&dev->refcnt_tracker, 128); +	ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);  #ifdef CONFIG_PCPU_DEV_REFCNT  	dev->pcpu_refcnt = alloc_percpu(int);  	if (!dev->pcpu_refcnt) diff --git a/net/core/filter.c b/net/core/filter.c index d9ce04ca22ce..06ba0e56e369 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3948,20 +3948,21 @@ void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,  void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)  { -	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);  	u32 size = xdp->data_end - xdp->data; +	struct skb_shared_info *sinfo;  	void *addr = xdp->data;  	int i;  	if (unlikely(offset > 0xffff || len > 0xffff))  		return ERR_PTR(-EFAULT); -	if (offset + len > xdp_get_buff_len(xdp)) +	if (unlikely(offset + len > xdp_get_buff_len(xdp)))  		return ERR_PTR(-EINVAL); -	if (offset < size) /* linear area */ +	if (likely(offset < size)) /* linear area */  		goto out; +	sinfo = xdp_get_shared_info_from_buff(xdp);  	offset -= size;  	for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */  		u32 frag_size = skb_frag_size(&sinfo->frags[i]); @@ -5803,6 +5804,12 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,  		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;  		struct fib_table *tb; +		if (flags & BPF_FIB_LOOKUP_TBID) { +			tbid = params->tbid; +			/* zero out for vlan output */ +			params->tbid = 0; +		} +  		tb = fib_get_table(net, tbid);  		if (unlikely(!tb))  			return BPF_FIB_LKUP_RET_NOT_FWDED; @@ -5936,6 +5943,12 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,  		u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN;  		struct fib6_table *tb; +		if (flags & BPF_FIB_LOOKUP_TBID) { +			tbid = params->tbid; +			/* zero out for vlan output */ +			params->tbid = 0; +		} +  		tb = ipv6_stub->fib6_get_table(net, tbid);  		if (unlikely(!tb))  			return BPF_FIB_LKUP_RET_NOT_FWDED; @@ -6008,7 +6021,7 @@ set_fwd_params:  #endif  #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ -			     BPF_FIB_LOOKUP_SKIP_NEIGH) +			     BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID)  BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,  	   struct bpf_fib_lookup *, params, int, plen, u32, flags) @@ -6555,12 +6568,11 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,  static struct sock *  __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,  		 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, -		 u64 flags) +		 u64 flags, int sdif)  {  	struct sock *sk = NULL;  	struct net *net;  	u8 family; -	int sdif;  	if (len == sizeof(tuple->ipv4))  		family = AF_INET; @@ -6572,10 +6584,12 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,  	if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX)))  		goto out; -	if (family == AF_INET) -		sdif = inet_sdif(skb); -	else -		sdif = inet6_sdif(skb); +	if (sdif < 0) { +		if (family == AF_INET) +			sdif = inet_sdif(skb); +		else +			sdif = inet6_sdif(skb); +	}  	if ((s32)netns_id < 0) {  		net = caller_net; @@ -6595,10 +6609,11 @@ out:  static struct sock *  __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,  		struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, -		u64 flags) +		u64 flags, int sdif)  {  	struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, -					   ifindex, proto, netns_id, flags); +					   ifindex, proto, netns_id, flags, +					   sdif);  	if (sk) {  		struct sock *sk2 = sk_to_full_sk(sk); @@ -6638,7 +6653,7 @@ bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,  	}  	return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, -				netns_id, flags); +				netns_id, flags, -1);  }  static struct sock * @@ -6727,6 +6742,78 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {  	.arg5_type	= ARG_ANYTHING,  }; +BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, +	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ +	struct net_device *dev = skb->dev; +	int ifindex = dev->ifindex, sdif = dev_sdif(dev); +	struct net *caller_net = dev_net(dev); + +	return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net, +					       ifindex, IPPROTO_TCP, netns_id, +					       flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { +	.func		= bpf_tc_skc_lookup_tcp, +	.gpl_only	= false, +	.pkt_access	= true, +	.ret_type	= RET_PTR_TO_SOCK_COMMON_OR_NULL, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY, +	.arg3_type	= ARG_CONST_SIZE, +	.arg4_type	= ARG_ANYTHING, +	.arg5_type	= ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, +	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ +	struct net_device *dev = skb->dev; +	int ifindex = dev->ifindex, sdif = dev_sdif(dev); +	struct net *caller_net = dev_net(dev); + +	return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, +					      ifindex, IPPROTO_TCP, netns_id, +					      flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { +	.func	
	= bpf_tc_sk_lookup_tcp, +	.gpl_only	= false, +	.pkt_access	= true, +	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY, +	.arg3_type	= ARG_CONST_SIZE, +	.arg4_type	= ARG_ANYTHING, +	.arg5_type	= ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, +	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ +	struct net_device *dev = skb->dev; +	int ifindex = dev->ifindex, sdif = dev_sdif(dev); +	struct net *caller_net = dev_net(dev); + +	return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, +					      ifindex, IPPROTO_UDP, netns_id, +					      flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { +	.func		= bpf_tc_sk_lookup_udp, +	.gpl_only	= false, +	.pkt_access	= true, +	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY, +	.arg3_type	= ARG_CONST_SIZE, +	.arg4_type	= ARG_ANYTHING, +	.arg5_type	= ARG_ANYTHING, +}; +  BPF_CALL_1(bpf_sk_release, struct sock *, sk)  {  	if (sk && sk_is_refcounted(sk)) @@ -6744,12 +6831,13 @@ static const struct bpf_func_proto bpf_sk_release_proto = {  BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,  	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)  { -	struct net *caller_net = dev_net(ctx->rxq->dev); -	int ifindex = ctx->rxq->dev->ifindex; +	struct net_device *dev = ctx->rxq->dev; +	int ifindex = dev->ifindex, sdif = dev_sdif(dev); +	struct net *caller_net = dev_net(dev);  	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,  					      ifindex, IPPROTO_UDP, netns_id, -					      flags); +					      flags, sdif);  }  static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { @@ -6767,12 +6855,13 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {  BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,  	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)  { -	struct net *caller_net = dev_net(ctx->rxq->dev); -	int ifindex = ctx->rxq->dev->ifindex; +	struct net_device *dev = ctx->rxq->dev; +	int ifindex = dev->ifindex, sdif = dev_sdif(dev); +	struct net *caller_net = dev_net(dev);  	return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net,  					       ifindex, IPPROTO_TCP, netns_id, -					       flags); +					       flags, sdif);  }  static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { @@ -6790,12 +6879,13 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {  BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,  	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)  { -	struct net *caller_net = dev_net(ctx->rxq->dev); -	int ifindex = ctx->rxq->dev->ifindex; +	struct net_device *dev = ctx->rxq->dev; +	int ifindex = dev->ifindex, sdif = dev_sdif(dev); +	struct net *caller_net = dev_net(dev);  	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,  					      ifindex, IPPROTO_TCP, netns_id, -					      flags); +					      flags, sdif);  }  static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { @@ -6815,7 +6905,8 @@ BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,  {  	return (unsigned long)__bpf_skc_lookup(NULL, tuple, len,  					       sock_net(ctx->sk), 0, -					       IPPROTO_TCP, netns_id, flags); +					       IPPROTO_TCP, netns_id, flags, +					       -1);  }  static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = 
{ @@ -6834,7 +6925,7 @@ BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,  {  	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,  					      sock_net(ctx->sk), 0, IPPROTO_TCP, -					      netns_id, flags); +					      netns_id, flags, -1);  }  static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { @@ -6853,7 +6944,7 @@ BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,  {  	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,  					      sock_net(ctx->sk), 0, IPPROTO_UDP, -					      netns_id, flags); +					      netns_id, flags, -1);  }  static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { @@ -6916,6 +7007,8 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,  					FIELD));			\  	} while (0) +	BTF_TYPE_EMIT(struct bpf_tcp_sock); +  	switch (si->off) {  	case offsetof(struct bpf_tcp_sock, rtt_min):  		BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != @@ -7980,9 +8073,9 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  #endif  #ifdef CONFIG_INET  	case BPF_FUNC_sk_lookup_tcp: -		return &bpf_sk_lookup_tcp_proto; +		return &bpf_tc_sk_lookup_tcp_proto;  	case BPF_FUNC_sk_lookup_udp: -		return &bpf_sk_lookup_udp_proto; +		return &bpf_tc_sk_lookup_udp_proto;  	case BPF_FUNC_sk_release:  		return &bpf_sk_release_proto;  	case BPF_FUNC_tcp_sock: @@ -7990,7 +8083,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  	case BPF_FUNC_get_listener_sock:  		return &bpf_get_listener_sock_proto;  	case BPF_FUNC_skc_lookup_tcp: -		return &bpf_skc_lookup_tcp_proto; +		return &bpf_tc_skc_lookup_tcp_proto;  	case BPF_FUNC_tcp_check_syncookie:  		return &bpf_tcp_check_syncookie_proto;  	case BPF_FUNC_skb_ecn_set_ce: @@ -11721,3 +11814,66 @@ static int __init bpf_kfunc_init(void)  	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);  }  late_initcall(bpf_kfunc_init); + +/* Disables missing prototype warnings */ +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", +		  "Global functions as their definitions will be in vmlinux BTF"); + +/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. + * + * The function expects a non-NULL pointer to a socket, and invokes the + * protocol specific socket destroy handlers. + * + * The helper can only be called from BPF contexts that have acquired the socket + * locks. + * + * Parameters: + * @sock: Pointer to socket to be destroyed + * + * Return: + * On error, may return EPROTONOSUPPORT, EINVAL. + * EPROTONOSUPPORT if protocol specific destroy handler is not supported. + * 0 otherwise + */ +__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) +{ +	struct sock *sk = (struct sock *)sock; + +	/* The locking semantics that allow for synchronous execution of the +	 * destroy handlers are only supported for TCP and UDP. +	 * Supporting protocols will need to acquire sock lock in the BPF context +	 * prior to invoking this kfunc. 
+	 */ +	if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP && +					   sk->sk_protocol != IPPROTO_UDP)) +		return -EOPNOTSUPP; + +	return sk->sk_prot->diag_destroy(sk, ECONNABORTED); +} + +__diag_pop() + +BTF_SET8_START(bpf_sk_iter_kfunc_ids) +BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) +BTF_SET8_END(bpf_sk_iter_kfunc_ids) + +static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ +	if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) && +	    prog->expected_attach_type != BPF_TRACE_ITER) +		return -EACCES; +	return 0; +} + +static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = { +	.owner = THIS_MODULE, +	.set   = &bpf_sk_iter_kfunc_ids, +	.filter = tracing_iter_filter, +}; + +static int init_subsystem(void) +{ +	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set); +} +late_initcall(init_subsystem); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 25fb0bbc310f..85a2d0d9bd39 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -27,6 +27,7 @@  #include <linux/tcp.h>  #include <linux/ptp_classify.h>  #include <net/flow_dissector.h> +#include <net/pkt_cls.h>  #include <scsi/fc/fc_fcoe.h>  #include <uapi/linux/batadv_packet.h>  #include <linux/bpf.h> @@ -241,6 +242,15 @@ void skb_flow_dissect_meta(const struct sk_buff *skb,  					 FLOW_DISSECTOR_KEY_META,  					 target_container);  	meta->ingress_ifindex = skb->skb_iif; +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) +	if (tc_skb_ext_tc_enabled()) { +		struct tc_skb_ext *ext; + +		ext = skb_ext_find(skb, TC_SKB_EXT); +		if (ext) +			meta->l2_miss = ext->l2_miss; +	} +#endif  }  EXPORT_SYMBOL(skb_flow_dissect_meta); @@ -548,6 +558,30 @@ __skb_flow_dissect_arp(const struct sk_buff *skb,  }  static enum flow_dissect_ret +__skb_flow_dissect_cfm(const struct sk_buff *skb, +		       struct flow_dissector *flow_dissector, +		       void *target_container, const void *data, +		       int nhoff, int hlen) +{ +	struct flow_dissector_key_cfm *key, *hdr, _hdr; + +	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM)) +		return FLOW_DISSECT_RET_OUT_GOOD; + +	hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr); +	if (!hdr) +		return FLOW_DISSECT_RET_OUT_BAD; + +	key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM, +					target_container); + +	key->mdl_ver = hdr->mdl_ver; +	key->opcode = hdr->opcode; + +	return FLOW_DISSECT_RET_OUT_GOOD; +} + +static enum flow_dissect_ret  __skb_flow_dissect_gre(const struct sk_buff *skb,  		       struct flow_dissector_key_control *key_control,  		       struct flow_dissector *flow_dissector, @@ -1390,6 +1424,12 @@ proto_again:  		break;  	} +	case htons(ETH_P_CFM): +		fdret = __skb_flow_dissect_cfm(skb, flow_dissector, +					       target_container, data, +					       nhoff, hlen); +		break; +  	default:  		fdret = FLOW_DISSECT_RET_OUT_BAD;  		break; diff --git a/net/core/gro.c b/net/core/gro.c index 2d84165cb4f1..0759277dc14e 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -10,7 +10,7 @@  #define GRO_MAX_HEAD (MAX_HEADER + 128)  static DEFINE_SPINLOCK(offload_lock); -static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); +struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base);  /* Maximum number of GRO_NORMAL skbs to batch up for list-RX */  int gro_normal_batch __read_mostly = 8; @@ -92,63 +92,6 @@ void dev_remove_offload(struct packet_offload *po)  }  EXPORT_SYMBOL(dev_remove_offload); -/** - *	
skb_eth_gso_segment - segmentation handler for ethernet protocols. - *	@skb: buffer to segment - *	@features: features for the output path (see dev->features) - *	@type: Ethernet Protocol ID - */ -struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, -				    netdev_features_t features, __be16 type) -{ -	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); -	struct packet_offload *ptype; - -	rcu_read_lock(); -	list_for_each_entry_rcu(ptype, &offload_base, list) { -		if (ptype->type == type && ptype->callbacks.gso_segment) { -			segs = ptype->callbacks.gso_segment(skb, features); -			break; -		} -	} -	rcu_read_unlock(); - -	return segs; -} -EXPORT_SYMBOL(skb_eth_gso_segment); - -/** - *	skb_mac_gso_segment - mac layer segmentation handler. - *	@skb: buffer to segment - *	@features: features for the output path (see dev->features) - */ -struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, -				    netdev_features_t features) -{ -	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); -	struct packet_offload *ptype; -	int vlan_depth = skb->mac_len; -	__be16 type = skb_network_protocol(skb, &vlan_depth); - -	if (unlikely(!type)) -		return ERR_PTR(-EINVAL); - -	__skb_pull(skb, vlan_depth); - -	rcu_read_lock(); -	list_for_each_entry_rcu(ptype, &offload_base, list) { -		if (ptype->type == type && ptype->callbacks.gso_segment) { -			segs = ptype->callbacks.gso_segment(skb, features); -			break; -		} -	} -	rcu_read_unlock(); - -	__skb_push(skb, skb->data - skb_mac_header(skb)); - -	return segs; -} -EXPORT_SYMBOL(skb_mac_gso_segment);  int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)  { @@ -239,9 +182,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)  		pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; -		__skb_frag_set_page(frag, page); -		skb_frag_off_set(frag, first_offset); -		skb_frag_size_set(frag, first_size); +		skb_frag_fill_page_desc(frag, page, first_offset, first_size);  		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);  		/* We dont need to clear skbinfo->nr_frags here */ @@ -363,6 +304,24 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)  }  EXPORT_SYMBOL(napi_gro_flush); +static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb, +					     const struct sk_buff *p, +					     unsigned long diffs) +{ +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) +	struct tc_skb_ext *skb_ext; +	struct tc_skb_ext *p_ext; + +	skb_ext = skb_ext_find(skb, TC_SKB_EXT); +	p_ext = skb_ext_find(p, TC_SKB_EXT); + +	diffs |= (!!p_ext) ^ (!!skb_ext); +	if (!diffs && unlikely(skb_ext)) +		diffs |= p_ext->chain ^ skb_ext->chain; +#endif +	return diffs; +} +  static void gro_list_prepare(const struct list_head *head,  			     const struct sk_buff *skb)  { @@ -397,23 +356,11 @@ static void gro_list_prepare(const struct list_head *head,  		 * avoid trying too hard to skip each of them individually  		 */  		if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) -			struct tc_skb_ext *skb_ext; -			struct tc_skb_ext *p_ext; -#endif -  			diffs |= p->sk != skb->sk;  			diffs |= skb_metadata_dst_cmp(p, skb);  			diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) -			skb_ext = skb_ext_find(skb, TC_SKB_EXT); -			p_ext = skb_ext_find(p, TC_SKB_EXT); - -			diffs |= (!!p_ext) ^ (!!skb_ext); -			if (!diffs && unlikely(skb_ext)) -				diffs |= p_ext->chain ^ skb_ext->chain; -#endif +			diffs |= 
gro_list_prepare_tc_ext(skb, p, diffs);  		}  		NAPI_GRO_CB(p)->same_flow = !diffs; @@ -460,6 +407,14 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)  	}  } +static void gro_try_pull_from_frag0(struct sk_buff *skb) +{ +	int grow = skb_gro_offset(skb) - skb_headlen(skb); + +	if (grow > 0) +		gro_pull_from_frag0(skb, grow); +} +  static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)  {  	struct sk_buff *oldest; @@ -489,7 +444,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff  	struct sk_buff *pp = NULL;  	enum gro_result ret;  	int same_flow; -	int grow;  	if (netif_elide_gro(skb->dev))  		goto normal; @@ -564,17 +518,14 @@ found_ptype:  	else  		gro_list->count++; +	/* Must be called before setting NAPI_GRO_CB(skb)->{age|last} */ +	gro_try_pull_from_frag0(skb);  	NAPI_GRO_CB(skb)->age = jiffies;  	NAPI_GRO_CB(skb)->last = skb;  	if (!skb_is_gso(skb))  		skb_shinfo(skb)->gso_size = skb_gro_len(skb);  	list_add(&skb->list, &gro_list->list);  	ret = GRO_HELD; - -pull: -	grow = skb_gro_offset(skb) - skb_headlen(skb); -	if (grow > 0) -		gro_pull_from_frag0(skb, grow);  ok:  	if (gro_list->count) {  		if (!test_bit(bucket, &napi->gro_bitmask)) @@ -587,7 +538,8 @@ ok:  normal:  	ret = GRO_NORMAL; -	goto pull; +	gro_try_pull_from_frag0(skb); +	goto ok;  }  struct packet_offload *gro_find_receive_by_type(__be16 type) diff --git a/net/core/gso.c b/net/core/gso.c new file mode 100644 index 000000000000..9e1803bfc9c6 --- /dev/null +++ b/net/core/gso.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/skbuff.h> +#include <linux/sctp.h> +#include <net/gso.h> +#include <net/gro.h> + +/** + *	skb_eth_gso_segment - segmentation handler for ethernet protocols. + *	@skb: buffer to segment + *	@features: features for the output path (see dev->features) + *	@type: Ethernet Protocol ID + */ +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, +				    netdev_features_t features, __be16 type) +{ +	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); +	struct packet_offload *ptype; + +	rcu_read_lock(); +	list_for_each_entry_rcu(ptype, &offload_base, list) { +		if (ptype->type == type && ptype->callbacks.gso_segment) { +			segs = ptype->callbacks.gso_segment(skb, features); +			break; +		} +	} +	rcu_read_unlock(); + +	return segs; +} +EXPORT_SYMBOL(skb_eth_gso_segment); + +/** + *	skb_mac_gso_segment - mac layer segmentation handler. + *	@skb: buffer to segment + *	@features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, +				    netdev_features_t features) +{ +	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); +	struct packet_offload *ptype; +	int vlan_depth = skb->mac_len; +	__be16 type = skb_network_protocol(skb, &vlan_depth); + +	if (unlikely(!type)) +		return ERR_PTR(-EINVAL); + +	__skb_pull(skb, vlan_depth); + +	rcu_read_lock(); +	list_for_each_entry_rcu(ptype, &offload_base, list) { +		if (ptype->type == type && ptype->callbacks.gso_segment) { +			segs = ptype->callbacks.gso_segment(skb, features); +			break; +		} +	} +	rcu_read_unlock(); + +	__skb_push(skb, skb->data - skb_mac_header(skb)); + +	return segs; +} +EXPORT_SYMBOL(skb_mac_gso_segment); +/* openvswitch calls this on rx path, so we need a different check. 
+ */ +static bool skb_needs_check(const struct sk_buff *skb, bool tx_path) +{ +	if (tx_path) +		return skb->ip_summed != CHECKSUM_PARTIAL && +		       skb->ip_summed != CHECKSUM_UNNECESSARY; + +	return skb->ip_summed == CHECKSUM_NONE; +} + +/** + *	__skb_gso_segment - Perform segmentation on skb. + *	@skb: buffer to segment + *	@features: features for the output path (see dev->features) + *	@tx_path: whether it is called in TX path + * + *	This function segments the given skb and returns a list of segments. + * + *	It may return NULL if the skb requires no segmentation.  This is + *	only possible when GSO is used for verifying header integrity. + * + *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. + */ +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, +				  netdev_features_t features, bool tx_path) +{ +	struct sk_buff *segs; + +	if (unlikely(skb_needs_check(skb, tx_path))) { +		int err; + +		/* We're going to init ->check field in TCP or UDP header */ +		err = skb_cow_head(skb, 0); +		if (err < 0) +			return ERR_PTR(err); +	} + +	/* Only report GSO partial support if it will enable us to +	 * support segmentation on this frame without needing additional +	 * work. +	 */ +	if (features & NETIF_F_GSO_PARTIAL) { +		netdev_features_t partial_features = NETIF_F_GSO_ROBUST; +		struct net_device *dev = skb->dev; + +		partial_features |= dev->features & dev->gso_partial_features; +		if (!skb_gso_ok(skb, features | partial_features)) +			features &= ~NETIF_F_GSO_PARTIAL; +	} + +	BUILD_BUG_ON(SKB_GSO_CB_OFFSET + +		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + +	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); +	SKB_GSO_CB(skb)->encap_level = 0; + +	skb_reset_mac_header(skb); +	skb_reset_mac_len(skb); + +	segs = skb_mac_gso_segment(skb, features); + +	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) +		skb_warn_bad_offload(skb); + +	return segs; +} +EXPORT_SYMBOL(__skb_gso_segment); + +/** + * skb_gso_transport_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_transport_seglen is used to determine the real size of the + * individual segments, including Layer4 headers (TCP/UDP). + * + * The MAC/L2 or network (IP, IPv6) headers are not accounted for. + */ +static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +{ +	const struct skb_shared_info *shinfo = skb_shinfo(skb); +	unsigned int thlen = 0; + +	if (skb->encapsulation) { +		thlen = skb_inner_transport_header(skb) - +			skb_transport_header(skb); + +		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) +			thlen += inner_tcp_hdrlen(skb); +	} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { +		thlen = tcp_hdrlen(skb); +	} else if (unlikely(skb_is_gso_sctp(skb))) { +		thlen = sizeof(struct sctphdr); +	} else if (shinfo->gso_type & SKB_GSO_UDP_L4) { +		thlen = sizeof(struct udphdr); +	} +	/* UFO sets gso_size to the size of the fragmentation +	 * payload, i.e. the size of the L4 (UDP) header is already +	 * accounted for. +	 */ +	return thlen + shinfo->gso_size; +} + +/** + * skb_gso_network_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_network_seglen is used to determine the real size of the + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). + * + * The MAC/L2 header is not accounted for. 
+ */ +static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) +{ +	unsigned int hdr_len = skb_transport_header(skb) - +			       skb_network_header(skb); + +	return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_mac_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_mac_seglen is used to determine the real size of the + * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 + * headers (TCP/UDP). + */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ +	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + +	return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS + * + * There are a couple of instances where we have a GSO skb, and we + * want to determine what size it would be after it is segmented. + * + * We might want to check: + * -    L3+L4+payload size (e.g. IP forwarding) + * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) + * + * This is a helper to do that correctly considering GSO_BY_FRAGS. + * + * @skb: GSO skb + * + * @seg_len: The segmented length (from skb_gso_*_seglen). In the + *           GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. + * + * @max_len: The maximum permissible length. + * + * Returns true if the segmented length <= max length. + */ +static inline bool skb_gso_size_check(const struct sk_buff *skb, +				      unsigned int seg_len, +				      unsigned int max_len) { +	const struct skb_shared_info *shinfo = skb_shinfo(skb); +	const struct sk_buff *iter; + +	if (shinfo->gso_size != GSO_BY_FRAGS) +		return seg_len <= max_len; + +	/* Undo this so we can re-use header sizes */ +	seg_len -= GSO_BY_FRAGS; + +	skb_walk_frags(skb, iter) { +		if (seg_len + skb_headlen(iter) > max_len) +			return false; +	} + +	return true; +} + +/** + * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? + * + * @skb: GSO skb + * @mtu: MTU to validate against + * + * skb_gso_validate_network_len validates if a given skb will fit a + * wanted MTU once split. It considers L3 headers, L4 headers, and the + * payload. + */ +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) +{ +	return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); + +/** + * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? + * + * @skb: GSO skb + * @len: length to validate against + * + * skb_gso_validate_mac_len validates if a given skb will fit a wanted + * length once split, including L2, L3 and L4 headers and the payload. + */ +bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) +{ +	return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); + diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 3e3598cd49f2..f4183c4c1ec8 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -308,7 +308,7 @@ EXPORT_SYMBOL_GPL(get_net_ns_by_id);  /* init code that must occur even if setup_net() is not called. 
*/  static __net_init void preinit_net(struct net *net)  { -	ref_tracker_dir_init(&net->notrefcnt_tracker, 128); +	ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt");  }  /* @@ -322,7 +322,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)  	LIST_HEAD(net_exit_list);  	refcount_set(&net->ns.count, 1); -	ref_tracker_dir_init(&net->refcnt_tracker, 128); +	ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");  	refcount_set(&net->passive, 1);  	get_random_bytes(&net->hash_mix, sizeof(u32)); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index de17ca2f7dbf..ea9231378aa6 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -8,7 +8,7 @@  #include "netdev-genl-gen.h" -#include <linux/netdev.h> +#include <uapi/linux/netdev.h>  /* NETDEV_CMD_DEV_GET - do */  static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 74d74fc23167..7b370c073e7d 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -9,7 +9,7 @@  #include <net/netlink.h>  #include <net/genetlink.h> -#include <linux/netdev.h> +#include <uapi/linux/netdev.h>  int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info);  int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e6a739b1afa9..543007f159f9 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -690,7 +690,7 @@ int netpoll_setup(struct netpoll *np)  		err = -ENODEV;  		goto unlock;  	} -	dev_hold(ndev); +	netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL);  	if (netdev_master_upper_dev_get(ndev)) {  		np_err(np, "%s is a slave device, aborting\n", np->dev_name); @@ -783,12 +783,11 @@ put_noaddr:  	err = __netpoll_setup(np, ndev);  	if (err)  		goto put; -	netdev_tracker_alloc(ndev, &np->dev_tracker, GFP_KERNEL);  	rtnl_unlock();  	return 0;  put: -	dev_put(ndev); +	netdev_put(ndev, &np->dev_tracker);  unlock:  	rtnl_unlock();  	return err; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 760238196db1..f56b8d697014 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2785,14 +2785,17 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,  					break;  			}  			get_page(pkt_dev->page); -			skb_frag_set_page(skb, i, pkt_dev->page); -			skb_frag_off_set(&skb_shinfo(skb)->frags[i], 0); +  			/*last fragment, fill rest of data*/  			if (i == (frags - 1)) -				skb_frag_size_set(&skb_shinfo(skb)->frags[i], -				    (datalen < PAGE_SIZE ? datalen : PAGE_SIZE)); +				skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], +							pkt_dev->page, 0, +							(datalen < PAGE_SIZE ? 
+							 datalen : PAGE_SIZE));  			else -				skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len); +				skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], +							pkt_dev->page, 0, frag_len); +  			datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]);  			skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]);  			skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 41de3a2f29e1..3ad4e030846d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -961,24 +961,27 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,  			 nla_total_size(sizeof(struct ifla_vf_rate)) +  			 nla_total_size(sizeof(struct ifla_vf_link_state)) +  			 nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + -			 nla_total_size(0) + /* nest IFLA_VF_STATS */ -			 /* IFLA_VF_STATS_RX_PACKETS */ -			 nla_total_size_64bit(sizeof(__u64)) + -			 /* IFLA_VF_STATS_TX_PACKETS */ -			 nla_total_size_64bit(sizeof(__u64)) + -			 /* IFLA_VF_STATS_RX_BYTES */ -			 nla_total_size_64bit(sizeof(__u64)) + -			 /* IFLA_VF_STATS_TX_BYTES */ -			 nla_total_size_64bit(sizeof(__u64)) + -			 /* IFLA_VF_STATS_BROADCAST */ -			 nla_total_size_64bit(sizeof(__u64)) + -			 /* IFLA_VF_STATS_MULTICAST */ -			 nla_total_size_64bit(sizeof(__u64)) + -			 /* IFLA_VF_STATS_RX_DROPPED */ -			 nla_total_size_64bit(sizeof(__u64)) + -			 /* IFLA_VF_STATS_TX_DROPPED */ -			 nla_total_size_64bit(sizeof(__u64)) +  			 nla_total_size(sizeof(struct ifla_vf_trust))); +		if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { +			size += num_vfs * +				(nla_total_size(0) + /* nest IFLA_VF_STATS */ +				 /* IFLA_VF_STATS_RX_PACKETS */ +				 nla_total_size_64bit(sizeof(__u64)) + +				 /* IFLA_VF_STATS_TX_PACKETS */ +				 nla_total_size_64bit(sizeof(__u64)) + +				 /* IFLA_VF_STATS_RX_BYTES */ +				 nla_total_size_64bit(sizeof(__u64)) + +				 /* IFLA_VF_STATS_TX_BYTES */ +				 nla_total_size_64bit(sizeof(__u64)) + +				 /* IFLA_VF_STATS_BROADCAST */ +				 nla_total_size_64bit(sizeof(__u64)) + +				 /* IFLA_VF_STATS_MULTICAST */ +				 nla_total_size_64bit(sizeof(__u64)) + +				 /* IFLA_VF_STATS_RX_DROPPED */ +				 nla_total_size_64bit(sizeof(__u64)) + +				 /* IFLA_VF_STATS_TX_DROPPED */ +				 nla_total_size_64bit(sizeof(__u64))); +		}  		return size;  	} else  		return 0; @@ -1270,7 +1273,8 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,  static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,  					       struct net_device *dev,  					       int vfs_num, -					       struct nlattr *vfinfo) +					       struct nlattr *vfinfo, +					       u32 ext_filter_mask)  {  	struct ifla_vf_rss_query_en vf_rss_query_en;  	struct nlattr *vf, *vfstats, *vfvlanlist; @@ -1376,33 +1380,35 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,  		goto nla_put_vf_failure;  	}  	nla_nest_end(skb, vfvlanlist); -	memset(&vf_stats, 0, sizeof(vf_stats)); -	if (dev->netdev_ops->ndo_get_vf_stats) -		dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, -						&vf_stats); -	vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); -	if (!vfstats) -		goto nla_put_vf_failure; -	if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, -			      vf_stats.rx_packets, IFLA_VF_STATS_PAD) || -	    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, -			      vf_stats.tx_packets, IFLA_VF_STATS_PAD) || -	    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, -			      vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || -	    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, -			      
vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || -	    nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, -			      vf_stats.broadcast, IFLA_VF_STATS_PAD) || -	    nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, -			      vf_stats.multicast, IFLA_VF_STATS_PAD) || -	    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, -			      vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || -	    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, -			      vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { -		nla_nest_cancel(skb, vfstats); -		goto nla_put_vf_failure; +	if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { +		memset(&vf_stats, 0, sizeof(vf_stats)); +		if (dev->netdev_ops->ndo_get_vf_stats) +			dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, +							  &vf_stats); +		vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); +		if (!vfstats) +			goto nla_put_vf_failure; +		if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, +				      vf_stats.rx_packets, IFLA_VF_STATS_PAD) || +		    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, +				      vf_stats.tx_packets, IFLA_VF_STATS_PAD) || +		    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, +				      vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || +		    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, +				      vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || +		    nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, +				      vf_stats.broadcast, IFLA_VF_STATS_PAD) || +		    nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, +				      vf_stats.multicast, IFLA_VF_STATS_PAD) || +		    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, +				      vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || +		    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, +				      vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { +			nla_nest_cancel(skb, vfstats); +			goto nla_put_vf_failure; +		} +		nla_nest_end(skb, vfstats);  	} -	nla_nest_end(skb, vfstats);  	nla_nest_end(skb, vf);  	return 0; @@ -1435,7 +1441,7 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb,  		return -EMSGSIZE;  	for (i = 0; i < num_vfs; i++) { -		if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) +		if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask))  			return -EMSGSIZE;  	} @@ -2377,45 +2383,43 @@ static	int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,  static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],  			    struct netlink_ext_ack *extack)  { -	if (dev) { -		if (tb[IFLA_ADDRESS] && -		    nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) -			return -EINVAL; +	if (tb[IFLA_ADDRESS] && +	    nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) +		return -EINVAL; -		if (tb[IFLA_BROADCAST] && -		    nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) -			return -EINVAL; +	if (tb[IFLA_BROADCAST] && +	    nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) +		return -EINVAL; -		if (tb[IFLA_GSO_MAX_SIZE] && -		    nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { -			NL_SET_ERR_MSG(extack, "too big gso_max_size"); -			return -EINVAL; -		} +	if (tb[IFLA_GSO_MAX_SIZE] && +	    nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { +		NL_SET_ERR_MSG(extack, "too big gso_max_size"); +		return -EINVAL; +	} -		if (tb[IFLA_GSO_MAX_SEGS] && -		    (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || -		     nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { -			NL_SET_ERR_MSG(extack, "too big gso_max_segs"); -			return -EINVAL; -		} +	if (tb[IFLA_GSO_MAX_SEGS] && +	    (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || +	     nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { +		NL_SET_ERR_MSG(extack, "too big 
gso_max_segs"); +		return -EINVAL; +	} -		if (tb[IFLA_GRO_MAX_SIZE] && -		    nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { -			NL_SET_ERR_MSG(extack, "too big gro_max_size"); -			return -EINVAL; -		} +	if (tb[IFLA_GRO_MAX_SIZE] && +	    nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { +		NL_SET_ERR_MSG(extack, "too big gro_max_size"); +		return -EINVAL; +	} -		if (tb[IFLA_GSO_IPV4_MAX_SIZE] && -		    nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { -			NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); -			return -EINVAL; -		} +	if (tb[IFLA_GSO_IPV4_MAX_SIZE] && +	    nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { +		NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); +		return -EINVAL; +	} -		if (tb[IFLA_GRO_IPV4_MAX_SIZE] && -		    nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { -			NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); -			return -EINVAL; -		} +	if (tb[IFLA_GRO_IPV4_MAX_SIZE] && +	    nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { +		NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); +		return -EINVAL;  	}  	if (tb[IFLA_AF_SPEC]) { @@ -2736,10 +2740,6 @@ static int do_setlink(const struct sk_buff *skb,  	char ifname[IFNAMSIZ];  	int err; -	err = validate_linkmsg(dev, tb, extack); -	if (err < 0) -		return err; -  	if (tb[IFLA_IFNAME])  		nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);  	else @@ -3156,6 +3156,10 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,  		goto errout;  	} +	err = validate_linkmsg(dev, tb, extack); +	if (err < 0) +		goto errout; +  	err = do_setlink(skb, dev, ifm, extack, tb, 0);  errout:  	return err; @@ -3399,6 +3403,9 @@ static int rtnl_group_changelink(const struct sk_buff *skb,  	for_each_netdev_safe(net, dev, aux) {  		if (dev->group == group) { +			err = validate_linkmsg(dev, tb, extack); +			if (err < 0) +				return err;  			err = do_setlink(skb, dev, ifm, extack, tb, 0);  			if (err < 0)  				return err; @@ -3556,10 +3563,6 @@ replay:  			m_ops = master_dev->rtnl_link_ops;  	} -	err = validate_linkmsg(dev, tb, extack); -	if (err < 0) -		return err; -  	if (tb[IFLA_LINKINFO]) {  		err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX,  						  tb[IFLA_LINKINFO], @@ -3623,6 +3626,10 @@ replay:  		if (nlh->nlmsg_flags & NLM_F_REPLACE)  			return -EOPNOTSUPP; +		err = validate_linkmsg(dev, tb, extack); +		if (err < 0) +			return err; +  		if (linkinfo[IFLA_INFO_DATA]) {  			if (!ops || ops != dev->rtnl_link_ops ||  			    !ops->changelink) @@ -4090,7 +4097,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb,  	ndm->ndm_ifindex = dev->ifindex;  	ndm->ndm_state   = ndm_state; -	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) +	if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr))  		goto nla_put_failure;  	if (vid)  		if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) @@ -4104,10 +4111,10 @@ nla_put_failure:  	return -EMSGSIZE;  } -static inline size_t rtnl_fdb_nlmsg_size(void) +static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev)  {  	return NLMSG_ALIGN(sizeof(struct ndmsg)) + -	       nla_total_size(ETH_ALEN) +	/* NDA_LLADDR */ +	       nla_total_size(dev->addr_len) +	/* NDA_LLADDR */  	       nla_total_size(sizeof(u16)) +	/* NDA_VLAN */  	       0;  } @@ -4119,7 +4126,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type,  	struct sk_buff *skb;  	int err = -ENOBUFS; -	skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC); +	skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC);  	if (!skb)  		goto errout; 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cea28d30abb5..6c5915efbc17 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -67,6 +67,7 @@  #include <net/dst.h>  #include <net/sock.h>  #include <net/checksum.h> +#include <net/gso.h>  #include <net/ip6_checksum.h>  #include <net/xfrm.h>  #include <net/mpls.h> @@ -92,15 +93,7 @@ static struct kmem_cache *skbuff_fclone_cache __ro_after_init;  static struct kmem_cache *skbuff_ext_cache __ro_after_init;  #endif -/* skb_small_head_cache and related code is only supported - * for CONFIG_SLAB and CONFIG_SLUB. - * As soon as SLOB is removed from the kernel, we can clean up this. - */ -#if !defined(CONFIG_SLOB) -# define HAVE_SKB_SMALL_HEAD_CACHE 1 -#endif -#ifdef HAVE_SKB_SMALL_HEAD_CACHE  static struct kmem_cache *skb_small_head_cache __ro_after_init;  #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) @@ -117,7 +110,6 @@ static struct kmem_cache *skb_small_head_cache __ro_after_init;  #define SKB_SMALL_HEAD_HEADROOM						\  	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) -#endif /* HAVE_SKB_SMALL_HEAD_CACHE */  int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;  EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -562,7 +554,6 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,  	void *obj;  	obj_size = SKB_HEAD_ALIGN(*size); -#ifdef HAVE_SKB_SMALL_HEAD_CACHE  	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&  	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {  		obj = kmem_cache_alloc_node(skb_small_head_cache, @@ -576,7 +567,6 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,  		obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node);  		goto out;  	} -#endif  	*size = obj_size = kmalloc_size_roundup(obj_size);  	/*  	 * Try a regular allocation, when that fails and we're not entitled @@ -898,11 +888,9 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)  static void skb_kfree_head(void *head, unsigned int end_offset)  { -#ifdef HAVE_SKB_SMALL_HEAD_CACHE  	if (end_offset == SKB_SMALL_HEAD_HEADROOM)  		kmem_cache_free(skb_small_head_cache, head);  	else -#endif  		kfree(head);  } @@ -2160,7 +2148,6 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)  	if (likely(skb_end_offset(skb) == saved_end_offset))  		return 0; -#ifdef HAVE_SKB_SMALL_HEAD_CACHE  	/* We can not change skb->end if the original or new value  	 * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head().  	 
*/ @@ -2174,7 +2161,6 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)  		WARN_ON_ONCE(1);  		return 0;  	} -#endif  	shinfo = skb_shinfo(skb); @@ -3003,32 +2989,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,  }  EXPORT_SYMBOL_GPL(skb_splice_bits); -static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, -			    struct kvec *vec, size_t num, size_t size) +static int sendmsg_locked(struct sock *sk, struct msghdr *msg)  {  	struct socket *sock = sk->sk_socket; +	size_t size = msg_data_left(msg);  	if (!sock)  		return -EINVAL; -	return kernel_sendmsg(sock, msg, vec, num, size); + +	if (!sock->ops->sendmsg_locked) +		return sock_no_sendmsg_locked(sk, msg, size); + +	return sock->ops->sendmsg_locked(sk, msg, size);  } -static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, -			     size_t size, int flags) +static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)  {  	struct socket *sock = sk->sk_socket;  	if (!sock)  		return -EINVAL; -	return kernel_sendpage(sock, page, offset, size, flags); +	return sock_sendmsg(sock, msg);  } -typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, -			    struct kvec *vec, size_t num, size_t size); -typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, -			     size_t size, int flags); +typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);  static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, -			   int len, sendmsg_func sendmsg, sendpage_func sendpage) +			   int len, sendmsg_func sendmsg)  {  	unsigned int orig_len = len;  	struct sk_buff *head = skb; @@ -3048,8 +3034,9 @@ do_frag_list:  		memset(&msg, 0, sizeof(msg));  		msg.msg_flags = MSG_DONTWAIT; -		ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, -				      sendmsg_unlocked, sk, &msg, &kv, 1, slen); +		iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); +		ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, +				      sendmsg_unlocked, sk, &msg);  		if (ret <= 0)  			goto error; @@ -3080,11 +3067,18 @@ do_frag_list:  		slen = min_t(size_t, len, skb_frag_size(frag) - offset);  		while (slen) { -			ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, -					      sendpage_unlocked, sk, -					      skb_frag_page(frag), -					      skb_frag_off(frag) + offset, -					      slen, MSG_DONTWAIT); +			struct bio_vec bvec; +			struct msghdr msg = { +				.msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT, +			}; + +			bvec_set_page(&bvec, skb_frag_page(frag), slen, +				      skb_frag_off(frag) + offset); +			iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, +				      slen); + +			ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, +					      sendmsg_unlocked, sk, &msg);  			if (ret <= 0)  				goto error; @@ -3121,16 +3115,14 @@ error:  int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,  			 int len)  { -	return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, -			       kernel_sendpage_locked); +	return __skb_send_sock(sk, skb, offset, len, sendmsg_locked);  }  EXPORT_SYMBOL_GPL(skb_send_sock_locked);  /* Send skb data on a socket. Socket must be unlocked. 
*/  int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)  { -	return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, -			       sendpage_unlocked); +	return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked);  }  /** @@ -4203,13 +4195,13 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,  EXPORT_SYMBOL(skb_find_text);  int skb_append_pagefrags(struct sk_buff *skb, struct page *page, -			 int offset, size_t size) +			 int offset, size_t size, size_t max_frags)  {  	int i = skb_shinfo(skb)->nr_frags;  	if (skb_can_coalesce(skb, i, page, offset)) {  		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); -	} else if (i < MAX_SKB_FRAGS) { +	} else if (i < max_frags) {  		skb_zcopy_downgrade_managed(skb);  		get_page(page);  		skb_fill_page_desc_noacc(skb, i, page, offset, size); @@ -4249,10 +4241,9 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)  	struct page *page;  	page = virt_to_head_page(frag_skb->head); -	__skb_frag_set_page(&head_frag, page); -	skb_frag_off_set(&head_frag, frag_skb->data - -			 (unsigned char *)page_address(page)); -	skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); +	skb_frag_fill_page_desc(&head_frag, page, frag_skb->data - +				(unsigned char *)page_address(page), +				skb_headlen(frag_skb));  	return head_frag;  } @@ -4768,7 +4759,6 @@ void __init skb_init(void)  						0,  						SLAB_HWCACHE_ALIGN|SLAB_PANIC,  						NULL); -#ifdef HAVE_SKB_SMALL_HEAD_CACHE  	/* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes.  	 * struct skb_shared_info is located at the end of skb->head,  	 * and should not be copied to/from user. @@ -4780,7 +4770,6 @@ void __init skb_init(void)  						0,  						SKB_SMALL_HEAD_HEADROOM,  						NULL); -#endif  	skb_extensions_init();  } @@ -5784,147 +5773,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)  }  EXPORT_SYMBOL_GPL(skb_scrub_packet); -/** - * skb_gso_transport_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_transport_seglen is used to determine the real size of the - * individual segments, including Layer4 headers (TCP/UDP). - * - * The MAC/L2 or network (IP, IPv6) headers are not accounted for. - */ -static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) -{ -	const struct skb_shared_info *shinfo = skb_shinfo(skb); -	unsigned int thlen = 0; - -	if (skb->encapsulation) { -		thlen = skb_inner_transport_header(skb) - -			skb_transport_header(skb); - -		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) -			thlen += inner_tcp_hdrlen(skb); -	} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { -		thlen = tcp_hdrlen(skb); -	} else if (unlikely(skb_is_gso_sctp(skb))) { -		thlen = sizeof(struct sctphdr); -	} else if (shinfo->gso_type & SKB_GSO_UDP_L4) { -		thlen = sizeof(struct udphdr); -	} -	/* UFO sets gso_size to the size of the fragmentation -	 * payload, i.e. the size of the L4 (UDP) header is already -	 * accounted for. -	 */ -	return thlen + shinfo->gso_size; -} - -/** - * skb_gso_network_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_network_seglen is used to determine the real size of the - * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). - * - * The MAC/L2 header is not accounted for. 
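skb_append_pagefrags() now takes the frag limit as an explicit max_frags argument instead of hard-coding MAX_SKB_FRAGS, so callers such as the MSG_SPLICE_PAGES path can honour the sysctl limit. A minimal sketch of a caller under that assumption (wrapper name hypothetical):

static int append_page_sketch(struct sk_buff *skb, struct page *page,
			      int offset, size_t size)
{
	/* MSG_SPLICE_PAGES users pass the sysctl limit; callers that
	 * want the historical behaviour pass MAX_SKB_FRAGS explicitly.
	 */
	size_t limit = READ_ONCE(sysctl_max_skb_frags);

	return skb_append_pagefrags(skb, page, offset, size, limit);
}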
- */ -static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) -{ -	unsigned int hdr_len = skb_transport_header(skb) - -			       skb_network_header(skb); - -	return hdr_len + skb_gso_transport_seglen(skb); -} - -/** - * skb_gso_mac_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_mac_seglen is used to determine the real size of the - * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 - * headers (TCP/UDP). - */ -static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) -{ -	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); - -	return hdr_len + skb_gso_transport_seglen(skb); -} - -/** - * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS - * - * There are a couple of instances where we have a GSO skb, and we - * want to determine what size it would be after it is segmented. - * - * We might want to check: - * -    L3+L4+payload size (e.g. IP forwarding) - * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) - * - * This is a helper to do that correctly considering GSO_BY_FRAGS. - * - * @skb: GSO skb - * - * @seg_len: The segmented length (from skb_gso_*_seglen). In the - *           GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. - * - * @max_len: The maximum permissible length. - * - * Returns true if the segmented length <= max length. - */ -static inline bool skb_gso_size_check(const struct sk_buff *skb, -				      unsigned int seg_len, -				      unsigned int max_len) { -	const struct skb_shared_info *shinfo = skb_shinfo(skb); -	const struct sk_buff *iter; - -	if (shinfo->gso_size != GSO_BY_FRAGS) -		return seg_len <= max_len; - -	/* Undo this so we can re-use header sizes */ -	seg_len -= GSO_BY_FRAGS; - -	skb_walk_frags(skb, iter) { -		if (seg_len + skb_headlen(iter) > max_len) -			return false; -	} - -	return true; -} - -/** - * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? - * - * @skb: GSO skb - * @mtu: MTU to validate against - * - * skb_gso_validate_network_len validates if a given skb will fit a - * wanted MTU once split. It considers L3 headers, L4 headers, and the - * payload. - */ -bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) -{ -	return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); -} -EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); - -/** - * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? - * - * @skb: GSO skb - * @len: length to validate against - * - * skb_gso_validate_mac_len validates if a given skb will fit a wanted - * length once split, including L2, L3 and L4 headers and the payload. 
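The GSO seglen helpers and skb_gso_size_check() removed in these hunks are being moved to the new net/core/gso.c rather than dropped, and their exported interface is unchanged. A minimal sketch of a typical caller, which keeps working across the move (function name hypothetical):

static bool gso_fits_mtu_sketch(const struct sk_buff *skb, unsigned int mtu)
{
	if (!skb_is_gso(skb))
		return skb->len <= mtu;

	/* Per-segment L3 + L4 + payload check, MAC header excluded;
	 * GSO_BY_FRAGS skbs are checked frag-list entry by entry.
	 */
	return skb_gso_validate_network_len(skb, mtu);
}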
- */ -bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) -{ -	return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); -} -EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); -  static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)  {  	int mac_len, meta_len; @@ -6912,3 +6760,91 @@ nodefer:	__kfree_skb(skb);  	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))  		smp_call_function_single_async(cpu, &sd->defer_csd);  } + +static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, +				 size_t offset, size_t len) +{ +	const char *kaddr; +	__wsum csum; + +	kaddr = kmap_local_page(page); +	csum = csum_partial(kaddr + offset, len, 0); +	kunmap_local(kaddr); +	skb->csum = csum_block_add(skb->csum, csum, skb->len); +} + +/** + * skb_splice_from_iter - Splice (or copy) pages to skbuff + * @skb: The buffer to add pages to + * @iter: Iterator representing the pages to be added + * @maxsize: Maximum amount of pages to be added + * @gfp: Allocation flags + * + * This is a common helper function for supporting MSG_SPLICE_PAGES.  It + * extracts pages from an iterator and adds them to the socket buffer if + * possible, copying them to fragments if not possible (such as if they're slab + * pages). + * + * Returns the amount of data spliced/copied or -EMSGSIZE if there's + * insufficient space in the buffer to transfer anything. + */ +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, +			     ssize_t maxsize, gfp_t gfp) +{ +	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); +	struct page *pages[8], **ppages = pages; +	ssize_t spliced = 0, ret = 0; +	unsigned int i; + +	while (iter->count > 0) { +		ssize_t space, nr, len; +		size_t off; + +		ret = -EMSGSIZE; +		space = frag_limit - skb_shinfo(skb)->nr_frags; +		if (space < 0) +			break; + +		/* We might be able to coalesce without increasing nr_frags */ +		nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); + +		len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); +		if (len <= 0) { +			ret = len ?: -EIO; +			break; +		} + +		i = 0; +		do { +			struct page *page = pages[i++]; +			size_t part = min_t(size_t, PAGE_SIZE - off, len); + +			ret = -EIO; +			if (WARN_ON_ONCE(!sendpage_ok(page))) +				goto out; + +			ret = skb_append_pagefrags(skb, page, off, part, +						   frag_limit); +			if (ret < 0) { +				iov_iter_revert(iter, len); +				goto out; +			} + +			if (skb->ip_summed == CHECKSUM_NONE) +				skb_splice_csum_page(skb, page, off, part); + +			off = 0; +			spliced += part; +			maxsize -= part; +			len -= part; +		} while (len > 0); + +		if (maxsize <= 0) +			break; +	} + +out: +	skb_len_add(skb, spliced); +	return spliced ?: ret; +} +EXPORT_SYMBOL(skb_splice_from_iter); diff --git a/net/core/sock.c b/net/core/sock.c index 6e5662ca00fe..9370fd50aa2c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -114,6 +114,9 @@  #include <linux/memcontrol.h>  #include <linux/prefetch.h>  #include <linux/compat.h> +#include <linux/mroute.h> +#include <linux/mroute6.h> +#include <linux/icmpv6.h>  #include <linux/uaccess.h> @@ -138,6 +141,7 @@  #include <net/tcp.h>  #include <net/busy_poll.h> +#include <net/phonet/phonet.h>  #include <linux/ethtool.h> @@ -1246,6 +1250,13 @@ set_sndbuf:  			clear_bit(SOCK_PASSCRED, &sock->flags);  		break; +	case SO_PASSPIDFD: +		if (valbool) +			set_bit(SOCK_PASSPIDFD, &sock->flags); +		else +			clear_bit(SOCK_PASSPIDFD, &sock->flags); +		break; +  	case SO_TIMESTAMP_OLD:  	case SO_TIMESTAMP_NEW:  	case SO_TIMESTAMPNS_OLD: @@ 
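skb_splice_from_iter() added above becomes the common MSG_SPLICE_PAGES back end: it extracts page references from the iterator, appends them as frags via skb_append_pagefrags(), and folds the data into skb->csum when the skb uses CHECKSUM_NONE. A minimal sketch of how a protocol sendmsg path might call it (function name and surrounding skb handling are assumptions):

static ssize_t splice_into_skb_sketch(struct sock *sk, struct sk_buff *skb,
				      struct msghdr *msg, size_t limit)
{
	if (!(msg->msg_flags & MSG_SPLICE_PAGES))
		return -EINVAL;

	/* Returns the number of bytes spliced/copied, or -EMSGSIZE if
	 * the skb has no room left for another fragment.
	 */
	return skb_splice_from_iter(skb, &msg->msg_iter, limit,
				    sk->sk_allocation);
}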
-1726,6 +1737,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname,  		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);  		break; +	case SO_PASSPIDFD: +		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); +		break; +  	case SO_PEERCRED:  	{  		struct ucred peercred; @@ -1741,6 +1756,39 @@ int sk_getsockopt(struct sock *sk, int level, int optname,  		goto lenout;  	} +	case SO_PEERPIDFD: +	{ +		struct pid *peer_pid; +		struct file *pidfd_file = NULL; +		int pidfd; + +		if (len > sizeof(pidfd)) +			len = sizeof(pidfd); + +		spin_lock(&sk->sk_peer_lock); +		peer_pid = get_pid(sk->sk_peer_pid); +		spin_unlock(&sk->sk_peer_lock); + +		if (!peer_pid) +			return -ESRCH; + +		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); +		put_pid(peer_pid); +		if (pidfd < 0) +			return pidfd; + +		if (copy_to_sockptr(optval, &pidfd, len) || +		    copy_to_sockptr(optlen, &len, sizeof(int))) { +			put_unused_fd(pidfd); +			fput(pidfd_file); + +			return -EFAULT; +		} + +		fd_install(pidfd, pidfd_file); +		return 0; +	} +  	case SO_PEERGROUPS:  	{  		const struct cred *cred; @@ -2550,13 +2598,24 @@ kuid_t sock_i_uid(struct sock *sk)  }  EXPORT_SYMBOL(sock_i_uid); -unsigned long sock_i_ino(struct sock *sk) +unsigned long __sock_i_ino(struct sock *sk)  {  	unsigned long ino; -	read_lock_bh(&sk->sk_callback_lock); +	read_lock(&sk->sk_callback_lock);  	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; -	read_unlock_bh(&sk->sk_callback_lock); +	read_unlock(&sk->sk_callback_lock); +	return ino; +} +EXPORT_SYMBOL(__sock_i_ino); + +unsigned long sock_i_ino(struct sock *sk) +{ +	unsigned long ino; + +	local_bh_disable(); +	ino = __sock_i_ino(sk); +	local_bh_enable();  	return ino;  }  EXPORT_SYMBOL(sock_i_ino); @@ -3213,36 +3272,6 @@ void __receive_sock(struct file *file)  	}  } -ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) -{ -	ssize_t res; -	struct msghdr msg = {.msg_flags = flags}; -	struct kvec iov; -	char *kaddr = kmap(page); -	iov.iov_base = kaddr + offset; -	iov.iov_len = size; -	res = kernel_sendmsg(sock, &msg, &iov, 1, size); -	kunmap(page); -	return res; -} -EXPORT_SYMBOL(sock_no_sendpage); - -ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, -				int offset, size_t size, int flags) -{ -	ssize_t res; -	struct msghdr msg = {.msg_flags = flags}; -	struct kvec iov; -	char *kaddr = kmap(page); - -	iov.iov_base = kaddr + offset; -	iov.iov_len = size; -	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); -	kunmap(page); -	return res; -} -EXPORT_SYMBOL(sock_no_sendpage_locked); -  /*   *	Default Socket Callbacks   */ @@ -3998,7 +4027,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)  {  	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s " -			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", +			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",  		   proto->name,  		   proto->obj_size,  		   sock_prot_inuse_get(seq_file_net(seq), proto), @@ -4019,7 +4048,6 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)  		   proto_method_implemented(proto->getsockopt),  		   proto_method_implemented(proto->sendmsg),  		   proto_method_implemented(proto->recvmsg), -		   proto_method_implemented(proto->sendpage),  		   proto_method_implemented(proto->bind),  		   proto_method_implemented(proto->backlog_rcv),  		   proto_method_implemented(proto->hash), @@ -4040,7 +4068,7 @@ static int proto_seq_show(struct seq_file 
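SO_PEERPIDFD added above hands userspace a pidfd for the peer of a connected socket; note that fd_install() runs only after the copy-out has succeeded, so no descriptor is leaked on -EFAULT. A minimal userspace sketch, assuming libc headers that already define SO_PEERPIDFD:

#include <sys/socket.h>

static int get_peer_pidfd_sketch(int sock_fd)
{
	int pidfd = -1;
	socklen_t len = sizeof(pidfd);

	/* Fails with ESRCH when no peer pid is recorded for the socket. */
	if (getsockopt(sock_fd, SOL_SOCKET, SO_PEERPIDFD, &pidfd, &len) < 0)
		return -1;

	return pidfd;	/* caller owns the fd and must close() it */
}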
*seq, void *v)  			   "maxhdr",  			   "slab",  			   "module", -			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); +			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");  	else  		proto_seq_printf(seq, list_entry(v, struct proto, node));  	return 0; @@ -4100,3 +4128,63 @@ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)  	return sk->sk_prot->bind_add(sk, addr, addr_len);  }  EXPORT_SYMBOL(sock_bind_add); + +/* Copy 'size' bytes from userspace and return `size` back to userspace */ +int sock_ioctl_inout(struct sock *sk, unsigned int cmd, +		     void __user *arg, void *karg, size_t size) +{ +	int ret; + +	if (copy_from_user(karg, arg, size)) +		return -EFAULT; + +	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); +	if (ret) +		return ret; + +	if (copy_to_user(arg, karg, size)) +		return -EFAULT; + +	return 0; +} +EXPORT_SYMBOL(sock_ioctl_inout); + +/* This is the most common ioctl prep function, where the result (4 bytes) is + * copied back to userspace if the ioctl() returns successfully. No input is + * copied from userspace as input argument. + */ +static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) +{ +	int ret, karg = 0; + +	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); +	if (ret) +		return ret; + +	return put_user(karg, (int __user *)arg); +} + +/* A wrapper around sock ioctls, which copies the data from userspace + * (depending on the protocol/ioctl), and copies back the result to userspace. + * The main motivation for this function is to pass kernel memory to the + * protocol ioctl callbacks, instead of userspace memory. + */ +int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ +	int rc = 1; + +	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) +		rc = ipmr_sk_ioctl(sk, cmd, arg); +	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) +		rc = ip6mr_sk_ioctl(sk, cmd, arg); +	else if (sk_is_phonet(sk)) +		rc = phonet_sk_ioctl(sk, cmd, arg); + +	/* If ioctl was processed, returns its value */ +	if (rc <= 0) +		return rc; + +	/* Otherwise call the default handler */ +	return sock_ioctl_out(sk, cmd, arg); +} +EXPORT_SYMBOL(sk_ioctl); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 00afb66cd095..19538d628714 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -32,8 +32,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)  {  	struct bpf_stab *stab; -	if (!capable(CAP_NET_ADMIN)) -		return ERR_PTR(-EPERM);  	if (attr->max_entries == 0 ||  	    attr->key_size    != 4 ||  	    (attr->value_size != sizeof(u32) && @@ -1085,8 +1083,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)  	struct bpf_shtab *htab;  	int i, err; -	if (!capable(CAP_NET_ADMIN)) -		return ERR_PTR(-EPERM);  	if (attr->max_entries == 0 ||  	    attr->key_size    == 0 ||  	    (attr->value_size != sizeof(u32) &&  |
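With sk_ioctl() and sock_ioctl_inout()/sock_ioctl_out() in place, protocol ->ioctl() handlers operate on kernel memory and never touch the user pointer themselves; the wrappers do the copy-in/copy-out. A minimal sketch of a handler dispatched through sock_ioctl_out() (handler name and SIOCOUTQ semantics are assumptions for the example; the prototype is paraphrased from the wrapper above):

static int proto_ioctl_sketch(struct sock *sk, int cmd, int *karg)
{
	switch (cmd) {
	case SIOCOUTQ:
		/* sock_ioctl_out() copies *karg back to userspace. */
		*karg = sk_wmem_alloc_get(sk);
		return 0;
	default:
		return -ENOIOCTLCMD;
	}
}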