Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r--	net/core/skbuff.c	193
1 file changed, 103 insertions, 90 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b99127712e67..466999a7515e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -51,6 +51,7 @@
 #endif
 #include <linux/string.h>
 #include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
 #include <linux/splice.h>
 #include <linux/cache.h>
 #include <linux/rtnetlink.h>
@@ -108,9 +109,6 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init;
 #define SKB_SMALL_HEAD_HEADROOM						\
 	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
 
-int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
-EXPORT_SYMBOL(sysctl_max_skb_frags);
-
 /* kcm_write_msgs() relies on casting paged frags to bio_vec to use
  * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the
  * netmem is a page.
@@ -775,10 +773,9 @@ skb_fail:
 EXPORT_SYMBOL(__netdev_alloc_skb);
 
 /**
- *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ *	napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
  *	@napi: napi instance this buffer was allocated for
  *	@len: length to allocate
- *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
  *
  *	Allocate a new sk_buff for use in NAPI receive.  This buffer will
  *	attempt to allocate the head from a special reserved region used
@@ -787,9 +784,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
  *
  *	%NULL is returned if there is no free memory.
  */
-struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
-				 gfp_t gfp_mask)
+struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 {
+	gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
 	struct napi_alloc_cache *nc;
 	struct sk_buff *skb;
 	bool pfmemalloc;
@@ -860,7 +857,7 @@ skb_success:
 skb_fail:
 	return skb;
 }
-EXPORT_SYMBOL(__napi_alloc_skb);
+EXPORT_SYMBOL(napi_alloc_skb);
 
 void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
 			    int off, int size, unsigned int truesize)
@@ -1005,11 +1002,8 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 EXPORT_SYMBOL(skb_cow_data_for_xdp);
 
 #if IS_ENABLED(CONFIG_PAGE_POOL)
-bool napi_pp_put_page(struct page *page, bool napi_safe)
+bool napi_pp_put_page(struct page *page)
 {
-	bool allow_direct = false;
-	struct page_pool *pp;
-
 	page = compound_head(page);
 
 	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
@@ -1022,39 +1016,18 @@
 	if (unlikely(!is_pp_page(page)))
 		return false;
 
-	pp = page->pp;
-
-	/* Allow direct recycle if we have reasons to believe that we are
-	 * in the same context as the consumer would run, so there's
-	 * no possible race.
-	 * __page_pool_put_page() makes sure we're not in hardirq context
-	 * and interrupts are enabled prior to accessing the cache.
-	 */
-	if (napi_safe || in_softirq()) {
-		const struct napi_struct *napi = READ_ONCE(pp->p.napi);
-		unsigned int cpuid = smp_processor_id();
-
-		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
-		allow_direct |= READ_ONCE(pp->cpuid) == cpuid;
-	}
-
-	/* Driver set this to memory recycling info. Reset it on recycle.
-	 * This will *not* work for NIC using a split-page memory model.
-	 * The page will be returned to the pool here regardless of the
-	 * 'flipped' fragment being in use or not.
-	 */
-	page_pool_put_full_page(pp, page, allow_direct);
+	page_pool_put_full_page(page->pp, page, false);
 
 	return true;
 }
 EXPORT_SYMBOL(napi_pp_put_page);
 #endif
 
-static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data)
 {
 	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
 		return false;
-	return napi_pp_put_page(virt_to_page(data), napi_safe);
+	return napi_pp_put_page(virt_to_page(data));
 }
 
 /**
@@ -1096,12 +1069,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
 		kfree(head);
 }
 
-static void skb_free_head(struct sk_buff *skb, bool napi_safe)
+static void skb_free_head(struct sk_buff *skb)
 {
 	unsigned char *head = skb->head;
 
 	if (skb->head_frag) {
-		if (skb_pp_recycle(skb, head, napi_safe))
+		if (skb_pp_recycle(skb, head))
 			return;
 		skb_free_frag(head);
 	} else {
@@ -1109,8 +1082,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe)
 	}
 }
 
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
-			     bool napi_safe)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int i;
@@ -1127,13 +1099,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
 	}
 
 	for (i = 0; i < shinfo->nr_frags; i++)
-		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
+		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
 
 free_head:
 	if (shinfo->frag_list)
 		kfree_skb_list_reason(shinfo->frag_list, reason);
 
-	skb_free_head(skb, napi_safe);
+	skb_free_head(skb);
 exit:
 	/* When we clone an SKB we copy the reycling bit. The pp_recycle
 	 * bit is only set on the head though, so in order to avoid races
@@ -1194,12 +1166,11 @@ void skb_release_head_state(struct sk_buff *skb)
 }
 
 /* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
-			    bool napi_safe)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	skb_release_head_state(skb);
 	if (likely(skb->head))
-		skb_release_data(skb, reason, napi_safe);
+		skb_release_data(skb, reason);
 }
 
 /**
@@ -1213,7 +1184,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
 
 void __kfree_skb(struct sk_buff *skb)
 {
-	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
+	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
 	kfree_skbmem(skb);
 }
 EXPORT_SYMBOL(__kfree_skb);
@@ -1270,7 +1241,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
 		return;
 	}
 
-	skb_release_all(skb, reason, false);
+	skb_release_all(skb, reason);
 	sa->skb_array[sa->skb_count++] = skb;
 
 	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1331,22 +1302,28 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
 	has_trans = skb_transport_header_was_set(skb);
 
 	printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
-	       "mac=(%d,%d) net=(%d,%d) trans=%d\n"
+	       "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
 	       "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
-	       "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
-	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
+	       "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
+	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
+	       "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
+	       "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
 	       level, skb->len, headroom, skb_headlen(skb), tailroom,
 	       has_mac ? skb->mac_header : -1,
 	       has_mac ? skb_mac_header_len(skb) : -1,
+	       skb->mac_len,
 	       skb->network_header,
 	       has_trans ? skb_network_header_len(skb) : -1,
 	       has_trans ? skb->transport_header : -1,
 	       sh->tx_flags, sh->nr_frags,
 	       sh->gso_size, sh->gso_type, sh->gso_segs,
-	       skb->csum, skb->ip_summed, skb->csum_complete_sw,
-	       skb->csum_valid, skb->csum_level,
+	       skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
+	       skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
 	       skb->hash, skb->sw_hash, skb->l4_hash,
-	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
+	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
+	       skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
+	       skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
+	       skb->inner_network_header, skb->inner_transport_header);
 
 	if (dev)
 		printk("%sdev name=%s feat=%pNF\n",
@@ -1444,7 +1421,7 @@ EXPORT_SYMBOL(consume_skb);
 void __consume_stateless_skb(struct sk_buff *skb)
 {
 	trace_consume_skb(skb, __builtin_return_address(0));
-	skb_release_data(skb, SKB_CONSUMED, false);
+	skb_release_data(skb, SKB_CONSUMED);
 	kfree_skbmem(skb);
 }
 
@@ -1471,7 +1448,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
 
 void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
 {
-	skb_release_all(skb, reason, true);
+	skb_release_all(skb, reason);
 	napi_skb_cache_put(skb);
 }
 
@@ -1509,7 +1486,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}
 
-	skb_release_all(skb, SKB_CONSUMED, !!budget);
+	skb_release_all(skb, SKB_CONSUMED);
 	napi_skb_cache_put(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);
@@ -1640,7 +1617,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
  */
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 {
-	skb_release_all(dst, SKB_CONSUMED, false);
+	skb_release_all(dst, SKB_CONSUMED);
 	return __skb_clone(dst, src);
 }
 EXPORT_SYMBOL_GPL(skb_morph);
@@ -1708,7 +1685,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
 		return NULL;
 	}
 
-	uarg->ubuf.callback = msg_zerocopy_callback;
+	uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;
 	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
 	uarg->len = 1;
 	uarg->bytelen = size;
@@ -1734,7 +1711,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
 		u32 bytelen, next;
 
 		/* there might be non MSG_ZEROCOPY users */
-		if (uarg->callback != msg_zerocopy_callback)
+		if (uarg->ops != &msg_zerocopy_ubuf_ops)
 			return NULL;
 
 		/* realloc only when socket is locked (TCP, UDP cork),
@@ -1845,8 +1822,8 @@ release:
 	sock_put(sk);
 }
 
-void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
-			   bool success)
+static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
+				  bool success)
 {
 	struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
 
@@ -1855,7 +1832,6 @@ void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
 	if (refcount_dec_and_test(&uarg->refcnt))
 		__msg_zerocopy_callback(uarg_zc);
 }
-EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
 
 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 {
@@ -1865,10 +1841,15 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 	uarg_to_msgzc(uarg)->len--;
 
 	if (have_uref)
-		msg_zerocopy_callback(NULL, uarg, true);
+		msg_zerocopy_complete(NULL, uarg, true);
 }
 EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
 
+const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
+	.complete = msg_zerocopy_complete,
+};
+EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
+
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			     struct msghdr *msg, int len,
 			     struct ubuf_info *uarg)
@@ -1876,11 +1857,18 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 	struct ubuf_info *orig_uarg = skb_zcopy(skb);
 	int err, orig_len = skb->len;
 
-	/* An skb can only point to one uarg. This edge case happens when
-	 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
-	 */
-	if (orig_uarg && uarg != orig_uarg)
-		return -EEXIST;
+	if (uarg->ops->link_skb) {
+		err = uarg->ops->link_skb(skb, uarg);
+		if (err)
+			return err;
+	} else {
+		/* An skb can only point to one uarg. This edge case happens
+		 * when TCP appends to an skb, but zerocopy_realloc triggered
		 * a new alloc.
+		 */
+		if (orig_uarg && uarg != orig_uarg)
+			return -EEXIST;
+	}
 
 	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
 	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
@@ -1894,7 +1882,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 		return err;
 	}
 
-	skb_zcopy_set(skb, uarg, NULL);
+	if (!uarg->ops->link_skb)
+		skb_zcopy_set(skb, uarg, NULL);
 	return skb->len - orig_len;
 }
 EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
@@ -2123,11 +2112,17 @@ static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
 
 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 {
-	int headerlen = skb_headroom(skb);
-	unsigned int size = skb_end_offset(skb) + skb->data_len;
-	struct sk_buff *n = __alloc_skb(size, gfp_mask,
-					skb_alloc_rx_flag(skb), NUMA_NO_NODE);
+	struct sk_buff *n;
+	unsigned int size;
+	int headerlen;
+
+	if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
+		return NULL;
+	headerlen = skb_headroom(skb);
+	size = skb_end_offset(skb) + skb->data_len;
+	n = __alloc_skb(size, gfp_mask,
+			skb_alloc_rx_flag(skb), NUMA_NO_NODE);
 
 	if (!n)
 		return NULL;
 
@@ -2272,9 +2267,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
 
-		skb_release_data(skb, SKB_CONSUMED, false);
+		skb_release_data(skb, SKB_CONSUMED);
 	} else {
-		skb_free_head(skb, false);
+		skb_free_head(skb);
 	}
 	off = (data + nhead) - skb->head;
 
@@ -2455,12 +2450,17 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 	/*
 	 *	Allocate the copy buffer
 	 */
-	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
-					gfp_mask, skb_alloc_rx_flag(skb),
-					NUMA_NO_NODE);
-	int oldheadroom = skb_headroom(skb);
 	int head_copy_len, head_copy_off;
+	struct sk_buff *n;
+	int oldheadroom;
+	if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
+		return NULL;
+
+	oldheadroom = skb_headroom(skb);
+	n = __alloc_skb(newheadroom + skb->len + newtailroom,
+			gfp_mask, skb_alloc_rx_flag(skb),
+			NUMA_NO_NODE);
 
 	if (!n)
 		return NULL;
 
@@ -6575,12 +6575,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
 			skb_frag_ref(skb, i);
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
-		skb_release_data(skb, SKB_CONSUMED, false);
+		skb_release_data(skb, SKB_CONSUMED);
 	} else {
 		/* we can reuse existing recount- all we did was
 		 * relocate values
 		 */
-		skb_free_head(skb, false);
+		skb_free_head(skb);
 	}
 
 	skb->head = data;
@@ -6715,7 +6715,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
 		skb_kfree_head(data, size);
 		return -ENOMEM;
 	}
-	skb_release_data(skb, SKB_CONSUMED, false);
+	skb_release_data(skb, SKB_CONSUMED);
 
 	skb->head = data;
 	skb->head_frag = 0;
@@ -6995,6 +6995,19 @@ free_now:
 EXPORT_SYMBOL(__skb_ext_put);
 #endif /* CONFIG_SKB_EXTENSIONS */
 
+static void kfree_skb_napi_cache(struct sk_buff *skb)
+{
+	/* if SKB is a clone, don't handle this case */
+	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
+		__kfree_skb(skb);
+		return;
+	}
+
+	local_bh_disable();
+	__napi_kfree_skb(skb, SKB_CONSUMED);
+	local_bh_enable();
+}
+
 /**
  * skb_attempt_defer_free - queue skb for remote freeing
  * @skb: buffer
@@ -7010,10 +7023,10 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	unsigned int defer_max;
 	bool kick;
 
-	if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
-	    !cpu_online(cpu) ||
-	    cpu == raw_smp_processor_id()) {
-nodefer:	__kfree_skb(skb);
+	if (cpu == raw_smp_processor_id() ||
+	    WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
+	    !cpu_online(cpu)) {
+nodefer:	kfree_skb_napi_cache(skb);
 		return;
 	}
 
@@ -7021,7 +7034,7 @@ nodefer:	__kfree_skb(skb);
 	DEBUG_NET_WARN_ON_ONCE(skb->destructor);
 
 	sd = &per_cpu(softnet_data, cpu);
-	defer_max = READ_ONCE(sysctl_skb_defer_max);
+	defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
 	if (READ_ONCE(sd->defer_count) >= defer_max)
 		goto nodefer;
 
@@ -7039,8 +7052,8 @@ nodefer:	__kfree_skb(skb);
 	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
	 * if we are unlucky enough (this seems very unlikely).
	 */
-	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
-		smp_call_function_single_async(cpu, &sd->defer_csd);
+	if (unlikely(kick))
+		kick_defer_list_purge(sd, cpu);
 }
 
 static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
@@ -7073,7 +7086,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
 ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
 			     ssize_t maxsize, gfp_t gfp)
 {
-	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
+	size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
 	struct page *pages[8], **ppages = pages;
 	ssize_t spliced = 0, ret = 0;
 	unsigned int i;
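Illustration of the driver-facing change above: after this diff, rx allocation uses napi_alloc_skb() with no gfp_t argument (GFP_ATOMIC | __GFP_NOWARN is applied inside), and the napi_safe flag is gone from the freeing/recycling helpers. The following is a minimal sketch only, not taken from this commit; my_rx_poll and MY_RX_LEN are made-up placeholder names.

/* Illustrative sketch, assuming the post-change API; not part of this commit. */
static int my_rx_poll(struct napi_struct *napi, int budget)
{
	int done = 0;

	while (done < budget) {
		struct sk_buff *skb;

		/* before: __napi_alloc_skb(napi, MY_RX_LEN, GFP_ATOMIC | __GFP_NOWARN) */
		skb = napi_alloc_skb(napi, MY_RX_LEN);
		if (unlikely(!skb))
			break;

		/* ... copy the received frame into skb and set skb->protocol ... */
		napi_gro_receive(napi, skb);
		done++;
	}

	return done;
}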