Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r--	net/core/skbuff.c	168
1 file changed, 115 insertions, 53 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1a31815104d6..26a586007d8b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -58,6 +58,7 @@
 #include <linux/scatterlist.h>
 #include <linux/errqueue.h>
 #include <linux/prefetch.h>
+#include <linux/bitfield.h>
 #include <linux/if_vlan.h>
 #include <linux/mpls.h>
 #include <linux/kcov.h>
@@ -72,6 +73,7 @@
 #include <net/mptcp.h>
 #include <net/mctp.h>
 #include <net/page_pool.h>
+#include <net/dropreason.h>
 
 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -122,11 +124,59 @@ EXPORT_SYMBOL(sysctl_max_skb_frags);
 
 #undef FN
 #define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
-const char * const drop_reasons[] = {
+static const char * const drop_reasons[] = {
 	[SKB_CONSUMED] = "CONSUMED",
 	DEFINE_DROP_REASON(FN, FN)
 };
-EXPORT_SYMBOL(drop_reasons);
+
+static const struct drop_reason_list drop_reasons_core = {
+	.reasons = drop_reasons,
+	.n_reasons = ARRAY_SIZE(drop_reasons),
+};
+
+const struct drop_reason_list __rcu *
+drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
+	[SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
+};
+EXPORT_SYMBOL(drop_reasons_by_subsys);
+
+/**
+ * drop_reasons_register_subsys - register another drop reason subsystem
+ * @subsys: the subsystem to register, must not be the core
+ * @list: the list of drop reasons within the subsystem, must point to
+ *	a statically initialized list
+ */
+void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
+				  const struct drop_reason_list *list)
+{
+	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
+		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
+		 "invalid subsystem %d\n", subsys))
+		return;
+
+	/* must point to statically allocated memory, so INIT is OK */
+	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
+}
+EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);
+
+/**
+ * drop_reasons_unregister_subsys - unregister a drop reason subsystem
+ * @subsys: the subsystem to remove, must not be the core
+ *
+ * Note: This will synchronize_rcu() to ensure no users when it returns.
+ */
+void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
+{
+	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
+		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
+		 "invalid subsystem %d\n", subsys))
+		return;
+
+	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
 
 /**
  *	skb_panic - private function for out-of-line support
@@ -420,10 +470,9 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 {
 	struct sk_buff *skb = __build_skb(data, frag_size);
 
-	if (skb && frag_size) {
+	if (likely(skb && frag_size)) {
 		skb->head_frag = 1;
-		if (page_is_pfmemalloc(virt_to_head_page(data)))
-			skb->pfmemalloc = 1;
+		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
 	}
 	return skb;
 }
@@ -445,8 +494,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
 
 	if (frag_size) {
 		skb->head_frag = 1;
-		if (page_is_pfmemalloc(virt_to_head_page(data)))
-			skb->pfmemalloc = 1;
+		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
 	}
 	return skb;
 }
@@ -841,11 +889,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 		skb_get(list);
 }
 
-static bool skb_pp_recycle(struct sk_buff *skb, void *data)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
 {
 	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
 		return false;
-	return page_pool_return_skb_page(virt_to_page(data));
+	return page_pool_return_skb_page(virt_to_page(data), napi_safe);
 }
 
 static void skb_kfree_head(void *head, unsigned int end_offset)
@@ -858,12 +906,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
 		kfree(head);
 }
 
-static void skb_free_head(struct sk_buff *skb)
+static void skb_free_head(struct sk_buff *skb, bool napi_safe)
 {
 	unsigned char *head = skb->head;
 
 	if (skb->head_frag) {
-		if (skb_pp_recycle(skb, head))
+		if (skb_pp_recycle(skb, head, napi_safe))
 			return;
 		skb_free_frag(head);
 	} else {
@@ -871,7 +919,8 @@ static void skb_free_head(struct sk_buff *skb)
 	}
 }
 
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
+			     bool napi_safe)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int i;
@@ -890,13 +939,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
 	}
 
 	for (i = 0; i < shinfo->nr_frags; i++)
-		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
 
 free_head:
 	if (shinfo->frag_list)
 		kfree_skb_list_reason(shinfo->frag_list, reason);
 
-	skb_free_head(skb);
+	skb_free_head(skb, napi_safe);
 exit:
 	/* When we clone an SKB we copy the reycling bit. The pp_recycle
 	 * bit is only set on the head though, so in order to avoid races
@@ -957,11 +1006,12 @@ void skb_release_head_state(struct sk_buff *skb)
 }
 
 /* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
+			    bool napi_safe)
 {
 	skb_release_head_state(skb);
 	if (likely(skb->head))
-		skb_release_data(skb, reason);
+		skb_release_data(skb, reason, napi_safe);
 }
 
 /**
@@ -975,7 +1025,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
 
 void __kfree_skb(struct sk_buff *skb)
 {
-	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
 	kfree_skbmem(skb);
 }
 EXPORT_SYMBOL(__kfree_skb);
@@ -986,7 +1036,10 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	if (unlikely(!skb_unref(skb)))
 		return false;
-	DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);
+	DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
+			       u32_get_bits(reason,
+					    SKB_DROP_REASON_SUBSYS_MASK) >=
+				SKB_DROP_REASON_SUBSYS_NUM);
 
 	if (reason == SKB_CONSUMED)
 		trace_consume_skb(skb, __builtin_return_address(0));
@@ -1029,7 +1082,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
 		return;
 	}
 
-	skb_release_all(skb, reason);
+	skb_release_all(skb, reason, false);
 	sa->skb_array[sa->skb_count++] = skb;
 
 	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1203,7 +1256,7 @@ EXPORT_SYMBOL(consume_skb);
 void __consume_stateless_skb(struct sk_buff *skb)
 {
 	trace_consume_skb(skb, __builtin_return_address(0));
-	skb_release_data(skb, SKB_CONSUMED);
+	skb_release_data(skb, SKB_CONSUMED, false);
 	kfree_skbmem(skb);
 }
 
@@ -1226,9 +1279,9 @@ static void napi_skb_cache_put(struct sk_buff *skb)
 	}
 }
 
-void __kfree_skb_defer(struct sk_buff *skb)
+void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
 {
-	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+	skb_release_all(skb, reason, true);
 	napi_skb_cache_put(skb);
 }
 
@@ -1266,7 +1319,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}
 
-	skb_release_all(skb, SKB_CONSUMED);
+	skb_release_all(skb, SKB_CONSUMED, !!budget);
 	napi_skb_cache_put(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);
@@ -1397,7 +1450,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
  */
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 {
-	skb_release_all(dst, SKB_CONSUMED);
+	skb_release_all(dst, SKB_CONSUMED, false);
 	return __skb_clone(dst, src);
 }
 EXPORT_SYMBOL_GPL(skb_morph);
@@ -1705,7 +1758,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 {
 	int num_frags = skb_shinfo(skb)->nr_frags;
 	struct page *page, *head = NULL;
-	int i, new_frags;
+	int i, order, psize, new_frags;
 	u32 d_off;
 
 	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
@@ -1714,9 +1767,17 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 	if (!num_frags)
 		goto release;
 
-	new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	/* We might have to allocate high order pages, so compute what minimum
+	 * page order is needed.
+	 */
+	order = 0;
+	while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb))
+		order++;
+	psize = (PAGE_SIZE << order);
+
+	new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order);
 	for (i = 0; i < new_frags; i++) {
-		page = alloc_page(gfp_mask);
+		page = alloc_pages(gfp_mask | __GFP_COMP, order);
 		if (!page) {
 			while (head) {
 				struct page *next = (struct page *)page_private(head);
@@ -1743,11 +1804,11 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 			vaddr = kmap_atomic(p);
 
 			while (done < p_len) {
-				if (d_off == PAGE_SIZE) {
+				if (d_off == psize) {
 					d_off = 0;
 					page = (struct page *)page_private(page);
 				}
-				copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
+				copy = min_t(u32, psize - d_off, p_len - done);
 				memcpy(page_address(page) + d_off,
 				       vaddr + p_off + done, copy);
 				done += copy;
@@ -1763,7 +1824,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 
 	/* skb frags point to kernel buffers */
 	for (i = 0; i < new_frags - 1; i++) {
-		__skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
+		__skb_fill_page_desc(skb, i, head, 0, psize);
 		head = (struct page *)page_private(head);
 	}
 	__skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
@@ -2020,9 +2081,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
 
-		skb_release_data(skb, SKB_CONSUMED);
+		skb_release_data(skb, SKB_CONSUMED, false);
 	} else {
-		skb_free_head(skb);
+		skb_free_head(skb, false);
 	}
 	off = (data + nhead) - skb->head;
 
@@ -5162,6 +5223,9 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 			skb = alloc_skb(0, GFP_ATOMIC);
 	} else {
 		skb = skb_clone(orig_skb, GFP_ATOMIC);
+
+		if (skb_orphan_frags_rx(skb, GFP_ATOMIC))
+			return;
 	}
 	if (!skb)
 		return;
@@ -5189,6 +5253,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
 }
 EXPORT_SYMBOL_GPL(skb_tstamp_tx);
 
+#ifdef CONFIG_WIRELESS
 void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
 {
 	struct sock *sk = skb->sk;
@@ -5214,6 +5279,7 @@
 		kfree_skb(skb);
 }
 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+#endif /* CONFIG_WIRELESS */
 
 /**
  * skb_partial_csum_set - set up and verify partial csum values for packet
@@ -5599,18 +5665,18 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 	if (skb_cloned(to))
 		return false;
 
-	/* In general, avoid mixing slab allocated and page_pool allocated
-	 * pages within the same SKB. However when @to is not pp_recycle and
-	 * @from is cloned, we can transition frag pages from page_pool to
-	 * reference counted.
-	 *
-	 * On the other hand, don't allow coalescing two pp_recycle SKBs if
-	 * @from is cloned, in case the SKB is using page_pool fragment
+	/* In general, avoid mixing page_pool and non-page_pool allocated
+	 * pages within the same SKB. Additionally avoid dealing with clones
+	 * with page_pool pages, in case the SKB is using page_pool fragment
 	 * references (PP_FLAG_PAGE_FRAG). Since we only take full page
 	 * references for cloned SKBs at the moment that would result in
 	 * inconsistent reference counts.
+	 * In theory we could take full references if @from is cloned and
+	 * !@to->pp_recycle but its tricky (due to potential race with
+	 * the clone disappearing) and rare, so not worth dealing with.
 	 */
-	if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from)))
+	if (to->pp_recycle != from->pp_recycle ||
+	    (from->pp_recycle && skb_cloned(from)))
 		return false;
 
 	if (len <= skb_tailroom(to)) {
@@ -5941,7 +6007,6 @@ EXPORT_SYMBOL(skb_ensure_writable);
  */
 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
 {
-	struct vlan_hdr *vhdr;
 	int offset = skb->data - skb_mac_header(skb);
 	int err;
 
@@ -5957,13 +6022,8 @@ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
 
 	skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
 
-	vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
-	*vlan_tci = ntohs(vhdr->h_vlan_TCI);
+	vlan_remove_tag(skb, vlan_tci);
 
-	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
-	__skb_pull(skb, VLAN_HLEN);
-
-	vlan_set_encap_proto(skb, vhdr);
 	skb->mac_header += VLAN_HLEN;
 
 	if (skb_network_offset(skb) < ETH_HLEN)
@@ -6391,12 +6451,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
 			skb_frag_ref(skb, i);
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
-		skb_release_data(skb, SKB_CONSUMED);
+		skb_release_data(skb, SKB_CONSUMED, false);
 	} else {
 		/* we can reuse existing recount- all we did was
 		 * relocate values
 		 */
-		skb_free_head(skb);
+		skb_free_head(skb, false);
 	}
 
 	skb->head = data;
@@ -6531,7 +6591,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
 		skb_kfree_head(data, size);
 		return -ENOMEM;
 	}
-	skb_release_data(skb, SKB_CONSUMED);
+	skb_release_data(skb, SKB_CONSUMED, false);
 
 	skb->head = data;
 	skb->head_frag = 0;
@@ -6815,7 +6875,6 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 {
 	int cpu = skb->alloc_cpu;
 	struct softnet_data *sd;
-	unsigned long flags;
 	unsigned int defer_max;
 	bool kick;
 
@@ -6826,12 +6885,15 @@ nodefer:	__kfree_skb(skb);
 		return;
 	}
 
+	DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
+	DEBUG_NET_WARN_ON_ONCE(skb->destructor);
+
 	sd = &per_cpu(softnet_data, cpu);
 	defer_max = READ_ONCE(sysctl_skb_defer_max);
 	if (READ_ONCE(sd->defer_count) >= defer_max)
 		goto nodefer;
 
-	spin_lock_irqsave(&sd->defer_lock, flags);
+	spin_lock_bh(&sd->defer_lock);
 	/* Send an IPI every time queue reaches half capacity. */
 	kick = sd->defer_count == (defer_max >> 1);
 	/* Paired with the READ_ONCE() few lines above */
@@ -6840,7 +6902,7 @@ nodefer:	__kfree_skb(skb);
 	skb->next = sd->defer_list;
 	/* Paired with READ_ONCE() in skb_defer_free_flush() */
 	WRITE_ONCE(sd->defer_list, skb);
-	spin_unlock_irqrestore(&sd->defer_lock, flags);
+	spin_unlock_bh(&sd->defer_lock);
 
 	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
 	 * if we are unlucky enough (this seems very unlikely).
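A note on the new registration hooks in the first large hunk: drop_reasons_register_subsys() only publishes a pointer to a statically initialized struct drop_reason_list, and drop_reasons_unregister_subsys() clears it and waits for readers with synchronize_rcu(). A minimal sketch of a caller is below, assuming a hypothetical subsystem ID SKB_DROP_REASON_SUBSYS_FOO and made-up reason strings; only the two registration calls and the drop_reason_list layout (.reasons/.n_reasons) come from the patch itself.

/* Hypothetical subsystem "foo" publishing its drop reason strings.
 * Everything named foo_* and SKB_DROP_REASON_SUBSYS_FOO is made up for
 * illustration; the registration calls and struct drop_reason_list are
 * the ones added by the patch above.
 */
#include <linux/module.h>
#include <net/dropreason.h>

static const char * const foo_drop_reasons[] = {
	[0] = "FOO_NOT_SPECIFIED",
	[1] = "FOO_BAD_HEADER",
	[2] = "FOO_QUEUE_FULL",
};

/* must be statically allocated: register/unregister only publish the
 * pointer with RCU_INIT_POINTER(), they never copy the table
 */
static const struct drop_reason_list foo_drop_reason_list = {
	.reasons = foo_drop_reasons,
	.n_reasons = ARRAY_SIZE(foo_drop_reasons),
};

static int __init foo_init(void)
{
	drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_FOO,
				     &foo_drop_reason_list);
	return 0;
}

static void __exit foo_exit(void)
{
	/* the synchronize_rcu() inside guarantees no reader still sees
	 * the table once this returns
	 */
	drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_FOO);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_DESCRIPTION("drop reason subsystem registration sketch");
MODULE_LICENSE("GPL");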
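__kfree_skb_reason() now checks that the reason is not SKB_NOT_DROPPED_YET and that its subsystem bits (u32_get_bits(reason, SKB_DROP_REASON_SUBSYS_MASK)) are below SKB_DROP_REASON_SUBSYS_NUM, rather than bounding the value by SKB_DROP_REASON_MAX, since subsystem-private reasons live outside the core enum. A consumer that wants the string back has to split the value the same way and look the low bits up in the RCU-protected drop_reasons_by_subsys[] table. The sketch below illustrates that; lookup_drop_reason() is a made-up helper, and treating everything outside the subsystem mask as the per-subsystem index is an assumption for illustration, not code from the patch.

/* Sketch: map an extended drop reason back to its string. */
#include <linux/bitfield.h>
#include <linux/rcupdate.h>
#include <net/dropreason.h>

static const char *lookup_drop_reason(enum skb_drop_reason reason)
{
	const struct drop_reason_list *list;
	const char *str = NULL;
	u32 subsys, index;

	/* same field extraction as the DEBUG_NET_WARN_ON_ONCE() above */
	subsys = u32_get_bits(reason, SKB_DROP_REASON_SUBSYS_MASK);
	index = reason & ~SKB_DROP_REASON_SUBSYS_MASK;

	if (subsys >= SKB_DROP_REASON_SUBSYS_NUM)
		return NULL;

	rcu_read_lock();
	list = rcu_dereference(drop_reasons_by_subsys[subsys]);
	if (list && index < list->n_reasons)
		str = list->reasons[index];
	rcu_read_unlock();

	return str;	/* NULL if the subsystem never registered */
}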
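The skb_copy_ubufs() hunks change the copy target from a fixed set of order-0 pages to the smallest page order at which MAX_SKB_FRAGS pages can hold __skb_pagelen(skb), with psize and new_frags derived from that order. The arithmetic can be checked on its own; the small userspace program below reproduces it with assumed values for PAGE_SHIFT and MAX_SKB_FRAGS (both are configuration dependent in the kernel) and a made-up payload length.

/* Standalone demonstration of the page-order calculation added to
 * skb_copy_ubufs(): pick the smallest order such that MAX_SKB_FRAGS
 * pages of that order can hold the paged data, then size new_frags
 * from it.  PAGE_SHIFT and MAX_SKB_FRAGS are assumed values here.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define MAX_SKB_FRAGS	17

int main(void)
{
	unsigned long pagelen = 300UL * 1024;	/* stand-in for __skb_pagelen(skb) */
	unsigned long psize, new_frags;
	unsigned int order = 0;

	while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < pagelen)
		order++;
	psize = PAGE_SIZE << order;

	new_frags = (pagelen + psize - 1) >> (PAGE_SHIFT + order);
	printf("order=%u psize=%lu new_frags=%lu\n", order, psize, new_frags);
	return 0;
}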