diff options
Diffstat (limited to 'net/core/skbuff.c')
| -rw-r--r-- | net/core/skbuff.c | 179 |
1 files changed, 124 insertions, 55 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index eb7d33b41e71..2112146092bf 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -58,6 +58,7 @@ #include <linux/scatterlist.h> #include <linux/errqueue.h> #include <linux/prefetch.h> +#include <linux/bitfield.h> #include <linux/if_vlan.h> #include <linux/mpls.h> #include <linux/kcov.h> @@ -72,6 +73,7 @@ #include <net/mptcp.h> #include <net/mctp.h> #include <net/page_pool.h> +#include <net/dropreason.h> #include <linux/uaccess.h> #include <trace/events/skb.h> @@ -122,11 +124,59 @@ EXPORT_SYMBOL(sysctl_max_skb_frags); #undef FN #define FN(reason) [SKB_DROP_REASON_##reason] = #reason, -const char * const drop_reasons[] = { +static const char * const drop_reasons[] = { [SKB_CONSUMED] = "CONSUMED", DEFINE_DROP_REASON(FN, FN) }; -EXPORT_SYMBOL(drop_reasons); + +static const struct drop_reason_list drop_reasons_core = { + .reasons = drop_reasons, + .n_reasons = ARRAY_SIZE(drop_reasons), +}; + +const struct drop_reason_list __rcu * +drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = { + [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core), +}; +EXPORT_SYMBOL(drop_reasons_by_subsys); + +/** + * drop_reasons_register_subsys - register another drop reason subsystem + * @subsys: the subsystem to register, must not be the core + * @list: the list of drop reasons within the subsystem, must point to + * a statically initialized list + */ +void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys, + const struct drop_reason_list *list) +{ + if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || + subsys >= ARRAY_SIZE(drop_reasons_by_subsys), + "invalid subsystem %d\n", subsys)) + return; + + /* must point to statically allocated memory, so INIT is OK */ + RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list); +} +EXPORT_SYMBOL_GPL(drop_reasons_register_subsys); + +/** + * drop_reasons_unregister_subsys - unregister a drop reason subsystem + * @subsys: the subsystem to remove, must not be the core + * + * Note: This will synchronize_rcu() to ensure no users when it returns. + */ +void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys) +{ + if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || + subsys >= ARRAY_SIZE(drop_reasons_by_subsys), + "invalid subsystem %d\n", subsys)) + return; + + RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys); /** * skb_panic - private function for out-of-line support @@ -420,10 +470,9 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb = __build_skb(data, frag_size); - if (skb && frag_size) { + if (likely(skb && frag_size)) { skb->head_frag = 1; - if (page_is_pfmemalloc(virt_to_head_page(data))) - skb->pfmemalloc = 1; + skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } @@ -445,8 +494,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, if (frag_size) { skb->head_frag = 1; - if (page_is_pfmemalloc(virt_to_head_page(data))) - skb->pfmemalloc = 1; + skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } @@ -517,18 +565,16 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, #ifdef HAVE_SKB_SMALL_HEAD_CACHE if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && !(flags & KMALLOC_NOT_NORMAL_BITS)) { - - /* skb_small_head_cache has non power of two size, - * likely forcing SLUB to use order-3 pages. - * We deliberately attempt a NOMEMALLOC allocation only. - */ obj = kmem_cache_alloc_node(skb_small_head_cache, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); - if (obj) { - *size = SKB_SMALL_HEAD_CACHE_SIZE; + *size = SKB_SMALL_HEAD_CACHE_SIZE; + if (obj || !(gfp_pfmemalloc_allowed(flags))) goto out; - } + /* Try again but now we are using pfmemalloc reserves */ + ret_pfmemalloc = true; + obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node); + goto out; } #endif *size = obj_size = kmalloc_size_roundup(obj_size); @@ -843,11 +889,11 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } -static bool skb_pp_recycle(struct sk_buff *skb, void *data) +static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return page_pool_return_skb_page(virt_to_page(data)); + return page_pool_return_skb_page(virt_to_page(data), napi_safe); } static void skb_kfree_head(void *head, unsigned int end_offset) @@ -860,12 +906,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset) kfree(head); } -static void skb_free_head(struct sk_buff *skb) +static void skb_free_head(struct sk_buff *skb, bool napi_safe) { unsigned char *head = skb->head; if (skb->head_frag) { - if (skb_pp_recycle(skb, head)) + if (skb_pp_recycle(skb, head, napi_safe)) return; skb_free_frag(head); } else { @@ -873,7 +919,8 @@ static void skb_free_head(struct sk_buff *skb) } } -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, + bool napi_safe) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; @@ -892,13 +939,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) } for (i = 0; i < shinfo->nr_frags; i++) - __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); + napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe); free_head: if (shinfo->frag_list) kfree_skb_list_reason(shinfo->frag_list, reason); - skb_free_head(skb); + skb_free_head(skb, napi_safe); exit: /* When we clone an SKB we copy the reycling bit. The pp_recycle * bit is only set on the head though, so in order to avoid races @@ -959,11 +1006,12 @@ void skb_release_head_state(struct sk_buff *skb) } /* Free everything but the sk_buff shell. */ -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, + bool napi_safe) { skb_release_head_state(skb); if (likely(skb->head)) - skb_release_data(skb, reason); + skb_release_data(skb, reason, napi_safe); } /** @@ -977,7 +1025,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) void __kfree_skb(struct sk_buff *skb) { - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); @@ -988,7 +1036,10 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) if (unlikely(!skb_unref(skb))) return false; - DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); + DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET || + u32_get_bits(reason, + SKB_DROP_REASON_SUBSYS_MASK) >= + SKB_DROP_REASON_SUBSYS_NUM); if (reason == SKB_CONSUMED) trace_consume_skb(skb, __builtin_return_address(0)); @@ -1031,7 +1082,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb, return; } - skb_release_all(skb, reason); + skb_release_all(skb, reason, false); sa->skb_array[sa->skb_count++] = skb; if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { @@ -1205,7 +1256,7 @@ EXPORT_SYMBOL(consume_skb); void __consume_stateless_skb(struct sk_buff *skb) { trace_consume_skb(skb, __builtin_return_address(0)); - skb_release_data(skb, SKB_CONSUMED); + skb_release_data(skb, SKB_CONSUMED, false); kfree_skbmem(skb); } @@ -1228,9 +1279,9 @@ static void napi_skb_cache_put(struct sk_buff *skb) } } -void __kfree_skb_defer(struct sk_buff *skb) +void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) { - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); + skb_release_all(skb, reason, true); napi_skb_cache_put(skb); } @@ -1268,7 +1319,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - skb_release_all(skb, SKB_CONSUMED); + skb_release_all(skb, SKB_CONSUMED, !!budget); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); @@ -1399,7 +1450,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg); */ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { - skb_release_all(dst, SKB_CONSUMED); + skb_release_all(dst, SKB_CONSUMED, false); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); @@ -2022,9 +2073,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED); + skb_release_data(skb, SKB_CONSUMED, false); } else { - skb_free_head(skb); + skb_free_head(skb, false); } off = (data + nhead) - skb->head; @@ -2082,6 +2133,7 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) } EXPORT_SYMBOL(skb_realloc_headroom); +/* Note: We plan to rework this in linux-6.4 */ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { unsigned int saved_end_offset, saved_truesize; @@ -2100,6 +2152,22 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) if (likely(skb_end_offset(skb) == saved_end_offset)) return 0; +#ifdef HAVE_SKB_SMALL_HEAD_CACHE + /* We can not change skb->end if the original or new value + * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head(). + */ + if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM || + skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) { + /* We think this path should not be taken. + * Add a temporary trace to warn us just in case. + */ + pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n", + saved_end_offset, skb_end_offset(skb)); + WARN_ON_ONCE(1); + return 0; + } +#endif + shinfo = skb_shinfo(skb); /* We are about to change back skb->end, @@ -5147,6 +5215,9 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, skb = alloc_skb(0, GFP_ATOMIC); } else { skb = skb_clone(orig_skb, GFP_ATOMIC); + + if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) + return; } if (!skb) return; @@ -5174,6 +5245,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, } EXPORT_SYMBOL_GPL(skb_tstamp_tx); +#ifdef CONFIG_WIRELESS void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) { struct sock *sk = skb->sk; @@ -5199,6 +5271,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); +#endif /* CONFIG_WIRELESS */ /** * skb_partial_csum_set - set up and verify partial csum values for packet @@ -5584,18 +5657,18 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false; - /* In general, avoid mixing slab allocated and page_pool allocated - * pages within the same SKB. However when @to is not pp_recycle and - * @from is cloned, we can transition frag pages from page_pool to - * reference counted. - * - * On the other hand, don't allow coalescing two pp_recycle SKBs if - * @from is cloned, in case the SKB is using page_pool fragment + /* In general, avoid mixing page_pool and non-page_pool allocated + * pages within the same SKB. Additionally avoid dealing with clones + * with page_pool pages, in case the SKB is using page_pool fragment * references (PP_FLAG_PAGE_FRAG). Since we only take full page * references for cloned SKBs at the moment that would result in * inconsistent reference counts. + * In theory we could take full references if @from is cloned and + * !@to->pp_recycle but its tricky (due to potential race with + * the clone disappearing) and rare, so not worth dealing with. */ - if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from))) + if (to->pp_recycle != from->pp_recycle || + (from->pp_recycle && skb_cloned(from))) return false; if (len <= skb_tailroom(to)) { @@ -5926,7 +5999,6 @@ EXPORT_SYMBOL(skb_ensure_writable); */ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { - struct vlan_hdr *vhdr; int offset = skb->data - skb_mac_header(skb); int err; @@ -5942,13 +6014,8 @@ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); - vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); - *vlan_tci = ntohs(vhdr->h_vlan_TCI); - - memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); - __skb_pull(skb, VLAN_HLEN); + vlan_remove_tag(skb, vlan_tci); - vlan_set_encap_proto(skb, vhdr); skb->mac_header += VLAN_HLEN; if (skb_network_offset(skb) < ETH_HLEN) @@ -6376,12 +6443,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED); + skb_release_data(skb, SKB_CONSUMED, false); } else { /* we can reuse existing recount- all we did was * relocate values */ - skb_free_head(skb); + skb_free_head(skb, false); } skb->head = data; @@ -6516,7 +6583,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_kfree_head(data, size); return -ENOMEM; } - skb_release_data(skb, SKB_CONSUMED); + skb_release_data(skb, SKB_CONSUMED, false); skb->head = data; skb->head_frag = 0; @@ -6800,7 +6867,6 @@ void skb_attempt_defer_free(struct sk_buff *skb) { int cpu = skb->alloc_cpu; struct softnet_data *sd; - unsigned long flags; unsigned int defer_max; bool kick; @@ -6811,12 +6877,15 @@ nodefer: __kfree_skb(skb); return; } + DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); + DEBUG_NET_WARN_ON_ONCE(skb->destructor); + sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(sysctl_skb_defer_max); if (READ_ONCE(sd->defer_count) >= defer_max) goto nodefer; - spin_lock_irqsave(&sd->defer_lock, flags); + spin_lock_bh(&sd->defer_lock); /* Send an IPI every time queue reaches half capacity. */ kick = sd->defer_count == (defer_max >> 1); /* Paired with the READ_ONCE() few lines above */ @@ -6825,7 +6894,7 @@ nodefer: __kfree_skb(skb); skb->next = sd->defer_list; /* Paired with READ_ONCE() in skb_defer_free_flush() */ WRITE_ONCE(sd->defer_list, skb); - spin_unlock_irqrestore(&sd->defer_lock, flags); + spin_unlock_bh(&sd->defer_lock); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). |