author     Jakub Kicinski <[email protected]>  2023-04-14 18:56:13 -0700
committer  Jakub Kicinski <[email protected]>  2023-04-14 18:56:14 -0700
commit     e61caf04b9f8a2626714ffd12e734c555c758af4
tree       653aa7637f1c904fef4ffac8a27255428c0f2114
parent     c11d2e718c792468e67389b506451eddf26c2dac
parent     294e39e0d03449f32b0723d3fd99ab5c23881c05
Merge branch 'page_pool-allow-caching-from-safely-localized-napi'
Jakub Kicinski says:

====================
page_pool: allow caching from safely localized NAPI

I went back to the explicit "are we in NAPI" method, mostly because
I don't like having both around :( (even tho I maintain that
in_softirq() && !in_hardirq() is as safe, as softirqs do not nest).

Still returning the skbs to a CPU, tho, not to the NAPI instance.
I reckon we could create a small refcounted struct per NAPI instance
which would allow sockets and other users to hold a persistent and
safe reference. But that's a bigger change, and I get 90+% recycling
thru the cache with just these patches (for RR and streaming tests
with 100% CPU use it's almost 100%).

Some numbers for streaming test with 100% CPU use (from previous
version, but really they perform the same):

                            HW-GRO                   page=page
                      before      after         before      after
recycle:
cached:                    0  138669686              0  150197505
cache_full:                0     223391              0      74582
ring:              138551933    9997191      149299454          0
ring_full:                 0        488           3154     127590
released_refcnt:           0          0              0          0

alloc:
fast:              136491361  148615710      146969587  150322859
slow:                   1772       1799            144        105
slow_high_order:           0          0              0          0
empty:                  1772       1799            144        105
refill:              2165245     156302        2332880       2128
waive:                     0          0              0          0

v1: https://lore.kernel.org/all/[email protected]/
rfcv2: https://lore.kernel.org/all/[email protected]/
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
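In practice the new knob is a single extra field at page pool registration time: a driver that owns both the RX ring and its NAPI instance points pp_params.napi at that instance, declaring it the pool's sole consumer. A minimal driver-side sketch, modelled on the documentation and bnxt hunks below (priv, rxr and DESC_NUM are placeholder names, not part of this series):

	struct page_pool_params pp_params = { 0 };
	struct page_pool *page_pool;

	pp_params.pool_size = DESC_NUM;
	pp_params.nid = NUMA_NO_NODE;
	pp_params.dev = priv->dev;
	/* Only safe if all page_pool_put() calls for this pool happen either
	 * from this NAPI's poll method or after the NAPI has been disabled.
	 */
	pp_params.napi = &rxr->napi;
	pp_params.dma_dir = DMA_FROM_DEVICE;

	page_pool = page_pool_create(&pp_params);

With that in place, the free path (see net/core/page_pool.c below) recycles straight into the pool's lockless cache whenever the skb is freed on the CPU where that NAPI is currently scheduled, and falls back to the ptr_ring as before otherwise.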
-rw-r--r--  Documentation/networking/page_pool.rst       1
-rw-r--r--  drivers/net/ethernet/broadcom/bnxt/bnxt.c    1
-rw-r--r--  include/linux/netdevice.h                    3
-rw-r--r--  include/linux/skbuff.h                      20
-rw-r--r--  include/net/page_pool.h                      3
-rw-r--r--  net/core/dev.c                               3
-rw-r--r--  net/core/page_pool.c                        15
-rw-r--r--  net/core/skbuff.c                           42
8 files changed, 58 insertions, 30 deletions
diff --git a/Documentation/networking/page_pool.rst b/Documentation/networking/page_pool.rst
index 30f1344e7cca..873efd97f822 100644
--- a/Documentation/networking/page_pool.rst
+++ b/Documentation/networking/page_pool.rst
@@ -165,6 +165,7 @@ Registration
pp_params.pool_size = DESC_NUM;
pp_params.nid = NUMA_NO_NODE;
pp_params.dev = priv->dev;
+ pp_params.napi = napi; /* only if locking is tied to NAPI */
pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
page_pool = page_pool_create(&pp_params);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 92289ab2f34a..8fb9d1bbe56f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3211,6 +3211,7 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
pp.pool_size = bp->rx_ring_size;
pp.nid = dev_to_node(&bp->pdev->dev);
+ pp.napi = &rxr->bnapi->napi;
pp.dev = &bp->pdev->dev;
pp.dma_dir = DMA_BIDIRECTIONAL;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 96d27d558b0c..203c0df2046c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -360,8 +360,11 @@ struct napi_struct {
unsigned long gro_bitmask;
int (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
+ /* CPU actively polling if netpoll is configured */
int poll_owner;
#endif
+ /* CPU on which NAPI has been scheduled for processing */
+ int list_owner;
struct net_device *dev;
struct gro_list gro_hash[GRO_HASH_BUCKETS];
struct sk_buff *skb;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 494a23a976b0..a823ec3aa326 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3386,6 +3386,18 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
}
+static inline void
+napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
+{
+ struct page *page = skb_frag_page(frag);
+
+#ifdef CONFIG_PAGE_POOL
+ if (recycle && page_pool_return_skb_page(page, napi_safe))
+ return;
+#endif
+ put_page(page);
+}
+
/**
* __skb_frag_unref - release a reference on a paged fragment.
* @frag: the paged fragment
@@ -3396,13 +3408,7 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
*/
static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
{
- struct page *page = skb_frag_page(frag);
-
-#ifdef CONFIG_PAGE_POOL
- if (recycle && page_pool_return_skb_page(page))
- return;
-#endif
- put_page(page);
+ napi_frag_unref(frag, recycle, false);
}
/**
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index ddfa0b328677..91b808dade82 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -77,6 +77,7 @@ struct page_pool_params {
unsigned int pool_size;
int nid; /* Numa node id to allocate pages from */
struct device *dev; /* device, for DMA pre-mapping purposes */
+ struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */
enum dma_data_direction dma_dir; /* DMA mapping direction */
unsigned int max_len; /* max DMA sync memory size */
unsigned int offset; /* DMA addr offset */
@@ -239,7 +240,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
return pool->p.dma_dir;
}
-bool page_pool_return_skb_page(struct page *page);
+bool page_pool_return_skb_page(struct page *page, bool napi_safe);
struct page_pool *page_pool_create(const struct page_pool_params *params);
diff --git a/net/core/dev.c b/net/core/dev.c
index c7f13742b56c..8aea68275172 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4359,6 +4359,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
}
list_add_tail(&napi->poll_list, &sd->poll_list);
+ WRITE_ONCE(napi->list_owner, smp_processor_id());
/* If not called from net_rx_action()
* we have to raise NET_RX_SOFTIRQ.
*/
@@ -6069,6 +6070,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
list_del_init(&n->poll_list);
local_irq_restore(flags);
}
+ WRITE_ONCE(n->list_owner, -1);
val = READ_ONCE(n->state);
do {
@@ -6384,6 +6386,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
+ napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 193c18799865..2f6bf422ed30 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -19,6 +19,7 @@
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
+#include <linux/netdevice.h>
#include <trace/events/page_pool.h>
@@ -874,9 +875,11 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
}
EXPORT_SYMBOL(page_pool_update_nid);
-bool page_pool_return_skb_page(struct page *page)
+bool page_pool_return_skb_page(struct page *page, bool napi_safe)
{
+ struct napi_struct *napi;
struct page_pool *pp;
+ bool allow_direct;
page = compound_head(page);
@@ -892,12 +895,20 @@ bool page_pool_return_skb_page(struct page *page)
pp = page->pp;
+ /* Allow direct recycle if we have reasons to believe that we are
+ * in the same context as the consumer would run, so there's
+ * no possible race.
+ */
+ napi = pp->p.napi;
+ allow_direct = napi_safe && napi &&
+ READ_ONCE(napi->list_owner) == smp_processor_id();
+
/* Driver set this to memory recycling info. Reset it on recycle.
* This will *not* work for NIC using a split-page memory model.
* The page will be returned to the pool here regardless of the
* 'flipped' fragment being in use or not.
*/
- page_pool_put_full_page(pp, page, false);
+ page_pool_put_full_page(pp, page, allow_direct);
return true;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 78238a13dbcf..ef81452759be 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -839,11 +839,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
-static bool skb_pp_recycle(struct sk_buff *skb, void *data)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
{
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
return false;
- return page_pool_return_skb_page(virt_to_page(data));
+ return page_pool_return_skb_page(virt_to_page(data), napi_safe);
}
static void skb_kfree_head(void *head, unsigned int end_offset)
@@ -856,12 +856,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
kfree(head);
}
-static void skb_free_head(struct sk_buff *skb)
+static void skb_free_head(struct sk_buff *skb, bool napi_safe)
{
unsigned char *head = skb->head;
if (skb->head_frag) {
- if (skb_pp_recycle(skb, head))
+ if (skb_pp_recycle(skb, head, napi_safe))
return;
skb_free_frag(head);
} else {
@@ -869,7 +869,8 @@ static void skb_free_head(struct sk_buff *skb)
}
}
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
+ bool napi_safe)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int i;
@@ -888,13 +889,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
}
for (i = 0; i < shinfo->nr_frags; i++)
- __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+ napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
free_head:
if (shinfo->frag_list)
kfree_skb_list_reason(shinfo->frag_list, reason);
- skb_free_head(skb);
+ skb_free_head(skb, napi_safe);
exit:
/* When we clone an SKB we copy the reycling bit. The pp_recycle
* bit is only set on the head though, so in order to avoid races
@@ -955,11 +956,12 @@ void skb_release_head_state(struct sk_buff *skb)
}
/* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
+ bool napi_safe)
{
skb_release_head_state(skb);
if (likely(skb->head))
- skb_release_data(skb, reason);
+ skb_release_data(skb, reason, napi_safe);
}
/**
@@ -973,7 +975,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
void __kfree_skb(struct sk_buff *skb)
{
- skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+ skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
@@ -1027,7 +1029,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
return;
}
- skb_release_all(skb, reason);
+ skb_release_all(skb, reason, false);
sa->skb_array[sa->skb_count++] = skb;
if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1201,7 +1203,7 @@ EXPORT_SYMBOL(consume_skb);
void __consume_stateless_skb(struct sk_buff *skb)
{
trace_consume_skb(skb, __builtin_return_address(0));
- skb_release_data(skb, SKB_CONSUMED);
+ skb_release_data(skb, SKB_CONSUMED, false);
kfree_skbmem(skb);
}
@@ -1226,7 +1228,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
void __kfree_skb_defer(struct sk_buff *skb)
{
- skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+ skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, true);
napi_skb_cache_put(skb);
}
@@ -1264,7 +1266,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- skb_release_all(skb, SKB_CONSUMED);
+ skb_release_all(skb, SKB_CONSUMED, !!budget);
napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
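The !!budget argument above works because napi_consume_skb() already receives the NAPI poll budget from its callers, and a zero budget is the existing convention for non-NAPI contexts such as netpoll; only a non-zero budget can therefore take the napi-safe path. A hypothetical TX-completion caller, for illustration only (my_clean_tx_ring() and my_next_completed_skb() are made-up names):

	/* Sketch: budget is non-zero only when invoked from the NAPI poll
	 * method, so napi_consume_skb() may treat the free as napi-safe.
	 */
	static void my_clean_tx_ring(struct my_tx_ring *txr, int budget)
	{
		struct sk_buff *skb;

		while ((skb = my_next_completed_skb(txr)) != NULL)
			napi_consume_skb(skb, budget);
	}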
@@ -1395,7 +1397,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
*/
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
- skb_release_all(dst, SKB_CONSUMED);
+ skb_release_all(dst, SKB_CONSUMED, false);
return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
@@ -2018,9 +2020,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb, SKB_CONSUMED);
+ skb_release_data(skb, SKB_CONSUMED, false);
} else {
- skb_free_head(skb);
+ skb_free_head(skb, false);
}
off = (data + nhead) - skb->head;
@@ -6389,12 +6391,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb, SKB_CONSUMED);
+ skb_release_data(skb, SKB_CONSUMED, false);
} else {
/* we can reuse existing refcount - all we did was
* relocate values
*/
- skb_free_head(skb);
+ skb_free_head(skb, false);
}
skb->head = data;
@@ -6529,7 +6531,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb_kfree_head(data, size);
return -ENOMEM;
}
- skb_release_data(skb, SKB_CONSUMED);
+ skb_release_data(skb, SKB_CONSUMED, false);
skb->head = data;
skb->head_frag = 0;