about | summary | refs | log | tree | commit | diff
path: root/net/core/page_pool.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/core/page_pool.c')
-rw-r--r-- net/core/page_pool.c 231
1 files changed, 168 insertions, 63 deletions
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5e409b98aba0..8bcc7014a61a 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -5,6 +5,7 @@
* Copyright (C) 2016 Red Hat, Inc.
*/
+#include <linux/error-injection.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -23,12 +24,16 @@
#include <trace/events/page_pool.h>
+#include "page_pool_priv.h"
+
#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)
-#define BIAS_MAX LONG_MAX
+#define BIAS_MAX (LONG_MAX >> 1)
#ifdef CONFIG_PAGE_POOL_STATS
+static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
+
/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
@@ -69,7 +74,7 @@ static const char pp_stats[][ETH_GSTRING_LEN] = {
* is passed to this API which is filled in. The caller can then report
* those stats to the user (perhaps via ethtool, debugfs, etc.).
*/
-bool page_pool_get_stats(struct page_pool *pool,
+bool page_pool_get_stats(const struct page_pool *pool,
struct page_pool_stats *stats)
{
int cpu = 0;
@@ -119,9 +124,9 @@ int page_pool_ethtool_stats_get_count(void)
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
-u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
{
- struct page_pool_stats *pool_stats = stats;
+ const struct page_pool_stats *pool_stats = stats;
*data++ = pool_stats->alloc_stats.fast;
*data++ = pool_stats->alloc_stats.slow;
@@ -169,11 +174,15 @@ static void page_pool_producer_unlock(struct page_pool *pool,
}
static int page_pool_init(struct page_pool *pool,
- const struct page_pool_params *params)
+ const struct page_pool_params *params,
+ int cpuid)
{
unsigned int ring_qsize = 1024; /* Default */
- memcpy(&pool->p, params, sizeof(pool->p));
+ memcpy(&pool->p, &params->fast, sizeof(pool->p));
+ memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
+
+ pool->cpuid = cpuid;
/* Validate only known flags were used */
if (pool->p.flags & ~(PP_FLAG_ALL))
@@ -211,14 +220,29 @@ static int page_pool_init(struct page_pool *pool,
*/
}
+ pool->has_init_callback = !!pool->slow.init_callback;
+
#ifdef CONFIG_PAGE_POOL_STATS
- pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
- if (!pool->recycle_stats)
- return -ENOMEM;
+ if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) {
+ pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
+ if (!pool->recycle_stats)
+ return -ENOMEM;
+ } else {
+ /* For system page pool instance we use a singular stats object
+ * instead of allocating a separate percpu variable for each
+ * (also percpu) page pool instance.
+ */
+ pool->recycle_stats = &pp_system_recycle_stats;
+ }
#endif
- if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
+ if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
+#ifdef CONFIG_PAGE_POOL_STATS
+ if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
+ free_percpu(pool->recycle_stats);
+#endif
return -ENOMEM;
+ }
atomic_set(&pool->pages_state_release_cnt, 0);
@@ -231,11 +255,26 @@ static int page_pool_init(struct page_pool *pool,
return 0;
}
+/* Release the resources attached to an initialised pool: clean up the
+ * ptr_ring, drop the device reference when PP_FLAG_DMA_MAP is set and,
+ * with CONFIG_PAGE_POOL_STATS, free the per-cpu recycle stats of
+ * non-system pools. The pool structure itself is not freed here; the
+ * callers kfree() it afterwards.
+ * NOTE(review): put_device() presumably pairs with a get_device() taken
+ * in page_pool_init(); that part is not visible in this diff - confirm.
+ */
+static void page_pool_uninit(struct page_pool *pool)
+{
+ ptr_ring_cleanup(&pool->ring, NULL);
+
+ if (pool->p.flags & PP_FLAG_DMA_MAP)
+ put_device(pool->p.dev);
+
+#ifdef CONFIG_PAGE_POOL_STATS
+ /* System pools point at the shared pp_system_recycle_stats object
+ * instead of an alloc_percpu() allocation, so only non-system pools
+ * own stats that must be freed.
+ */
+ if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
+ free_percpu(pool->recycle_stats);
+#endif
+}
+
/**
- * page_pool_create() - create a page pool.
+ * page_pool_create_percpu() - create a page pool for a given cpu.
* @params: parameters, see struct page_pool_params
+ * @cpuid: cpu identifier
*/
-struct page_pool *page_pool_create(const struct page_pool_params *params)
+struct page_pool *
+page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
{
struct page_pool *pool;
int err;
@@ -244,14 +283,32 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
if (!pool)
return ERR_PTR(-ENOMEM);
- err = page_pool_init(pool, params);
- if (err < 0) {
- pr_warn("%s() gave up with errno %d\n", __func__, err);
- kfree(pool);
- return ERR_PTR(err);
- }
+ err = page_pool_init(pool, params, cpuid);
+ if (err < 0)
+ goto err_free;
+
+ err = page_pool_list(pool);
+ if (err)
+ goto err_uninit;
return pool;
+
+err_uninit:
+ page_pool_uninit(pool);
+err_free:
+ pr_warn("%s() gave up with errno %d\n", __func__, err);
+ kfree(pool);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(page_pool_create_percpu);
+
+/**
+ * page_pool_create() - create a page pool
+ * @params: parameters, see struct page_pool_params
+ *
+ * Wrapper around page_pool_create_percpu() that passes -1 as the cpuid,
+ * the same value page_pool_disable_direct_recycling() writes to
+ * pool->cpuid to turn off cpuid-based direct recycling.
+ *
+ * Return: whatever page_pool_create_percpu() returns, i.e. the new pool
+ * or an ERR_PTR() value on failure.
+ */
+struct page_pool *page_pool_create(const struct page_pool_params *params)
+{
+ return page_pool_create_percpu(params, -1);
}
EXPORT_SYMBOL(page_pool_create);
@@ -327,8 +384,8 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
return page;
}
-static void page_pool_dma_sync_for_device(struct page_pool *pool,
- struct page *page,
+static void page_pool_dma_sync_for_device(const struct page_pool *pool,
+ const struct page *page,
unsigned int dma_sync_size)
{
dma_addr_t dma_addr = page_pool_get_dma_addr(page);
@@ -384,8 +441,8 @@ static void page_pool_set_pp_info(struct page_pool *pool,
* the overhead is negligible.
*/
page_pool_fragment_page(page, 1);
- if (pool->p.init_callback)
- pool->p.init_callback(page, pool->p.init_arg);
+ if (pool->has_init_callback)
+ pool->slow.init_callback(page, pool->slow.init_arg);
}
static void page_pool_clear_pp_info(struct page *page)
@@ -494,13 +551,14 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
+ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
/* Calculate distance between two u32 values, valid if distance is below 2^(31)
* https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
*/
#define _distance(a, b) (s32)((a) - (b))
-static s32 page_pool_inflight(struct page_pool *pool)
+s32 page_pool_inflight(const struct page_pool *pool, bool strict)
{
u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
@@ -508,27 +566,27 @@ static s32 page_pool_inflight(struct page_pool *pool)
inflight = _distance(hold_cnt, release_cnt);
- trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
- WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+ if (strict) {
+ trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
+ WARN(inflight < 0, "Negative(%d) inflight packet-pages",
+ inflight);
+ } else {
+ inflight = max(0, inflight);
+ }
return inflight;
}
-/* Disconnects a page (from a page_pool). API users can have a need
- * to disconnect a page (from a page_pool), to allow it to be used as
- * a regular page (that will eventually be returned to the normal
- * page-allocator via put_page).
- */
-static void page_pool_return_page(struct page_pool *pool, struct page *page)
+static __always_inline
+void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
{
dma_addr_t dma;
- int count;
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
/* Always account for inflight pages, even if we didn't
* map them
*/
- goto skip_dma_unmap;
+ return;
dma = page_pool_get_dma_addr(page);
@@ -537,7 +595,19 @@ static void page_pool_return_page(struct page_pool *pool, struct page *page)
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
page_pool_set_dma_addr(page, 0);
-skip_dma_unmap:
+}
+
+/* Disconnects a page (from a page_pool). API users can have a need
+ * to disconnect a page (from a page_pool), to allow it to be used as
+ * a regular page (that will eventually be returned to the normal
+ * page-allocator via put_page).
+ */
+void page_pool_return_page(struct page_pool *pool, struct page *page)
+{
+ int count;
+
+ __page_pool_release_page_dma(pool, page);
+
page_pool_clear_pp_info(page);
/* This may be the last page returned, releasing the pool, so
@@ -589,6 +659,11 @@ static bool page_pool_recycle_in_cache(struct page *page,
return true;
}
+/* Recycling test shared by __page_pool_put_page() and
+ * page_pool_drain_frag(): true only when the page reference count is
+ * exactly 1 and the page is not a pfmemalloc page.
+ */
+static bool __page_pool_page_can_be_recycled(const struct page *page)
+{
+ return page_ref_count(page) == 1 && !page_is_pfmemalloc(page);
+}
+
/* If the page refcnt == 1, this will try to recycle the page.
* if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
* the configured size min(dma_sync_size, pool->max_len).
@@ -610,15 +685,14 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
* page is NOT reusable when allocated when system is under
* some pressure. (page_is_pfmemalloc)
*/
- if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
+ if (likely(__page_pool_page_can_be_recycled(page))) {
/* Read barrier done in page_ref_count / READ_ONCE */
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
page_pool_dma_sync_for_device(pool, page,
dma_sync_size);
- if (allow_direct && in_softirq() &&
- page_pool_recycle_in_cache(page, pool))
+ if (allow_direct && page_pool_recycle_in_cache(page, pool))
return NULL;
/* Page found as candidate for recycling */
@@ -643,9 +717,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
return NULL;
}
-void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
- unsigned int dma_sync_size, bool allow_direct)
+/* Decide whether the caller may recycle directly into the pool cache.
+ *
+ * Returns false outside softirq context. Otherwise returns true when the
+ * current CPU matches pool->cpuid, or when the pool has a NAPI instance
+ * whose list_owner is the current CPU; false in every other case.
+ *
+ * pool->cpuid and pool->p.napi are read with READ_ONCE() because
+ * page_pool_disable_direct_recycling() updates them with WRITE_ONCE().
+ */
+static bool page_pool_napi_local(const struct page_pool *pool)
+{
+ const struct napi_struct *napi;
+ u32 cpuid;
+
+ if (unlikely(!in_softirq()))
+ return false;
+
+ /* Allow direct recycle if we have reasons to believe that we are
+ * in the same context as the consumer would run, so there's
+ * no possible race.
+ * __page_pool_put_page() makes sure we're not in hardirq context
+ * and interrupts are enabled prior to accessing the cache.
+ */
+ cpuid = smp_processor_id();
+ if (READ_ONCE(pool->cpuid) == cpuid)
+ return true;
+
+ napi = READ_ONCE(pool->p.napi);
+
+ /* A NULL napi (never set, or already cleared) means no direct recycle */
+ return napi && READ_ONCE(napi->list_owner) == cpuid;
+}
+
+void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
{
+ if (!allow_direct)
+ allow_direct = page_pool_napi_local(pool);
+
page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
if (page && !page_pool_recycle_in_ring(pool, page)) {
/* Cache full, fallback to free pages */
@@ -653,7 +753,7 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
page_pool_return_page(pool, page);
}
}
-EXPORT_SYMBOL(page_pool_put_defragged_page);
+EXPORT_SYMBOL(page_pool_put_unrefed_page);
/**
* page_pool_put_page_bulk() - release references on multiple pages
@@ -674,22 +774,25 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
int count)
{
int i, bulk_len = 0;
+ bool allow_direct;
bool in_softirq;
+ allow_direct = page_pool_napi_local(pool);
+
for (i = 0; i < count; i++) {
struct page *page = virt_to_head_page(data[i]);
/* It is not the last user for the page frag case */
- if (!page_pool_is_last_frag(page))
+ if (!page_pool_is_last_ref(page))
continue;
- page = __page_pool_put_page(pool, page, -1, false);
+ page = __page_pool_put_page(pool, page, -1, allow_direct);
/* Approved for bulk recycling in ptr_ring cache */
if (page)
data[bulk_len++] = page;
}
- if (unlikely(!bulk_len))
+ if (!bulk_len)
return;
/* Bulk producer into ptr_ring page_pool cache */
@@ -722,10 +825,10 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
long drain_count = BIAS_MAX - pool->frag_users;
/* Some user is still using the page frag */
- if (likely(page_pool_defrag_page(page, drain_count)))
+ if (likely(page_pool_unref_page(page, drain_count)))
return NULL;
- if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
+ if (__page_pool_page_can_be_recycled(page)) {
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
page_pool_dma_sync_for_device(pool, page, -1);
@@ -743,7 +846,7 @@ static void page_pool_free_frag(struct page_pool *pool)
pool->frag_page = NULL;
- if (!page || page_pool_defrag_page(page, drain_count))
+ if (!page || page_pool_unref_page(page, drain_count))
return;
page_pool_return_page(pool, page);
@@ -814,14 +917,8 @@ static void __page_pool_destroy(struct page_pool *pool)
if (pool->disconnect)
pool->disconnect(pool);
- ptr_ring_cleanup(&pool->ring, NULL);
-
- if (pool->p.flags & PP_FLAG_DMA_MAP)
- put_device(pool->p.dev);
-
-#ifdef CONFIG_PAGE_POOL_STATS
- free_percpu(pool->recycle_stats);
-#endif
+ page_pool_unlist(pool);
+ page_pool_uninit(pool);
kfree(pool);
}
@@ -858,7 +955,7 @@ static int page_pool_release(struct page_pool *pool)
int inflight;
page_pool_scrub(pool);
- inflight = page_pool_inflight(pool);
+ inflight = page_pool_inflight(pool, true);
if (!inflight)
__page_pool_destroy(pool);
@@ -869,18 +966,21 @@ static void page_pool_release_retry(struct work_struct *wq)
{
struct delayed_work *dwq = to_delayed_work(wq);
struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
+ void *netdev;
int inflight;
inflight = page_pool_release(pool);
if (!inflight)
return;
- /* Periodic warning */
- if (time_after_eq(jiffies, pool->defer_warn)) {
+ /* Periodic warning for page pools the user can't see */
+ netdev = READ_ONCE(pool->slow.netdev);
+ if (time_after_eq(jiffies, pool->defer_warn) &&
+ (!netdev || netdev == NET_PTR_POISON)) {
int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
- pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
- __func__, inflight, sec);
+ pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
+ __func__, pool->user.id, inflight, sec);
pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
}
@@ -889,15 +989,20 @@ static void page_pool_release_retry(struct work_struct *wq)
}
+/* Register an XDP memory-model user of the pool: take one more reference
+ * on pool->user_cnt, remember the @disconnect callback (invoked from
+ * __page_pool_destroy() when it is set) and record @mem->id in
+ * pool->xdp_mem_id. @mem is only read.
+ */
void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
- struct xdp_mem_info *mem)
+ const struct xdp_mem_info *mem)
{
refcount_inc(&pool->user_cnt);
pool->disconnect = disconnect;
pool->xdp_mem_id = mem->id;
}
-void page_pool_unlink_napi(struct page_pool *pool)
+static void page_pool_disable_direct_recycling(struct page_pool *pool)
{
+ /* Disable direct recycling based on pool->cpuid.
+ * Paired with READ_ONCE() in page_pool_napi_local().
+ */
+ WRITE_ONCE(pool->cpuid, -1);
+
if (!pool->p.napi)
return;
@@ -909,7 +1014,6 @@ void page_pool_unlink_napi(struct page_pool *pool)
WRITE_ONCE(pool->p.napi, NULL);
}
-EXPORT_SYMBOL(page_pool_unlink_napi);
void page_pool_destroy(struct page_pool *pool)
{
@@ -919,12 +1023,13 @@ void page_pool_destroy(struct page_pool *pool)
if (!page_pool_put(pool))
return;
- page_pool_unlink_napi(pool);
+ page_pool_disable_direct_recycling(pool);
page_pool_free_frag(pool);
if (!page_pool_release(pool))
return;
+ page_pool_detached(pool);
pool->defer_start = jiffies;
pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;