diff options
-rw-r--r-- | Documentation/networking/page_pool.rst | 4 | ||||
-rw-r--r-- | drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 3 | ||||
-rw-r--r-- | drivers/net/ethernet/intel/idpf/idpf_txrx.c | 3 | ||||
-rw-r--r-- | drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 | ||||
-rw-r--r-- | drivers/net/veth.c | 25 | ||||
-rw-r--r-- | drivers/net/wireless/mediatek/mt76/mac80211.c | 2 | ||||
-rw-r--r-- | include/net/page_pool/helpers.h | 210 | ||||
-rw-r--r-- | include/net/page_pool/types.h | 6 | ||||
-rw-r--r-- | net/core/page_pool.c | 17 | ||||
-rw-r--r-- | net/core/skbuff.c | 2 |
12 files changed, 220 insertions, 58 deletions
diff --git a/Documentation/networking/page_pool.rst b/Documentation/networking/page_pool.rst index 215ebc92752c..60993cb56b32 100644 --- a/Documentation/networking/page_pool.rst +++ b/Documentation/networking/page_pool.rst @@ -58,7 +58,9 @@ a page will cause no race conditions is enough. .. kernel-doc:: include/net/page_pool/helpers.h :identifiers: page_pool_put_page page_pool_put_full_page - page_pool_recycle_direct page_pool_dev_alloc_pages + page_pool_recycle_direct page_pool_free_va + page_pool_dev_alloc_pages page_pool_dev_alloc_frag + page_pool_dev_alloc page_pool_dev_alloc_va page_pool_get_dma_addr page_pool_get_dma_dir .. kernel-doc:: net/core/page_pool.c diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 5d7a29f99401..d0359b569afe 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3302,8 +3302,6 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, pp.dma_dir = bp->rx_dir; pp.max_len = PAGE_SIZE; pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; - if (PAGE_SIZE > BNXT_RX_PAGE_SIZE) - pp.flags |= PP_FLAG_PAGE_FRAG; rxr->page_pool = page_pool_create(&pp); if (IS_ERR(rxr->page_pool)) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index cf50368441b7..06117502001f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -4940,8 +4940,7 @@ static void hns3_put_ring_config(struct hns3_nic_priv *priv) static void hns3_alloc_page_pool(struct hns3_enet_ring *ring) { struct page_pool_params pp_params = { - .flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG | - PP_FLAG_DMA_SYNC_DEV, + .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, .order = hns3_page_order(ring), .pool_size = ring->desc_num * hns3_buf_size(ring) / (PAGE_SIZE << hns3_page_order(ring)), diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c 
b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 58c5412d3173..5e1ef70d54fe 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -595,9 +595,6 @@ static struct page_pool *idpf_rx_create_page_pool(struct idpf_queue *rxbufq) .offset = 0, }; - if (rxbufq->rx_buf_size == IDPF_RX_BUF_2048) - pp.flags |= PP_FLAG_PAGE_FRAG; - return page_pool_create(&pp); } diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 818ce76185b2..1a42bfded872 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -1404,7 +1404,7 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id, } pp_params.order = get_order(buf_size); - pp_params.flags = PP_FLAG_PAGE_FRAG | PP_FLAG_DMA_MAP; + pp_params.flags = PP_FLAG_DMA_MAP; pp_params.pool_size = min(OTX2_PAGE_POOL_SZ, numptrs); pp_params.nid = NUMA_NO_NODE; pp_params.dev = pfvf->dev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 9325b8f00af0..ea58c6917433 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -897,7 +897,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params, struct page_pool_params pp_params = { 0 }; pp_params.order = 0; - pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | PP_FLAG_PAGE_FRAG; + pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; pp_params.pool_size = pool_size; pp_params.nid = node; pp_params.dev = rq->pdev; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 0deefd1573cf..9980517ed8b0 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -737,10 +737,11 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, if (skb_shared(skb) || skb_head_is_locked(skb) || skb_shinfo(skb)->nr_frags || skb_headroom(skb) < 
XDP_PACKET_HEADROOM) { - u32 size, len, max_head_size, off; + u32 size, len, max_head_size, off, truesize, page_offset; struct sk_buff *nskb; struct page *page; int i, head_off; + void *va; /* We need a private copy of the skb and data buffers since * the ebpf program can modify it. We segment the original skb @@ -753,14 +754,17 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size) goto drop; + size = min_t(u32, skb->len, max_head_size); + truesize = SKB_HEAD_ALIGN(size) + VETH_XDP_HEADROOM; + /* Allocate skb head */ - page = page_pool_dev_alloc_pages(rq->page_pool); - if (!page) + va = page_pool_dev_alloc_va(rq->page_pool, &truesize); + if (!va) goto drop; - nskb = napi_build_skb(page_address(page), PAGE_SIZE); + nskb = napi_build_skb(va, truesize); if (!nskb) { - page_pool_put_full_page(rq->page_pool, page, true); + page_pool_free_va(rq->page_pool, va, true); goto drop; } @@ -768,7 +772,6 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, skb_copy_header(nskb, skb); skb_mark_for_recycle(nskb); - size = min_t(u32, skb->len, max_head_size); if (skb_copy_bits(skb, 0, nskb->data, size)) { consume_skb(nskb); goto drop; @@ -783,14 +786,18 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, len = skb->len - off; for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { - page = page_pool_dev_alloc_pages(rq->page_pool); + size = min_t(u32, len, PAGE_SIZE); + truesize = size; + + page = page_pool_dev_alloc(rq->page_pool, &page_offset, + &truesize); if (!page) { consume_skb(nskb); goto drop; } - size = min_t(u32, len, PAGE_SIZE); - skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE); + skb_add_rx_frag(nskb, i, page, page_offset, size, + truesize); if (skb_copy_bits(skb, off, page_address(page), size)) { consume_skb(nskb); diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c index cb76053973aa..51a767121b0d 100644 --- 
a/drivers/net/wireless/mediatek/mt76/mac80211.c +++ b/drivers/net/wireless/mediatek/mt76/mac80211.c @@ -570,7 +570,7 @@ int mt76_create_page_pool(struct mt76_dev *dev, struct mt76_queue *q) { struct page_pool_params pp_params = { .order = 0, - .flags = PP_FLAG_PAGE_FRAG, + .flags = 0, .nid = NUMA_NO_NODE, .dev = dev->dma_dev, }; diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 8f64adf86f5b..4ebd544ae977 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -8,23 +8,46 @@ /** * DOC: page_pool allocator * - * The page_pool allocator is optimized for the XDP mode that - * uses one frame per-page, but it can fallback on the - * regular page allocator APIs. - * - * Basic use involves replacing alloc_pages() calls with the - * page_pool_alloc_pages() call. Drivers should use - * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). - * - * The API keeps track of in-flight pages, in order to let API users know - * when it is safe to free a page_pool object. Thus, API users - * must call page_pool_put_page() to free the page, or attach - * the page to a page_pool-aware object like skbs marked with + * The page_pool allocator is optimized for recycling page or page fragment used + * by skb packet and xdp frame. + * + * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), + * which allocates memory with or without page splitting depending on the + * requested memory size. + * + * If the driver knows that it always requires full pages or its allocations are + * always smaller than half a page, it can use one of the more specific API + * calls: + * + * 1. page_pool_alloc_pages(): allocate memory without page splitting when + * driver knows that the memory it needs is always bigger than half of the page + * allocated from page pool. There is no cache line dirtying for 'struct page' + * when a page is recycled back to the page pool. + * + * 2. 
page_pool_alloc_frag(): allocate memory with page splitting when driver + * knows that the memory it needs is always smaller than or equal to half of the + * page allocated from page pool. Page splitting enables memory saving and thus + * avoids TLB/cache miss for data access, but there also is some cost to + * implement page splitting, mainly some cache line dirtying/bouncing for + * 'struct page' and atomic operation for page->pp_frag_count. + * + * The API keeps track of in-flight pages, in order to let API users know when + * it is safe to free a page_pool object. The API users must call + * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or + * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * - * API users must call page_pool_put_page() once on a page, as it - * will either recycle the page, or in case of refcnt > 1, it will - * release the DMA mapping and in-flight state accounting. + * page_pool_put_page() may be called multiple times on the same page if a page is + * split into multiple fragments. For the last fragment, it will either recycle the + * page, or in case of page->_refcount > 1, it will release the DMA mapping and + * in-flight state accounting. + * + * dma_sync_single_range_for_device() is only called for the last fragment when + * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the + * last freed fragment to do the sync_for_device operation for all fragments in + * the same page when a page is split. The API user must set up pool->p.max_len + * and pool->p.offset correctly and ensure that page_pool_put_page() is called + * with dma_sync_size being -1 for fragment API. */ #ifndef _NET_PAGE_POOL_HELPERS_H #define _NET_PAGE_POOL_HELPERS_H @@ -73,6 +96,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) return page_pool_alloc_pages(pool, gfp); } +/** + * page_pool_dev_alloc_frag() - allocate a page fragment. 
+ * @pool: pool from which to allocate + * @offset: offset to the allocated page + * @size: requested size + * + * Get a page fragment from the page allocator or page_pool caches. + * + * Return: + * Return allocated page fragment, otherwise return NULL. + */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size) @@ -82,6 +116,91 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, return page_pool_alloc_frag(pool, offset, size, gfp); } +static inline struct page *page_pool_alloc(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) +{ + unsigned int max_size = PAGE_SIZE << pool->p.order; + struct page *page; + + if ((*size << 1) > max_size) { + *size = max_size; + *offset = 0; + return page_pool_alloc_pages(pool, gfp); + } + + page = page_pool_alloc_frag(pool, offset, *size, gfp); + if (unlikely(!page)) + return NULL; + + /* There is very likely not enough space for another fragment, so append + * the remaining size to the current fragment to avoid truesize + * underestimate problem. + */ + if (pool->frag_offset + *size > max_size) { + *size = max_size - *offset; + pool->frag_offset = max_size; + } + + return page; +} + +/** + * page_pool_dev_alloc() - allocate a page or a page fragment. + * @pool: pool from which to allocate + * @offset: offset to the allocated page + * @size: in as the requested size, out as the allocated size + * + * Get a page or a page fragment from the page allocator or page_pool caches + * depending on the requested size in order to allocate memory with least memory + * utilization and performance penalty. + * + * Return: + * Return allocated page or page fragment, otherwise return NULL. 
+ */ +static inline struct page *page_pool_dev_alloc(struct page_pool *pool, + unsigned int *offset, + unsigned int *size) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc(pool, offset, size, gfp); +} + +static inline void *page_pool_alloc_va(struct page_pool *pool, + unsigned int *size, gfp_t gfp) +{ + unsigned int offset; + struct page *page; + + /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */ + page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM); + if (unlikely(!page)) + return NULL; + + return page_address(page) + offset; +} + +/** + * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its + * va. + * @pool: pool from which to allocate + * @size: in as the requested size, out as the allocated size + * + * This is just a thin wrapper around the page_pool_alloc() API, and + * it returns va of the allocated page or page fragment. + * + * Return: + * Return the va for the allocated page or page fragment, otherwise return NULL. + */ +static inline void *page_pool_dev_alloc_va(struct page_pool *pool, + unsigned int *size) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_va(pool, size, gfp); +} + /** * page_pool_get_dma_dir() - Retrieve the stored DMA direction. * @pool: pool from which page was allocated @@ -115,28 +234,49 @@ static inline long page_pool_defrag_page(struct page *page, long nr) long ret; /* If nr == pp_frag_count then we have cleared all remaining - * references to the page. No need to actually overwrite it, instead - * we can leave this to be overwritten by the calling function. + * references to the page: + * 1. 'n == 1': no need to actually overwrite it. + * 2. 'n != 1': overwrite it with one, which is the rare case + * for pp_frag_count draining. 
* - The main advantage to doing this is that an atomic_read is - * generally a much cheaper operation than an atomic update, - * especially when dealing with a page that may be partitioned - * into only 2 or 3 pieces. + * The main advantage to doing this is that not only do we avoid an atomic + * update, as an atomic_read is generally a much cheaper operation than + * an atomic update, especially when dealing with a page that may be + * partitioned into only 2 or 3 pieces; but also unify the pp_frag_count + * handling by ensuring all pages have been partitioned into only 1 piece + * initially, and only overwrite it when the page is partitioned into + * more than one piece. */ - if (atomic_long_read(&page->pp_frag_count) == nr) + if (atomic_long_read(&page->pp_frag_count) == nr) { + /* As we have ensured nr is always one for constant case using + * the BUILD_BUG_ON(), only need to handle the non-constant case + * here for pp_frag_count draining, which is a rare case. + */ + BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); + if (!__builtin_constant_p(nr)) + atomic_long_set(&page->pp_frag_count, 1); + return 0; + } ret = atomic_long_sub_return(nr, &page->pp_frag_count); WARN_ON(ret < 0); + + /* We are the last user here too, reset pp_frag_count back to 1 to + * ensure all pages have been partitioned into 1 piece initially, + * this should be the rare case when the last two fragment users call + * page_pool_defrag_page() concurrently. 
+ */ + if (unlikely(!ret)) + atomic_long_set(&page->pp_frag_count, 1); + return ret; } -static inline bool page_pool_is_last_frag(struct page_pool *pool, - struct page *page) +static inline bool page_pool_is_last_frag(struct page *page) { - /* If fragments aren't enabled or count is 0 we were the last user */ - return !(pool->p.flags & PP_FLAG_PAGE_FRAG) || - (page_pool_defrag_page(page, 1) == 0); + /* If page_pool_defrag_page() returns 0, we were the last user */ + return page_pool_defrag_page(page, 1) == 0; } /** @@ -161,7 +301,7 @@ static inline void page_pool_put_page(struct page_pool *pool, * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - if (!page_pool_is_last_frag(pool, page)) + if (!page_pool_is_last_frag(page)) return; page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); @@ -201,6 +341,20 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, (sizeof(dma_addr_t) > sizeof(unsigned long)) /** + * page_pool_free_va() - free a va into the page_pool + * @pool: pool from which va was allocated + * @va: va to be freed + * @allow_direct: freed by the consumer, allow lockless caching + * + * Free a va allocated from page_pool_alloc_va(). + */ +static inline void page_pool_free_va(struct page_pool *pool, void *va, + bool allow_direct) +{ + page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct); +} + +/** * page_pool_get_dma_addr() - Retrieve the stored DMA address. 
* @page: page allocated from a page pool * diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 887e7946a597..6fc5134095ed 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -17,10 +17,8 @@ * Please note DMA-sync-for-CPU is still * device driver responsibility */ -#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ #define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ - PP_FLAG_DMA_SYNC_DEV |\ - PP_FLAG_PAGE_FRAG) + PP_FLAG_DMA_SYNC_DEV) /* * Fast allocation side cache array/stack @@ -45,7 +43,7 @@ struct pp_alloc_cache { /** * struct page_pool_params - page pool parameters - * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG + * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV * @order: 2^order pages on allocation * @pool_size: size of the ptr_ring * @nid: NUMA node id to allocate from pages from diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 8a9868ea5067..5e409b98aba0 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -376,6 +376,14 @@ static void page_pool_set_pp_info(struct page_pool *pool, { page->pp = pool; page->pp_magic |= PP_SIGNATURE; + + /* Ensuring all pages have been split into one fragment initially: + * page_pool_set_pp_info() is only called once for every page when it + * is allocated from the page allocator and page_pool_fragment_page() + * is dirtying the same cache line as the page->pp_magic above, so + * the overhead is negligible. 
+ */ + page_pool_fragment_page(page, 1); if (pool->p.init_callback) pool->p.init_callback(page, pool->p.init_arg); } @@ -672,7 +680,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, struct page *page = virt_to_head_page(data[i]); /* It is not the last user for the page frag case */ - if (!page_pool_is_last_frag(pool, page)) + if (!page_pool_is_last_frag(page)) continue; page = __page_pool_put_page(pool, page, -1, false); @@ -748,8 +756,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int max_size = PAGE_SIZE << pool->p.order; struct page *page = pool->frag_page; - if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) || - size > max_size)) + if (WARN_ON(size > max_size)) return NULL; size = ALIGN(size, dma_get_cache_alignment()); @@ -802,7 +809,7 @@ static void page_pool_empty_ring(struct page_pool *pool) } } -static void page_pool_free(struct page_pool *pool) +static void __page_pool_destroy(struct page_pool *pool) { if (pool->disconnect) pool->disconnect(pool); @@ -853,7 +860,7 @@ static int page_pool_release(struct page_pool *pool) page_pool_scrub(pool); inflight = page_pool_inflight(pool); if (!inflight) - page_pool_free(pool); + __page_pool_destroy(pool); return inflight; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 975c9a6ffb4a..c52ddd6891d9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5765,7 +5765,7 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, /* In general, avoid mixing page_pool and non-page_pool allocated * pages within the same SKB. Additionally avoid dealing with clones * with page_pool pages, in case the SKB is using page_pool fragment - * references (PP_FLAG_PAGE_FRAG). Since we only take full page + * references (page_pool_alloc_frag()). Since we only take full page * references for cloned SKBs at the moment that would result in * inconsistent reference counts. * In theory we could take full references if @from is cloned and |