From 10a5e009b93a812956e232cee8804ed99f5b93bb Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:01 +0200
Subject: xsk: Get rid of unused entry in struct xdp_buff_xsk

Get rid of the unused entry "unaligned" in struct xdp_buff_xsk.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-2-magnus.karlsson@gmail.com
---
 include/net/xsk_buff_pool.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 7a9a23e7a604..bcb29a10307f 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -23,7 +23,6 @@ struct xdp_buff_xsk {
 	dma_addr_t dma;
 	dma_addr_t frame_dma;
 	struct xsk_buff_pool *pool;
-	bool unaligned;
 	u64 orig_addr;
 	struct list_head free_list_node;
 };
-- 
cgit 


From 47e4075df300050a920b99299c4db3dad9adaba9 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:02 +0200
Subject: xsk: Batched buffer allocation for the pool

Add a new driver interface xsk_buff_alloc_batch() offering batched
buffer allocations to improve performance. The new interface takes
three arguments: the buffer pool to allocate from, a pointer to an
array of struct xdp_buff pointers which will contain pointers to the
allocated xdp_buffs, and an unsigned integer specifying the max number
of buffers to allocate. The return value is the actual number of
buffers that the allocator managed to allocate and it will be in the
range 0 <= N <= max, where max is the third parameter to the function.

u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp,
                         u32 max);

A second driver interface is also introduced that needs to be used in
conjunction with xsk_buff_alloc_batch(). It is a helper that sets the
size of struct xdp_buff and is used by the NIC Rx irq routine when
receiving a packet. This helper sets the three struct members data,
data_meta, and data_end. In the xsk_buff_alloc() case, the first two
are set in the allocation routine and data_end is set when a packet
is received in the receive irq function. This unfortunately leads to
worse performance since the xdp_buff is touched twice with a long time
period in between leading to an extra cache miss. Instead, we fill out
the xdp_buff with all 3 fields at one single point in time in the
driver, when the size of the packet is known. Hence this helper. Note
that the driver has to use this helper (or set all three fields
itself) when using xsk_buff_alloc_batch(). xsk_buff_alloc() works as
before and does not require this.

void xsk_buff_set_size(struct xdp_buff *xdp, u32 size);

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-3-magnus.karlsson@gmail.com
---
 include/net/xdp_sock_drv.h  | 22 ++++++++++++
 include/net/xsk_buff_pool.h |  1 +
 net/xdp/xsk_buff_pool.c     | 87 +++++++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk_queue.h         | 12 ++++---
 4 files changed, 118 insertions(+), 4 deletions(-)

diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 4e295541e396..443d45951564 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -77,6 +77,12 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool)
 	return xp_alloc(pool);
 }
 
+/* Returns as many entries as possible up to max. 0 <= N <= max. */
+static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
+{
+	return xp_alloc_batch(pool, xdp, max);
+}
+
 static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count)
 {
 	return xp_can_alloc(pool, count);
@@ -89,6 +95,13 @@ static inline void xsk_buff_free(struct xdp_buff *xdp)
 	xp_free(xskb);
 }
 
+static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
+{
+	xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
+	xdp->data_meta = xdp->data;
+	xdp->data_end = xdp->data + size;
+}
+
 static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool,
 					      u64 addr)
 {
@@ -212,6 +225,11 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool)
 	return NULL;
 }
 
+static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
+{
+	return 0;
+}
+
 static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count)
 {
 	return false;
@@ -221,6 +239,10 @@ static inline void xsk_buff_free(struct xdp_buff *xdp)
 {
 }
 
+static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
+{
+}
+
 static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool,
 					      u64 addr)
 {
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index bcb29a10307f..b7068f97639f 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -104,6 +104,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
 	       unsigned long attrs, struct page **pages, u32 nr_pages);
 void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs);
 struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool);
+u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max);
 bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count);
 void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr);
 dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr);
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 8de01aaac4a0..884d95d70f5e 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -507,6 +507,93 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
 }
 EXPORT_SYMBOL(xp_alloc);
 
+static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
+{
+	u32 i, cached_cons, nb_entries;
+
+	if (max > pool->free_heads_cnt)
+		max = pool->free_heads_cnt;
+	max = xskq_cons_nb_entries(pool->fq, max);
+
+	cached_cons = pool->fq->cached_cons;
+	nb_entries = max;
+	i = max;
+	while (i--) {
+		struct xdp_buff_xsk *xskb;
+		u64 addr;
+		bool ok;
+
+		__xskq_cons_read_addr_unchecked(pool->fq, cached_cons++, &addr);
+
+		ok = pool->unaligned ? xp_check_unaligned(pool, &addr) :
+			xp_check_aligned(pool, &addr);
+		if (unlikely(!ok)) {
+			pool->fq->invalid_descs++;
+			nb_entries--;
+			continue;
+		}
+
+		xskb = pool->free_heads[--pool->free_heads_cnt];
+		*xdp = &xskb->xdp;
+		xskb->orig_addr = addr;
+		xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom;
+		xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] &
+				   ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK);
+		xskb->dma = xskb->frame_dma + pool->headroom + XDP_PACKET_HEADROOM;
+		xdp++;
+	}
+
+	xskq_cons_release_n(pool->fq, max);
+	return nb_entries;
+}
+
+static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 nb_entries)
+{
+	struct xdp_buff_xsk *xskb;
+	u32 i;
+
+	nb_entries = min_t(u32, nb_entries, pool->free_list_cnt);
+
+	i = nb_entries;
+	while (i--) {
+		xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node);
+		list_del(&xskb->free_list_node);
+
+		*xdp = &xskb->xdp;
+		xdp++;
+	}
+	pool->free_list_cnt -= nb_entries;
+
+	return nb_entries;
+}
+
+u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
+{
+	u32 nb_entries1 = 0, nb_entries2;
+
+	if (unlikely(pool->dma_need_sync)) {
+		/* Slow path */
+		*xdp = xp_alloc(pool);
+		return !!*xdp;
+	}
+
+	if (unlikely(pool->free_list_cnt)) {
+		nb_entries1 = xp_alloc_reused(pool, xdp, max);
+		if (nb_entries1 == max)
+			return nb_entries1;
+
+		max -= nb_entries1;
+		xdp += nb_entries1;
+	}
+
+	nb_entries2 = xp_alloc_new_from_fq(pool, xdp, max);
+	if (!nb_entries2)
+		pool->fq->queue_empty_descs++;
+
+	return nb_entries1 + nb_entries2;
+}
+EXPORT_SYMBOL(xp_alloc_batch);
+
 bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count)
 {
 	if (pool->free_list_cnt >= count)
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 9ae13cccfb28..e9aa2c236356 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -111,14 +111,18 @@ struct xsk_queue {
 
 /* Functions that read and validate content from consumer rings. */
 
-static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
+static inline void __xskq_cons_read_addr_unchecked(struct xsk_queue *q, u32 cached_cons, u64 *addr)
 {
 	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+	u32 idx = cached_cons & q->ring_mask;
 
-	if (q->cached_cons != q->cached_prod) {
-		u32 idx = q->cached_cons & q->ring_mask;
+	*addr = ring->desc[idx];
+}
 
-		*addr = ring->desc[idx];
+static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
+{
+	if (q->cached_cons != q->cached_prod) {
+		__xskq_cons_read_addr_unchecked(q, q->cached_cons, addr);
 		return true;
 	}
 
-- 
cgit 


From 57f7f8b6bc0bc80d94443f94fe5f21f266499a2b Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:03 +0200
Subject: ice: Use xdp_buf instead of rx_buf for xsk zero-copy

In order to use the new xsk batched buffer allocation interface, a
pointer to an array of struct xdp_buff pointers needs to be provided so
that the function can put the result of the allocation there. In the
ice driver, we already have a ring that stores pointers to
xdp_buffs. This is only used for the xsk zero-copy driver and is a
union with the structure that is used for the regular non zero-copy
path. Unfortunately, that structure is larger than the xdp_buffs
pointers, which means that there will be a stride (of 20 bytes) between
each xdp_buff pointer. And feeding this into the xsk_buff_alloc_batch
interface will not work since it assumes a regular array of xdp_buff
pointers (each 8 bytes with 0 bytes in-between them on a 64-bit
system).

To fix this, remove the xdp_buff pointer from the rx_buf union and
move it one step higher to the union above which only has pointers to
arrays in it. This solves the problem and we can directly feed the SW
ring of xdp_buff pointers straight into the allocation function in the
next patch when that interface is used. This will improve performance.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-4-magnus.karlsson@gmail.com
---
 drivers/net/ethernet/intel/ice/ice_txrx.h | 16 +++------
 drivers/net/ethernet/intel/ice/ice_xsk.c  | 56 +++++++++++++++----------------
 2 files changed, 33 insertions(+), 39 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index 1e46e80f3d6f..7c2328529ff8 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -164,17 +164,10 @@ struct ice_tx_offload_params {
 };
 
 struct ice_rx_buf {
-	union {
-		struct {
-			dma_addr_t dma;
-			struct page *page;
-			unsigned int page_offset;
-			u16 pagecnt_bias;
-		};
-		struct {
-			struct xdp_buff *xdp;
-		};
-	};
+	dma_addr_t dma;
+	struct page *page;
+	unsigned int page_offset;
+	u16 pagecnt_bias;
 };
 
 struct ice_q_stats {
@@ -270,6 +263,7 @@ struct ice_ring {
 	union {
 		struct ice_tx_buf *tx_buf;
 		struct ice_rx_buf *rx_buf;
+		struct xdp_buff **xdp_buf;
 	};
 	/* CL2 - 2nd cacheline starts here */
 	u16 q_index;			/* Queue number of ring */
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index 5a9f61deeb38..f4ab5259a56c 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -364,7 +364,7 @@ bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
 {
 	union ice_32b_rx_flex_desc *rx_desc;
 	u16 ntu = rx_ring->next_to_use;
-	struct ice_rx_buf *rx_buf;
+	struct xdp_buff **xdp;
 	bool ok = true;
 	dma_addr_t dma;
 
@@ -372,26 +372,26 @@ bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
 		return true;
 
 	rx_desc = ICE_RX_DESC(rx_ring, ntu);
-	rx_buf = &rx_ring->rx_buf[ntu];
+	xdp = &rx_ring->xdp_buf[ntu];
 
 	do {
-		rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_pool);
-		if (!rx_buf->xdp) {
+		*xdp = xsk_buff_alloc(rx_ring->xsk_pool);
+		if (!xdp) {
 			ok = false;
 			break;
 		}
 
-		dma = xsk_buff_xdp_get_dma(rx_buf->xdp);
+		dma = xsk_buff_xdp_get_dma(*xdp);
 		rx_desc->read.pkt_addr = cpu_to_le64(dma);
 		rx_desc->wb.status_error0 = 0;
 
 		rx_desc++;
-		rx_buf++;
+		xdp++;
 		ntu++;
 
 		if (unlikely(ntu == rx_ring->count)) {
 			rx_desc = ICE_RX_DESC(rx_ring, 0);
-			rx_buf = rx_ring->rx_buf;
+			xdp = rx_ring->xdp_buf;
 			ntu = 0;
 		}
 	} while (--count);
@@ -421,19 +421,19 @@ static void ice_bump_ntc(struct ice_ring *rx_ring)
 /**
  * ice_construct_skb_zc - Create an sk_buff from zero-copy buffer
  * @rx_ring: Rx ring
- * @rx_buf: zero-copy Rx buffer
+ * @xdp_arr: Pointer to the SW ring of xdp_buff pointers
  *
  * This function allocates a new skb from a zero-copy Rx buffer.
  *
  * Returns the skb on success, NULL on failure.
  */
 static struct sk_buff *
-ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
+ice_construct_skb_zc(struct ice_ring *rx_ring, struct xdp_buff **xdp_arr)
 {
-	unsigned int metasize = rx_buf->xdp->data - rx_buf->xdp->data_meta;
-	unsigned int datasize = rx_buf->xdp->data_end - rx_buf->xdp->data;
-	unsigned int datasize_hard = rx_buf->xdp->data_end -
-				     rx_buf->xdp->data_hard_start;
+	struct xdp_buff *xdp = *xdp_arr;
+	unsigned int metasize = xdp->data - xdp->data_meta;
+	unsigned int datasize = xdp->data_end - xdp->data;
+	unsigned int datasize_hard = xdp->data_end - xdp->data_hard_start;
 	struct sk_buff *skb;
 
 	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard,
@@ -441,13 +441,13 @@ ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
 	if (unlikely(!skb))
 		return NULL;
 
-	skb_reserve(skb, rx_buf->xdp->data - rx_buf->xdp->data_hard_start);
-	memcpy(__skb_put(skb, datasize), rx_buf->xdp->data, datasize);
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	memcpy(__skb_put(skb, datasize), xdp->data, datasize);
 	if (metasize)
 		skb_metadata_set(skb, metasize);
 
-	xsk_buff_free(rx_buf->xdp);
-	rx_buf->xdp = NULL;
+	xsk_buff_free(xdp);
+	*xdp_arr = NULL;
 	return skb;
 }
 
@@ -521,7 +521,7 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
 	while (likely(total_rx_packets < (unsigned int)budget)) {
 		union ice_32b_rx_flex_desc *rx_desc;
 		unsigned int size, xdp_res = 0;
-		struct ice_rx_buf *rx_buf;
+		struct xdp_buff **xdp;
 		struct sk_buff *skb;
 		u16 stat_err_bits;
 		u16 vlan_tag = 0;
@@ -544,18 +544,18 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
 		if (!size)
 			break;
 
-		rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
-		rx_buf->xdp->data_end = rx_buf->xdp->data + size;
-		xsk_buff_dma_sync_for_cpu(rx_buf->xdp, rx_ring->xsk_pool);
+		xdp = &rx_ring->xdp_buf[rx_ring->next_to_clean];
+		(*xdp)->data_end = (*xdp)->data + size;
+		xsk_buff_dma_sync_for_cpu(*xdp, rx_ring->xsk_pool);
 
-		xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp);
+		xdp_res = ice_run_xdp_zc(rx_ring, *xdp);
 		if (xdp_res) {
 			if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))
 				xdp_xmit |= xdp_res;
 			else
-				xsk_buff_free(rx_buf->xdp);
+				xsk_buff_free(*xdp);
 
-			rx_buf->xdp = NULL;
+			*xdp = NULL;
 			total_rx_bytes += size;
 			total_rx_packets++;
 			cleaned_count++;
@@ -565,7 +565,7 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
 		}
 
 		/* XDP_PASS path */
-		skb = ice_construct_skb_zc(rx_ring, rx_buf);
+		skb = ice_construct_skb_zc(rx_ring, xdp);
 		if (!skb) {
 			rx_ring->rx_stats.alloc_buf_failed++;
 			break;
@@ -813,12 +813,12 @@ void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring)
 	u16 i;
 
 	for (i = 0; i < rx_ring->count; i++) {
-		struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i];
+		struct xdp_buff **xdp = &rx_ring->xdp_buf[i];
 
-		if (!rx_buf->xdp)
+		if (!xdp)
 			continue;
 
-		rx_buf->xdp = NULL;
+		*xdp = NULL;
 	}
 }
 
-- 
cgit 


From db804cfc21e969a5a4ada4b8142f711def5ed339 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:04 +0200
Subject: ice: Use the xsk batched rx allocation interface

Use the new xsk batched rx allocation interface for the zero-copy data
path. As the array of struct xdp_buff pointers kept by the driver is
really a ring that wraps, the allocation routine is modified to detect
a wrap and in that case call the allocation function twice. The
allocation function cannot deal with wrapped rings, only arrays. As we
now know exactly how many buffers we get and that there is no
wrapping, the allocation function can be simplified even more as all
if-statements in the allocation loop can be removed, improving
performance.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-5-magnus.karlsson@gmail.com
---
 drivers/net/ethernet/intel/ice/ice_xsk.c | 44 ++++++++++++++------------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index f4ab5259a56c..7682eaa9a9ec 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -365,44 +365,38 @@ bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
 	union ice_32b_rx_flex_desc *rx_desc;
 	u16 ntu = rx_ring->next_to_use;
 	struct xdp_buff **xdp;
-	bool ok = true;
+	u32 nb_buffs, i;
 	dma_addr_t dma;
 
-	if (!count)
-		return true;
-
 	rx_desc = ICE_RX_DESC(rx_ring, ntu);
 	xdp = &rx_ring->xdp_buf[ntu];
 
-	do {
-		*xdp = xsk_buff_alloc(rx_ring->xsk_pool);
-		if (!xdp) {
-			ok = false;
-			break;
-		}
+	nb_buffs = min_t(u16, count, rx_ring->count - ntu);
+	nb_buffs = xsk_buff_alloc_batch(rx_ring->xsk_pool, xdp, nb_buffs);
+	if (!nb_buffs)
+		return false;
 
+	i = nb_buffs;
+	while (i--) {
 		dma = xsk_buff_xdp_get_dma(*xdp);
 		rx_desc->read.pkt_addr = cpu_to_le64(dma);
-		rx_desc->wb.status_error0 = 0;
 
 		rx_desc++;
 		xdp++;
-		ntu++;
-
-		if (unlikely(ntu == rx_ring->count)) {
-			rx_desc = ICE_RX_DESC(rx_ring, 0);
-			xdp = rx_ring->xdp_buf;
-			ntu = 0;
-		}
-	} while (--count);
+	}
 
-	if (rx_ring->next_to_use != ntu) {
-		/* clear the status bits for the next_to_use descriptor */
-		rx_desc->wb.status_error0 = 0;
-		ice_release_rx_desc(rx_ring, ntu);
+	ntu += nb_buffs;
+	if (ntu == rx_ring->count) {
+		rx_desc = ICE_RX_DESC(rx_ring, 0);
+		xdp = rx_ring->xdp_buf;
+		ntu = 0;
 	}
 
-	return ok;
+	/* clear the status bits for the next_to_use descriptor */
+	rx_desc->wb.status_error0 = 0;
+	ice_release_rx_desc(rx_ring, ntu);
+
+	return count == nb_buffs ? true : false;
 }
 
 /**
@@ -545,7 +539,7 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
 			break;
 
 		xdp = &rx_ring->xdp_buf[rx_ring->next_to_clean];
-		(*xdp)->data_end = (*xdp)->data + size;
+		xsk_buff_set_size(*xdp, size);
 		xsk_buff_dma_sync_for_cpu(*xdp, rx_ring->xsk_pool);
 
 		xdp_res = ice_run_xdp_zc(rx_ring, *xdp);
-- 
cgit 


From 6aab0bb0c5cdc02d6f182ada2d86afae0c22fc76 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:05 +0200
Subject: i40e: Use the xsk batched rx allocation interface

Use the new xsk batched rx allocation interface for the zero-copy data
path. As the array of struct xdp_buff pointers kept by the driver is
really a ring that wraps, the allocation routine is modified to detect
a wrap and in that case call the allocation function twice. The
allocation function cannot deal with wrapped rings, only arrays. As we
now know exactly how many buffers we get and that there is no
wrapping, the allocation function can be simplified even more as all
if-statements in the allocation loop can be removed, improving
performance.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-6-magnus.karlsson@gmail.com
---
 drivers/net/ethernet/intel/i40e/i40e_xsk.c | 52 ++++++++++++++----------------
 1 file changed, 25 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index e7e778ca074c..6f85879ba993 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -193,42 +193,40 @@ bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
 {
 	u16 ntu = rx_ring->next_to_use;
 	union i40e_rx_desc *rx_desc;
-	struct xdp_buff **bi, *xdp;
+	struct xdp_buff **xdp;
+	u32 nb_buffs, i;
 	dma_addr_t dma;
-	bool ok = true;
 
 	rx_desc = I40E_RX_DESC(rx_ring, ntu);
-	bi = i40e_rx_bi(rx_ring, ntu);
-	do {
-		xdp = xsk_buff_alloc(rx_ring->xsk_pool);
-		if (!xdp) {
-			ok = false;
-			goto no_buffers;
-		}
-		*bi = xdp;
-		dma = xsk_buff_xdp_get_dma(xdp);
+	xdp = i40e_rx_bi(rx_ring, ntu);
+
+	nb_buffs = min_t(u16, count, rx_ring->count - ntu);
+	nb_buffs = xsk_buff_alloc_batch(rx_ring->xsk_pool, xdp, nb_buffs);
+	if (!nb_buffs)
+		return false;
+
+	i = nb_buffs;
+	while (i--) {
+		dma = xsk_buff_xdp_get_dma(*xdp);
 		rx_desc->read.pkt_addr = cpu_to_le64(dma);
 		rx_desc->read.hdr_addr = 0;
 
 		rx_desc++;
-		bi++;
-		ntu++;
-
-		if (unlikely(ntu == rx_ring->count)) {
-			rx_desc = I40E_RX_DESC(rx_ring, 0);
-			bi = i40e_rx_bi(rx_ring, 0);
-			ntu = 0;
-		}
-	} while (--count);
+		xdp++;
+	}
 
-no_buffers:
-	if (rx_ring->next_to_use != ntu) {
-		/* clear the status bits for the next_to_use descriptor */
-		rx_desc->wb.qword1.status_error_len = 0;
-		i40e_release_rx_desc(rx_ring, ntu);
+	ntu += nb_buffs;
+	if (ntu == rx_ring->count) {
+		rx_desc = I40E_RX_DESC(rx_ring, 0);
+		xdp = i40e_rx_bi(rx_ring, 0);
+		ntu = 0;
 	}
 
-	return ok;
+	/* clear the status bits for the next_to_use descriptor */
+	rx_desc->wb.qword1.status_error_len = 0;
+	i40e_release_rx_desc(rx_ring, ntu);
+
+	return count == nb_buffs ? true : false;
 }
 
 /**
@@ -365,7 +363,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 			break;
 
 		bi = *i40e_rx_bi(rx_ring, next_to_clean);
-		bi->data_end = bi->data + size;
+		xsk_buff_set_size(bi, size);
 		xsk_buff_dma_sync_for_cpu(bi, rx_ring->xsk_pool);
 
 		xdp_res = i40e_run_xdp_zc(rx_ring, bi);
-- 
cgit 


From 94033cd8e73b8632bab7c8b7bb54caa4f5616db7 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:06 +0200
Subject: xsk: Optimize for aligned case

Optimize for the aligned case by precomputing the parameter values of
the xdp_buff_xsk and xdp_buff structures in the heads array. We can do
this as the heads array size is equal to the number of chunks in the
umem for the aligned case. Then every entry in this array will reflect
a certain chunk/frame and can therefore be prepopulated with the
correct values and we can drop the use of the free_heads stack. Note
that it is not possible to allocate more buffers than what has been
allocated in the aligned case since each chunk can only contain a
single buffer.

We can unfortunately not do this in the unaligned case as one chunk
might contain multiple buffers. In this case, we keep the old scheme
of populating a heads entry every time it is used and using
the free_heads stack.

Also move xp_release() and xp_get_handle() to xsk_buff_pool.h. They
were for some reason in xsk.c even though they are buffer pool
operations.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-7-magnus.karlsson@gmail.com
---
 include/net/xsk_buff_pool.h | 46 ++++++++++++++++++++++++++++++++++++-
 net/xdp/xsk.c               | 15 ------------
 net/xdp/xsk_buff_pool.c     | 56 +++++++++++++++++++++++++++------------------
 3 files changed, 79 insertions(+), 38 deletions(-)

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index b7068f97639f..ddeefc4a1040 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -7,6 +7,7 @@
 #include <linux/if_xdp.h>
 #include <linux/types.h>
 #include <linux/dma-mapping.h>
+#include <linux/bpf.h>
 #include <net/xdp.h>
 
 struct xsk_buff_pool;
@@ -66,6 +67,7 @@ struct xsk_buff_pool {
 	u32 free_heads_cnt;
 	u32 headroom;
 	u32 chunk_size;
+	u32 chunk_shift;
 	u32 frame_len;
 	u8 cached_need_wakeup;
 	bool uses_need_wakeup;
@@ -80,6 +82,13 @@ struct xsk_buff_pool {
 	struct xdp_buff_xsk *free_heads[];
 };
 
+/* Masks for xdp_umem_page flags.
+ * The low 12-bits of the addr will be 0 since this is the page address, so we
+ * can use them for flags.
+ */
+#define XSK_NEXT_PG_CONTIG_SHIFT 0
+#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
+
 /* AF_XDP core. */
 struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 						struct xdp_umem *umem);
@@ -88,7 +97,6 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev,
 int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
 			 struct net_device *dev, u16 queue_id);
 void xp_destroy(struct xsk_buff_pool *pool);
-void xp_release(struct xdp_buff_xsk *xskb);
 void xp_get_pool(struct xsk_buff_pool *pool);
 bool xp_put_pool(struct xsk_buff_pool *pool);
 void xp_clear_dev(struct xsk_buff_pool *pool);
@@ -98,6 +106,21 @@ void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs);
 /* AF_XDP, and XDP core. */
 void xp_free(struct xdp_buff_xsk *xskb);
 
+static inline void xp_init_xskb_addr(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool,
+				     u64 addr)
+{
+	xskb->orig_addr = addr;
+	xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom;
+}
+
+static inline void xp_init_xskb_dma(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool,
+				    dma_addr_t *dma_pages, u64 addr)
+{
+	xskb->frame_dma = (dma_pages[addr >> PAGE_SHIFT] & ~XSK_NEXT_PG_CONTIG_MASK) +
+		(addr & ~PAGE_MASK);
+	xskb->dma = xskb->frame_dma + pool->headroom + XDP_PACKET_HEADROOM;
+}
+
 /* AF_XDP ZC drivers, via xdp_sock_buff.h */
 void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq);
 int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
@@ -180,4 +203,25 @@ static inline u64 xp_unaligned_add_offset_to_addr(u64 addr)
 		xp_unaligned_extract_offset(addr);
 }
 
+static inline u32 xp_aligned_extract_idx(struct xsk_buff_pool *pool, u64 addr)
+{
+	return xp_aligned_extract_addr(pool, addr) >> pool->chunk_shift;
+}
+
+static inline void xp_release(struct xdp_buff_xsk *xskb)
+{
+	if (xskb->pool->unaligned)
+		xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
+}
+
+static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb)
+{
+	u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
+
+	offset += xskb->pool->headroom;
+	if (!xskb->pool->unaligned)
+		return xskb->orig_addr + offset;
+	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
+}
+
 #endif /* XSK_BUFF_POOL_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index d6b500dc4208..f16074eb53c7 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -134,21 +134,6 @@ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
 	return 0;
 }
 
-void xp_release(struct xdp_buff_xsk *xskb)
-{
-	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
-}
-
-static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
-{
-	u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
-
-	offset += xskb->pool->headroom;
-	if (!xskb->pool->unaligned)
-		return xskb->orig_addr + offset;
-	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
-}
-
 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
 	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 884d95d70f5e..96b14e51ba7e 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -44,12 +44,13 @@ void xp_destroy(struct xsk_buff_pool *pool)
 struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 						struct xdp_umem *umem)
 {
+	bool unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
 	struct xsk_buff_pool *pool;
 	struct xdp_buff_xsk *xskb;
-	u32 i;
+	u32 i, entries;
 
-	pool = kvzalloc(struct_size(pool, free_heads, umem->chunks),
-			GFP_KERNEL);
+	entries = unaligned ? umem->chunks : 0;
+	pool = kvzalloc(struct_size(pool, free_heads, entries),	GFP_KERNEL);
 	if (!pool)
 		goto out;
 
@@ -63,7 +64,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 	pool->free_heads_cnt = umem->chunks;
 	pool->headroom = umem->headroom;
 	pool->chunk_size = umem->chunk_size;
-	pool->unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
+	pool->chunk_shift = ffs(umem->chunk_size) - 1;
+	pool->unaligned = unaligned;
 	pool->frame_len = umem->chunk_size - umem->headroom -
 		XDP_PACKET_HEADROOM;
 	pool->umem = umem;
@@ -81,7 +83,10 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 		xskb = &pool->heads[i];
 		xskb->pool = pool;
 		xskb->xdp.frame_sz = umem->chunk_size - umem->headroom;
-		pool->free_heads[i] = xskb;
+		if (pool->unaligned)
+			pool->free_heads[i] = xskb;
+		else
+			xp_init_xskb_addr(xskb, pool, i * pool->chunk_size);
 	}
 
 	return pool;
@@ -406,6 +411,12 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
 
 	if (pool->unaligned)
 		xp_check_dma_contiguity(dma_map);
+	else
+		for (i = 0; i < pool->heads_cnt; i++) {
+			struct xdp_buff_xsk *xskb = &pool->heads[i];
+
+			xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr);
+		}
 
 	err = xp_init_dma_info(pool, dma_map);
 	if (err) {
@@ -448,8 +459,6 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool)
 	if (pool->free_heads_cnt == 0)
 		return NULL;
 
-	xskb = pool->free_heads[--pool->free_heads_cnt];
-
 	for (;;) {
 		if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) {
 			pool->fq->queue_empty_descs++;
@@ -466,17 +475,17 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool)
 		}
 		break;
 	}
-	xskq_cons_release(pool->fq);
 
-	xskb->orig_addr = addr;
-	xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom;
-	if (pool->dma_pages_cnt) {
-		xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] &
-				   ~XSK_NEXT_PG_CONTIG_MASK) +
-				  (addr & ~PAGE_MASK);
-		xskb->dma = xskb->frame_dma + pool->headroom +
-			    XDP_PACKET_HEADROOM;
+	if (pool->unaligned) {
+		xskb = pool->free_heads[--pool->free_heads_cnt];
+		xp_init_xskb_addr(xskb, pool, addr);
+		if (pool->dma_pages_cnt)
+			xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr);
+	} else {
+		xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)];
 	}
+
+	xskq_cons_release(pool->fq);
 	return xskb;
 }
 
@@ -533,13 +542,16 @@ static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xd
 			continue;
 		}
 
-		xskb = pool->free_heads[--pool->free_heads_cnt];
+		if (pool->unaligned) {
+			xskb = pool->free_heads[--pool->free_heads_cnt];
+			xp_init_xskb_addr(xskb, pool, addr);
+			if (pool->dma_pages_cnt)
+				xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr);
+		} else {
+			xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)];
+		}
+
 		*xdp = &xskb->xdp;
-		xskb->orig_addr = addr;
-		xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom;
-		xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] &
-				   ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK);
-		xskb->dma = xskb->frame_dma + pool->headroom + XDP_PACKET_HEADROOM;
 		xdp++;
 	}
 
-- 
cgit 


From 5b132056123dfe25b0a8c96d1420e9c31cb8edf8 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:07 +0200
Subject: selftests: xsk: Fix missing initialization

Fix missing initialization of the member rx_pkt_nb in the packet
stream. This leads to some tests declaring success too early as the
test thought all packets had already been received.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-8-magnus.karlsson@gmail.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 127bcde06c86..97591e2a69f7 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -445,6 +445,12 @@ static void test_spec_set_name(struct test_spec *test, const char *name)
 	strncpy(test->name, name, MAX_TEST_NAME_SIZE);
 }
 
+static void pkt_stream_reset(struct pkt_stream *pkt_stream)
+{
+	if (pkt_stream)
+		pkt_stream->rx_pkt_nb = 0;
+}
+
 static struct pkt *pkt_stream_get_pkt(struct pkt_stream *pkt_stream, u32 pkt_nb)
 {
 	if (pkt_nb >= pkt_stream->nb_pkts)
@@ -1032,6 +1038,7 @@ static void testapp_validate_traffic(struct test_spec *test)
 		exit_with_error(errno);
 
 	test->current_step++;
+	pkt_stream_reset(ifobj_rx->pkt_stream);
 
 	/*Spawn RX thread */
 	pthread_create(&t0, NULL, ifobj_rx->func_ptr, test);
-- 
cgit 


From 872a1184dbf2b6ed9f435d6a37ad8007126da982 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:08 +0200
Subject: selftests: xsk: Put the same buffer only once in the fill ring

Fix a problem where the fill ring was populated with too many
entries. If the number of buffers in the umem was smaller than the fill
ring size, the code used to loop over from the beginning of the umem
and start putting the same buffers in again. This is racy indeed as a
later packet can be received overwriting an earlier one before the Rx
thread manages to validate it.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-9-magnus.karlsson@gmail.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 97591e2a69f7..c5c68b860ae0 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -977,13 +977,18 @@ static void *worker_testapp_validate_tx(void *arg)
 
 static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream)
 {
-	u32 idx = 0, i;
+	u32 idx = 0, i, buffers_to_fill;
 	int ret;
 
-	ret = xsk_ring_prod__reserve(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx);
-	if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS)
+	if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS)
+		buffers_to_fill = umem->num_frames;
+	else
+		buffers_to_fill = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+
+	ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx);
+	if (ret != buffers_to_fill)
 		exit_with_error(ENOSPC);
-	for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++) {
+	for (i = 0; i < buffers_to_fill; i++) {
 		u64 addr;
 
 		if (pkt_stream->use_addr_for_fill) {
@@ -993,12 +998,12 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream
 				break;
 			addr = pkt->addr;
 		} else {
-			addr = (i % umem->num_frames) * umem->frame_size + DEFAULT_OFFSET;
+			addr = i * umem->frame_size + DEFAULT_OFFSET;
 		}
 
 		*xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr;
 	}
-	xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS);
+	xsk_ring_prod__submit(&umem->fq, buffers_to_fill);
 }
 
 static void *worker_testapp_validate_rx(void *arg)
-- 
cgit 


From 89013b8a29281fa42e39406b8b25672cb6ce2341 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:09 +0200
Subject: selftests: xsk: Fix socket creation retry

The socket creation retry unnecessarily registered the umem once for
every retry. No reason to do this. It wastes memory and it might lead
to too many pages being locked at some point and the failure of a
test.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-10-magnus.karlsson@gmail.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index c5c68b860ae0..aa5660dc0699 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -919,18 +919,17 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
 		u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size;
 		u32 ctr = 0;
 		void *bufs;
+		int ret;
 
 		bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
 		if (bufs == MAP_FAILED)
 			exit_with_error(errno);
 
-		while (ctr++ < SOCK_RECONF_CTR) {
-			int ret;
-
-			ret = xsk_configure_umem(&ifobject->umem_arr[i], bufs, umem_sz);
-			if (ret)
-				exit_with_error(-ret);
+		ret = xsk_configure_umem(&ifobject->umem_arr[i], bufs, umem_sz);
+		if (ret)
+			exit_with_error(-ret);
 
+		while (ctr++ < SOCK_RECONF_CTR) {
 			ret = xsk_configure_socket(&ifobject->xsk_arr[i], &ifobject->umem_arr[i],
 						   ifobject, i);
 			if (!ret)
-- 
cgit 


From 1bf3649688c103f80690a7088a105924f9d5a6e4 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:10 +0200
Subject: selftests: xsk: Introduce pacing of traffic

Introduce pacing of traffic so that the Tx thread can never send more
packets than the receiver has processed plus the number of packets it
can have in its umem. So at any point in time, the number of in-flight
packets (not processed by the Rx thread) is less than or equal to the
number of packets that can be held in the Rx thread's umem.

The batch size is also increased to improve running time.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-11-magnus.karlsson@gmail.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 29 +++++++++++++++++++++++------
 tools/testing/selftests/bpf/xdpxceiver.h |  7 ++++++-
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index aa5660dc0699..597fbe206026 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -384,6 +384,7 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
 		ifobj->umem = &ifobj->umem_arr[0];
 		ifobj->xsk = &ifobj->xsk_arr[0];
 		ifobj->use_poll = false;
+		ifobj->pacing_on = true;
 		ifobj->pkt_stream = test->pkt_stream_default;
 
 		if (i == 0) {
@@ -724,6 +725,7 @@ static void receive_pkts(struct pkt_stream *pkt_stream, struct xsk_socket_info *
 {
 	struct pkt *pkt = pkt_stream_get_next_rx_pkt(pkt_stream);
 	u32 idx_rx = 0, idx_fq = 0, rcvd, i;
+	u32 total = 0;
 	int ret;
 
 	while (pkt) {
@@ -772,6 +774,13 @@ static void receive_pkts(struct pkt_stream *pkt_stream, struct xsk_socket_info *
 
 		xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
 		xsk_ring_cons__release(&xsk->rx, rcvd);
+
+		pthread_mutex_lock(&pacing_mutex);
+		pkts_in_flight -= rcvd;
+		total += rcvd;
+		if (pkts_in_flight < umem->num_frames)
+			pthread_cond_signal(&pacing_cond);
+		pthread_mutex_unlock(&pacing_mutex);
 	}
 }
 
@@ -797,10 +806,19 @@ static u32 __send_pkts(struct ifobject *ifobject, u32 pkt_nb)
 			valid_pkts++;
 	}
 
+	pthread_mutex_lock(&pacing_mutex);
+	pkts_in_flight += valid_pkts;
+	if (ifobject->pacing_on && pkts_in_flight >= ifobject->umem->num_frames - BATCH_SIZE) {
+		kick_tx(xsk);
+		pthread_cond_wait(&pacing_cond, &pacing_mutex);
+	}
+	pthread_mutex_unlock(&pacing_mutex);
+
 	xsk_ring_prod__submit(&xsk->tx, i);
 	xsk->outstanding_tx += valid_pkts;
-	complete_pkts(xsk, BATCH_SIZE);
+	complete_pkts(xsk, i);
 
+	usleep(10);
 	return i;
 }
 
@@ -819,8 +837,6 @@ static void send_pkts(struct ifobject *ifobject)
 	fds.events = POLLOUT;
 
 	while (pkt_cnt < ifobject->pkt_stream->nb_pkts) {
-		u32 sent;
-
 		if (ifobject->use_poll) {
 			int ret;
 
@@ -832,9 +848,7 @@ static void send_pkts(struct ifobject *ifobject)
 				continue;
 		}
 
-		sent = __send_pkts(ifobject, pkt_cnt);
-		pkt_cnt += sent;
-		usleep(10);
+		pkt_cnt += __send_pkts(ifobject, pkt_cnt);
 	}
 
 	wait_for_tx_completion(ifobject->xsk);
@@ -1043,6 +1057,7 @@ static void testapp_validate_traffic(struct test_spec *test)
 
 	test->current_step++;
 	pkt_stream_reset(ifobj_rx->pkt_stream);
+	pkts_in_flight = 0;
 
 	/*Spawn RX thread */
 	pthread_create(&t0, NULL, ifobj_rx->func_ptr, test);
@@ -1126,6 +1141,8 @@ static void testapp_stats(struct test_spec *test)
 	for (i = 0; i < STAT_TEST_TYPE_MAX; i++) {
 		test_spec_reset(test);
 		stat_test_type = i;
+		/* No or few packets will be received so cannot pace packets */
+		test->ifobj_tx->pacing_on = false;
 
 		switch (stat_test_type) {
 		case STAT_TEST_RX_DROPPED:
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index 5ac4a5e64744..00790c976f4f 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -35,7 +35,7 @@
 #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr))
 #define USLEEP_MAX 10000
 #define SOCK_RECONF_CTR 10
-#define BATCH_SIZE 8
+#define BATCH_SIZE 64
 #define POLL_TMOUT 1000
 #define DEFAULT_PKT_CNT (4 * 1024)
 #define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4)
@@ -136,6 +136,7 @@ struct ifobject {
 	bool tx_on;
 	bool rx_on;
 	bool use_poll;
+	bool pacing_on;
 	u8 dst_mac[ETH_ALEN];
 	u8 src_mac[ETH_ALEN];
 };
@@ -151,5 +152,9 @@ struct test_spec {
 };
 
 pthread_barrier_t barr;
+pthread_mutex_t pacing_mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_cond_t pacing_cond = PTHREAD_COND_INITIALIZER;
+
+u32 pkts_in_flight;
 
 #endif				/* XDPXCEIVER_H */
-- 
cgit 


From 96a40678ce5390cd8515ff32e55ad932fd1fa328 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:11 +0200
Subject: selftests: xsk: Add single packet test

Add a test where a single packet is sent and received. This might
sound like a silly test, but since many of the interfaces in xsk are
batched, it is important to be able to validate that we did not break
something as fundamental as just receiving single packets, instead of
batches of packets at high speed.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-12-magnus.karlsson@gmail.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 13 +++++++++++++
 tools/testing/selftests/bpf/xdpxceiver.h |  1 +
 2 files changed, 14 insertions(+)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 597fbe206026..3beea7531c8e 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -1217,6 +1217,15 @@ static bool testapp_unaligned(struct test_spec *test)
 	return true;
 }
 
+static void testapp_single_pkt(struct test_spec *test)
+{
+	struct pkt pkts[] = {{0x1000, PKT_SIZE, 0, true}};
+
+	pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts));
+	testapp_validate_traffic(test);
+	pkt_stream_restore_default(test);
+}
+
 static void testapp_invalid_desc(struct test_spec *test)
 {
 	struct pkt pkts[] = {
@@ -1298,6 +1307,10 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
 		test_spec_set_name(test, "RUN_TO_COMPLETION");
 		testapp_validate_traffic(test);
 		break;
+	case TEST_TYPE_RUN_TO_COMPLETION_SINGLE_PKT:
+		test_spec_set_name(test, "RUN_TO_COMPLETION_SINGLE_PKT");
+		testapp_single_pkt(test);
+		break;
 	case TEST_TYPE_RUN_TO_COMPLETION_2K_FRAME:
 		test_spec_set_name(test, "RUN_TO_COMPLETION_2K_FRAME_SIZE");
 		test->ifobj_tx->umem->frame_size = 2048;
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index 00790c976f4f..d075192c95f8 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -55,6 +55,7 @@ enum test_mode {
 enum test_type {
 	TEST_TYPE_RUN_TO_COMPLETION,
 	TEST_TYPE_RUN_TO_COMPLETION_2K_FRAME,
+	TEST_TYPE_RUN_TO_COMPLETION_SINGLE_PKT,
 	TEST_TYPE_POLL,
 	TEST_TYPE_UNALIGNED,
 	TEST_TYPE_ALIGNED_INV_DESC,
-- 
cgit 


From e4e9baf06a6ea6cfbf69db4c3766a0879329dda2 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:12 +0200
Subject: selftests: xsk: Change interleaving of packets in unaligned mode

Change the interleaving of packets in unaligned mode. With the current
buffer addresses in the packet stream, the last buffer in the umem
could not be used as a large packet could potentially write over the
end of the umem. The kernel correctly threw this buffer address away
and refused to use it. This is perfectly fine for all regular packet
streams, but the ones used for unaligned mode have every other packet
being at some different offset. As we will add checks for correct
offsets in the next patch, this needs to be fixed. Just start these
page-boundary straddling buffers one page earlier so that the last
one is not on the last page of the umem, making all buffers valid.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-13-magnus.karlsson@gmail.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 3beea7531c8e..fd620f8accfd 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -543,14 +543,14 @@ static void pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len)
 	test->ifobj_rx->pkt_stream = pkt_stream;
 }
 
-static void pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, u32 offset)
+static void pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int offset)
 {
 	struct xsk_umem_info *umem = test->ifobj_tx->umem;
 	struct pkt_stream *pkt_stream;
 	u32 i;
 
 	pkt_stream = pkt_stream_clone(umem, test->pkt_stream_default);
-	for (i = 0; i < test->pkt_stream_default->nb_pkts; i += 2) {
+	for (i = 1; i < test->pkt_stream_default->nb_pkts; i += 2) {
 		pkt_stream->pkts[i].addr = (i % umem->num_frames) * umem->frame_size + offset;
 		pkt_stream->pkts[i].len = pkt_len;
 	}
@@ -1209,7 +1209,7 @@ static bool testapp_unaligned(struct test_spec *test)
 	test->ifobj_tx->umem->unaligned_mode = true;
 	test->ifobj_rx->umem->unaligned_mode = true;
 	/* Let half of the packets straddle a buffer boundrary */
-	pkt_stream_replace_half(test, PKT_SIZE, test->ifobj_tx->umem->frame_size - 32);
+	pkt_stream_replace_half(test, PKT_SIZE, -PKT_SIZE / 2);
 	test->ifobj_rx->pkt_stream->use_addr_for_fill = true;
 	testapp_validate_traffic(test);
 
-- 
cgit 


From e34087fc00f4f853886952711195984abdece7a3 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 22 Sep 2021 09:56:13 +0200
Subject: selftests: xsk: Add frame_headroom test

Add a test for the frame_headroom feature that can be set on the
umem. The logic added validates that all offsets in all tests and
packets are valid, not just the ones that have a specifically
configured frame_headroom.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210922075613.12186-14-magnus.karlsson@gmail.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 52 ++++++++++++++++++++++++++------
 tools/testing/selftests/bpf/xdpxceiver.h |  3 +-
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index fd620f8accfd..6c7cf8aadc79 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -514,8 +514,7 @@ static struct pkt_stream *pkt_stream_generate(struct xsk_umem_info *umem, u32 nb
 
 	pkt_stream->nb_pkts = nb_pkts;
 	for (i = 0; i < nb_pkts; i++) {
-		pkt_stream->pkts[i].addr = (i % umem->num_frames) * umem->frame_size +
-			DEFAULT_OFFSET;
+		pkt_stream->pkts[i].addr = (i % umem->num_frames) * umem->frame_size;
 		pkt_stream->pkts[i].len = pkt_len;
 		pkt_stream->pkts[i].payload = i;
 
@@ -642,6 +641,25 @@ static void pkt_dump(void *pkt, u32 len)
 	fprintf(stdout, "---------------------------------------\n");
 }
 
+static bool is_offset_correct(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream, u64 addr,
+			      u64 pkt_stream_addr)
+{
+	u32 headroom = umem->unaligned_mode ? 0 : umem->frame_headroom;
+	u32 offset = addr % umem->frame_size, expected_offset = 0;
+
+	if (!pkt_stream->use_addr_for_fill)
+		pkt_stream_addr = 0;
+
+	expected_offset += (pkt_stream_addr + headroom + XDP_PACKET_HEADROOM) % umem->frame_size;
+
+	if (offset == expected_offset)
+		return true;
+
+	ksft_test_result_fail("ERROR: [%s] expected [%u], got [%u]\n", __func__, expected_offset,
+			      offset);
+	return false;
+}
+
 static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len)
 {
 	void *data = xsk_umem__get_data(buffer, addr);
@@ -724,6 +742,7 @@ static void receive_pkts(struct pkt_stream *pkt_stream, struct xsk_socket_info *
 			 struct pollfd *fds)
 {
 	struct pkt *pkt = pkt_stream_get_next_rx_pkt(pkt_stream);
+	struct xsk_umem_info *umem = xsk->umem;
 	u32 idx_rx = 0, idx_fq = 0, rcvd, i;
 	u32 total = 0;
 	int ret;
@@ -731,7 +750,7 @@ static void receive_pkts(struct pkt_stream *pkt_stream, struct xsk_socket_info *
 	while (pkt) {
 		rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
 		if (!rcvd) {
-			if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
+			if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
 				ret = poll(fds, 1, POLL_TMOUT);
 				if (ret < 0)
 					exit_with_error(-ret);
@@ -739,16 +758,16 @@ static void receive_pkts(struct pkt_stream *pkt_stream, struct xsk_socket_info *
 			continue;
 		}
 
-		ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+		ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
 		while (ret != rcvd) {
 			if (ret < 0)
 				exit_with_error(-ret);
-			if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
+			if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
 				ret = poll(fds, 1, POLL_TMOUT);
 				if (ret < 0)
 					exit_with_error(-ret);
 			}
-			ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+			ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
 		}
 
 		for (i = 0; i < rcvd; i++) {
@@ -765,14 +784,17 @@ static void receive_pkts(struct pkt_stream *pkt_stream, struct xsk_socket_info *
 
 			orig = xsk_umem__extract_addr(addr);
 			addr = xsk_umem__add_offset_to_addr(addr);
-			if (!is_pkt_valid(pkt, xsk->umem->buffer, addr, desc->len))
+
+			if (!is_pkt_valid(pkt, umem->buffer, addr, desc->len))
+				return;
+			if (!is_offset_correct(umem, pkt_stream, addr, pkt->addr))
 				return;
 
-			*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig;
+			*xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = orig;
 			pkt = pkt_stream_get_next_rx_pkt(pkt_stream);
 		}
 
-		xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
+		xsk_ring_prod__submit(&umem->fq, rcvd);
 		xsk_ring_cons__release(&xsk->rx, rcvd);
 
 		pthread_mutex_lock(&pacing_mutex);
@@ -1011,7 +1033,7 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream
 				break;
 			addr = pkt->addr;
 		} else {
-			addr = i * umem->frame_size + DEFAULT_OFFSET;
+			addr = i * umem->frame_size;
 		}
 
 		*xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr;
@@ -1134,6 +1156,13 @@ static void testapp_bpf_res(struct test_spec *test)
 	testapp_validate_traffic(test);
 }
 
+static void testapp_headroom(struct test_spec *test)
+{
+	test_spec_set_name(test, "UMEM_HEADROOM");
+	test->ifobj_rx->umem->frame_headroom = UMEM_HEADROOM_TEST_SIZE;
+	testapp_validate_traffic(test);
+}
+
 static void testapp_stats(struct test_spec *test)
 {
 	int i;
@@ -1346,6 +1375,9 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
 		if (!testapp_unaligned(test))
 			return;
 		break;
+	case TEST_TYPE_HEADROOM:
+		testapp_headroom(test);
+		break;
 	default:
 		break;
 	}
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index d075192c95f8..2f705f44b748 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -41,7 +41,7 @@
 #define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4)
 #define UMEM_SIZE (DEFAULT_UMEM_BUFFERS * XSK_UMEM__DEFAULT_FRAME_SIZE)
 #define RX_FULL_RXQSIZE 32
-#define DEFAULT_OFFSET 256
+#define UMEM_HEADROOM_TEST_SIZE 128
 #define XSK_UMEM__INVALID_FRAME_SIZE (XSK_UMEM__DEFAULT_FRAME_SIZE + 1)
 
 #define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0)
@@ -61,6 +61,7 @@ enum test_type {
 	TEST_TYPE_ALIGNED_INV_DESC,
 	TEST_TYPE_ALIGNED_INV_DESC_2K_FRAME,
 	TEST_TYPE_UNALIGNED_INV_DESC,
+	TEST_TYPE_HEADROOM,
 	TEST_TYPE_TEARDOWN,
 	TEST_TYPE_BIDI,
 	TEST_TYPE_STATS,
-- 
cgit