Diffstat (limited to 'drivers/net/virtio_net.c')
-rw-r--r--   drivers/net/virtio_net.c | 581
1 file changed, 380 insertions, 201 deletions
| diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 7723b2a49d8e..2396c28c0122 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -134,7 +134,7 @@ struct send_queue {  	struct scatterlist sg[MAX_SKB_FRAGS + 2];  	/* Name of the send queue: output.$index */ -	char name[40]; +	char name[16];  	struct virtnet_sq_stats stats; @@ -171,7 +171,7 @@ struct receive_queue {  	unsigned int min_buf_len;  	/* Name of this receive queue: input.$index */ -	char name[40]; +	char name[16];  	struct xdp_rxq_info xdp_rxq;  }; @@ -447,7 +447,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,  				   struct receive_queue *rq,  				   struct page *page, unsigned int offset,  				   unsigned int len, unsigned int truesize, -				   bool hdr_valid, unsigned int metasize,  				   unsigned int headroom)  {  	struct sk_buff *skb; @@ -466,21 +465,11 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,  	else  		hdr_padded_len = sizeof(struct padded_vnet_hdr); -	/* If headroom is not 0, there is an offset between the beginning of the -	 * data and the allocated space, otherwise the data and the allocated -	 * space are aligned. -	 * -	 * Buffers with headroom use PAGE_SIZE as alloc size, see -	 * add_recvbuf_mergeable() + get_mergeable_buf_len() -	 */ -	truesize = headroom ? PAGE_SIZE : truesize; -	tailroom = truesize - headroom;  	buf = p - headroom; -  	len -= hdr_len;  	offset += hdr_padded_len;  	p += hdr_padded_len; -	tailroom -= hdr_padded_len + len; +	tailroom = truesize - headroom  - hdr_padded_len - len;  	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); @@ -510,7 +499,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,  	if (len <= skb_tailroom(skb))  		copy = len;  	else -		copy = ETH_HLEN + metasize; +		copy = ETH_HLEN;  	skb_put_data(skb, p, copy);  	len -= copy; @@ -549,20 +538,93 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,  		give_pages(rq, page);  ok: -	/* hdr_valid means no XDP, so we can copy the vnet header */ -	if (hdr_valid) { -		hdr = skb_vnet_hdr(skb); -		memcpy(hdr, hdr_p, hdr_len); -	} +	hdr = skb_vnet_hdr(skb); +	memcpy(hdr, hdr_p, hdr_len);  	if (page_to_free)  		put_page(page_to_free); -	if (metasize) { -		__skb_pull(skb, metasize); -		skb_metadata_set(skb, metasize); +	return skb; +} + +static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi) +{ +	unsigned int len; +	unsigned int packets = 0; +	unsigned int bytes = 0; +	void *ptr; + +	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { +		if (likely(!is_xdp_frame(ptr))) { +			struct sk_buff *skb = ptr; + +			pr_debug("Sent skb %p\n", skb); + +			bytes += skb->len; +			napi_consume_skb(skb, in_napi); +		} else { +			struct xdp_frame *frame = ptr_to_xdp(ptr); + +			bytes += xdp_get_frame_len(frame); +			xdp_return_frame(frame); +		} +		packets++;  	} -	return skb; +	/* Avoid overhead when no packets have been processed +	 * happens when called speculatively from start_xmit. 
+	 */ +	if (!packets) +		return; + +	u64_stats_update_begin(&sq->stats.syncp); +	sq->stats.bytes += bytes; +	sq->stats.packets += packets; +	u64_stats_update_end(&sq->stats.syncp); +} + +static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q) +{ +	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs)) +		return false; +	else if (q < vi->curr_queue_pairs) +		return true; +	else +		return false; +} + +static void check_sq_full_and_disable(struct virtnet_info *vi, +				      struct net_device *dev, +				      struct send_queue *sq) +{ +	bool use_napi = sq->napi.weight; +	int qnum; + +	qnum = sq - vi->sq; + +	/* If running out of space, stop queue to avoid getting packets that we +	 * are then unable to transmit. +	 * An alternative would be to force queuing layer to requeue the skb by +	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be +	 * returned in a normal path of operation: it means that driver is not +	 * maintaining the TX queue stop/start state properly, and causes +	 * the stack to do a non-trivial amount of useless work. +	 * Since most packets only take 1 or 2 ring slots, stopping the queue +	 * early means 16 slots are typically wasted. +	 */ +	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { +		netif_stop_subqueue(dev, qnum); +		if (use_napi) { +			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) +				virtqueue_napi_schedule(&sq->napi, sq->vq); +		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { +			/* More just got used, free them then recheck. */ +			free_old_xmit_skbs(sq, false); +			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { +				netif_start_subqueue(dev, qnum); +				virtqueue_disable_cb(sq->vq); +			} +		} +	}  }  static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, @@ -570,22 +632,43 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,  				   struct xdp_frame *xdpf)  {  	struct virtio_net_hdr_mrg_rxbuf *hdr; -	int err; +	struct skb_shared_info *shinfo; +	u8 nr_frags = 0; +	int err, i;  	if (unlikely(xdpf->headroom < vi->hdr_len))  		return -EOVERFLOW; -	/* Make room for virtqueue hdr (also change xdpf->headroom?) */ +	if (unlikely(xdp_frame_has_frags(xdpf))) { +		shinfo = xdp_get_shared_info_from_frame(xdpf); +		nr_frags = shinfo->nr_frags; +	} + +	/* In wrapping function virtnet_xdp_xmit(), we need to free +	 * up the pending old buffers, where we need to calculate the +	 * position of skb_shared_info in xdp_get_frame_len() and +	 * xdp_return_frame(), which will involve to xdpf->data and +	 * xdpf->headroom. Therefore, we need to update the value of +	 * headroom synchronously here. 
+	 */ +	xdpf->headroom -= vi->hdr_len;  	xdpf->data -= vi->hdr_len;  	/* Zero header and leave csum up to XDP layers */  	hdr = xdpf->data;  	memset(hdr, 0, vi->hdr_len);  	xdpf->len   += vi->hdr_len; -	sg_init_one(sq->sg, xdpf->data, xdpf->len); +	sg_init_table(sq->sg, nr_frags + 1); +	sg_set_buf(sq->sg, xdpf->data, xdpf->len); +	for (i = 0; i < nr_frags; i++) { +		skb_frag_t *frag = &shinfo->frags[i]; + +		sg_set_page(&sq->sg[i + 1], skb_frag_page(frag), +			    skb_frag_size(frag), skb_frag_off(frag)); +	} -	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp_to_ptr(xdpf), -				   GFP_ATOMIC); +	err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1, +				   xdp_to_ptr(xdpf), GFP_ATOMIC);  	if (unlikely(err))  		return -ENOSPC; /* Caller handle free/refcnt */ @@ -665,7 +748,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,  		if (likely(is_xdp_frame(ptr))) {  			struct xdp_frame *frame = ptr_to_xdp(ptr); -			bytes += frame->len; +			bytes += xdp_get_frame_len(frame);  			xdp_return_frame(frame);  		} else {  			struct sk_buff *skb = ptr; @@ -685,6 +768,9 @@ static int virtnet_xdp_xmit(struct net_device *dev,  	}  	ret = nxmit; +	if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq)) +		check_sq_full_and_disable(vi, dev, sq); +  	if (flags & XDP_XMIT_FLUSH) {  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))  			kicks = 1; @@ -722,7 +808,7 @@ static unsigned int virtnet_get_headroom(struct virtnet_info *vi)   * have enough headroom.   */  static struct page *xdp_linearize_page(struct receive_queue *rq, -				       u16 *num_buf, +				       int *num_buf,  				       struct page *p,  				       int offset,  				       int page_off, @@ -822,7 +908,7 @@ static struct sk_buff *receive_small(struct net_device *dev,  		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {  			int offset = buf - page_address(page) + header_offset;  			unsigned int tlen = len + vi->hdr_len; -			u16 num_buf = 1; +			int num_buf = 1;  			xdp_headroom = virtnet_get_headroom(vi);  			header_offset = VIRTNET_RX_PAD + xdp_headroom; @@ -924,7 +1010,7 @@ static struct sk_buff *receive_big(struct net_device *dev,  {  	struct page *page = buf;  	struct sk_buff *skb = -		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0); +		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, 0);  	stats->bytes += len - vi->hdr_len;  	if (unlikely(!skb)) @@ -938,6 +1024,143 @@ err:  	return NULL;  } +/* Why not use xdp_build_skb_from_frame() ? + * XDP core assumes that xdp frags are PAGE_SIZE in length, while in + * virtio-net there are 2 points that do not match its requirements: + *  1. The size of the prefilled buffer is not fixed before xdp is set. + *  2. xdp_build_skb_from_frame() does more checks that we don't need, + *     like eth_type_trans() (which virtio-net does in receive_buf()). 
+ */ +static struct sk_buff *build_skb_from_xdp_buff(struct net_device *dev, +					       struct virtnet_info *vi, +					       struct xdp_buff *xdp, +					       unsigned int xdp_frags_truesz) +{ +	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); +	unsigned int headroom, data_len; +	struct sk_buff *skb; +	int metasize; +	u8 nr_frags; + +	if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { +		pr_debug("Error building skb as missing reserved tailroom for xdp"); +		return NULL; +	} + +	if (unlikely(xdp_buff_has_frags(xdp))) +		nr_frags = sinfo->nr_frags; + +	skb = build_skb(xdp->data_hard_start, xdp->frame_sz); +	if (unlikely(!skb)) +		return NULL; + +	headroom = xdp->data - xdp->data_hard_start; +	data_len = xdp->data_end - xdp->data; +	skb_reserve(skb, headroom); +	__skb_put(skb, data_len); + +	metasize = xdp->data - xdp->data_meta; +	metasize = metasize > 0 ? metasize : 0; +	if (metasize) +		skb_metadata_set(skb, metasize); + +	if (unlikely(xdp_buff_has_frags(xdp))) +		xdp_update_skb_shared_info(skb, nr_frags, +					   sinfo->xdp_frags_size, +					   xdp_frags_truesz, +					   xdp_buff_is_frag_pfmemalloc(xdp)); + +	return skb; +} + +/* TODO: build xdp in big mode */ +static int virtnet_build_xdp_buff_mrg(struct net_device *dev, +				      struct virtnet_info *vi, +				      struct receive_queue *rq, +				      struct xdp_buff *xdp, +				      void *buf, +				      unsigned int len, +				      unsigned int frame_sz, +				      int *num_buf, +				      unsigned int *xdp_frags_truesize, +				      struct virtnet_rq_stats *stats) +{ +	struct virtio_net_hdr_mrg_rxbuf *hdr = buf; +	unsigned int headroom, tailroom, room; +	unsigned int truesize, cur_frag_size; +	struct skb_shared_info *shinfo; +	unsigned int xdp_frags_truesz = 0; +	struct page *page; +	skb_frag_t *frag; +	int offset; +	void *ctx; + +	xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); +	xdp_prepare_buff(xdp, buf - VIRTIO_XDP_HEADROOM, +			 VIRTIO_XDP_HEADROOM + vi->hdr_len, len - vi->hdr_len, true); + +	if (!*num_buf) +		return 0; + +	if (*num_buf > 1) { +		/* If we want to build multi-buffer xdp, we need +		 * to specify that the flags of xdp_buff have the +		 * XDP_FLAGS_HAS_FRAG bit. +		 */ +		if (!xdp_buff_has_frags(xdp)) +			xdp_buff_set_frags_flag(xdp); + +		shinfo = xdp_get_shared_info_from_buff(xdp); +		shinfo->nr_frags = 0; +		shinfo->xdp_frags_size = 0; +	} + +	if (*num_buf > MAX_SKB_FRAGS + 1) +		return -EINVAL; + +	while (--*num_buf > 0) { +		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx); +		if (unlikely(!buf)) { +			pr_debug("%s: rx error: %d buffers out of %d missing\n", +				 dev->name, *num_buf, +				 virtio16_to_cpu(vi->vdev, hdr->num_buffers)); +			dev->stats.rx_length_errors++; +			return -EINVAL; +		} + +		stats->bytes += len; +		page = virt_to_head_page(buf); +		offset = buf - page_address(page); + +		truesize = mergeable_ctx_to_truesize(ctx); +		headroom = mergeable_ctx_to_headroom(ctx); +		tailroom = headroom ? 
sizeof(struct skb_shared_info) : 0; +		room = SKB_DATA_ALIGN(headroom + tailroom); + +		cur_frag_size = truesize; +		xdp_frags_truesz += cur_frag_size; +		if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { +			put_page(page); +			pr_debug("%s: rx error: len %u exceeds truesize %lu\n", +				 dev->name, len, (unsigned long)(truesize - room)); +			dev->stats.rx_length_errors++; +			return -EINVAL; +		} + +		frag = &shinfo->frags[shinfo->nr_frags++]; +		__skb_frag_set_page(frag, page); +		skb_frag_off_set(frag, offset); +		skb_frag_size_set(frag, len); +		if (page_is_pfmemalloc(page)) +			xdp_buff_set_frag_pfmemalloc(xdp); + +		shinfo->xdp_frags_size += len; +	} + +	*xdp_frags_truesize = xdp_frags_truesz; +	return 0; +} +  static struct sk_buff *receive_mergeable(struct net_device *dev,  					 struct virtnet_info *vi,  					 struct receive_queue *rq, @@ -948,23 +1171,24 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,  					 struct virtnet_rq_stats *stats)  {  	struct virtio_net_hdr_mrg_rxbuf *hdr = buf; -	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); +	int num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);  	struct page *page = virt_to_head_page(buf);  	int offset = buf - page_address(page);  	struct sk_buff *head_skb, *curr_skb;  	struct bpf_prog *xdp_prog;  	unsigned int truesize = mergeable_ctx_to_truesize(ctx);  	unsigned int headroom = mergeable_ctx_to_headroom(ctx); -	unsigned int metasize = 0; -	unsigned int frame_sz; +	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0; +	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom); +	unsigned int frame_sz, xdp_room;  	int err;  	head_skb = NULL;  	stats->bytes += len - vi->hdr_len; -	if (unlikely(len > truesize)) { +	if (unlikely(len > truesize - room)) {  		pr_debug("%s: rx error: len %u exceeds truesize %lu\n", -			 dev->name, len, (unsigned long)ctx); +			 dev->name, len, (unsigned long)(truesize - room));  		dev->stats.rx_length_errors++;  		goto err_skb;  	} @@ -977,11 +1201,14 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,  	rcu_read_lock();  	xdp_prog = rcu_dereference(rq->xdp_prog);  	if (xdp_prog) { +		unsigned int xdp_frags_truesz = 0; +		struct skb_shared_info *shinfo;  		struct xdp_frame *xdpf;  		struct page *xdp_page;  		struct xdp_buff xdp;  		void *data;  		u32 act; +		int i;  		/* Transient failure which in theory could occur if  		 * in-flight packets from before XDP was enabled reach @@ -990,19 +1217,23 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,  		if (unlikely(hdr->hdr.gso_type))  			goto err_xdp; -		/* Buffers with headroom use PAGE_SIZE as alloc size, -		 * see add_recvbuf_mergeable() + get_mergeable_buf_len() +		/* Now XDP core assumes frag size is PAGE_SIZE, but buffers +		 * with headroom may add hole in truesize, which +		 * make their length exceed PAGE_SIZE. So we disabled the +		 * hole mechanism for xdp. See add_recvbuf_mergeable().  		 */ -		frame_sz = headroom ? PAGE_SIZE : truesize; - -		/* This happens when rx buffer size is underestimated -		 * or headroom is not enough because of the buffer -		 * was refilled before XDP is set. This should only -		 * happen for the first several packets, so we don't -		 * care much about its performance. +		frame_sz = truesize; + +		/* This happens when headroom is not enough because +		 * of the buffer was prefilled before XDP is set. +		 * This should only happen for the first several packets. 
+		 * In fact, vq reset can be used here to help us clean up +		 * the prefilled buffers, but many existing devices do not +		 * support it, and we don't want to bother users who are +		 * using xdp normally.  		 */ -		if (unlikely(num_buf > 1 || -			     headroom < virtnet_get_headroom(vi))) { +		if (!xdp_prog->aux->xdp_has_frags && +		    (num_buf > 1 || headroom < virtnet_get_headroom(vi))) {  			/* linearize data for XDP */  			xdp_page = xdp_linearize_page(rq, &num_buf,  						      page, offset, @@ -1013,82 +1244,56 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,  			if (!xdp_page)  				goto err_xdp;  			offset = VIRTIO_XDP_HEADROOM; +		} else if (unlikely(headroom < virtnet_get_headroom(vi))) { +			xdp_room = SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM + +						  sizeof(struct skb_shared_info)); +			if (len + xdp_room > PAGE_SIZE) +				goto err_xdp; + +			xdp_page = alloc_page(GFP_ATOMIC); +			if (!xdp_page) +				goto err_xdp; + +			memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM, +			       page_address(page) + offset, len); +			frame_sz = PAGE_SIZE; +			offset = VIRTIO_XDP_HEADROOM;  		} else {  			xdp_page = page;  		} -		/* Allow consuming headroom but reserve enough space to push -		 * the descriptor on if we get an XDP_TX return code. -		 */  		data = page_address(xdp_page) + offset; -		xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq); -		xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len, -				 VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true); +		err = virtnet_build_xdp_buff_mrg(dev, vi, rq, &xdp, data, len, frame_sz, +						 &num_buf, &xdp_frags_truesz, stats); +		if (unlikely(err)) +			goto err_xdp_frags;  		act = bpf_prog_run_xdp(xdp_prog, &xdp);  		stats->xdp_packets++;  		switch (act) {  		case XDP_PASS: -			metasize = xdp.data - xdp.data_meta; - -			/* recalculate offset to account for any header -			 * adjustments and minus the metasize to copy the -			 * metadata in page_to_skb(). Note other cases do not -			 * build an skb and avoid using offset -			 */ -			offset = xdp.data - page_address(xdp_page) - -				 vi->hdr_len - metasize; +			head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); +			if (unlikely(!head_skb)) +				goto err_xdp_frags; -			/* recalculate len if xdp.data, xdp.data_end or -			 * xdp.data_meta were adjusted -			 */ -			len = xdp.data_end - xdp.data + vi->hdr_len + metasize; - -			/* recalculate headroom if xdp.data or xdp_data_meta -			 * were adjusted, note that offset should always point -			 * to the start of the reserved bytes for virtio_net -			 * header which are followed by xdp.data, that means -			 * that offset is equal to the headroom (when buf is -			 * starting at the beginning of the page, otherwise -			 * there is a base offset inside the page) but it's used -			 * with a different starting point (buf start) than -			 * xdp.data (buf start + vnet hdr size). If xdp.data or -			 * data_meta were adjusted by the xdp prog then the -			 * headroom size has changed and so has the offset, we -			 * can use data_hard_start, which points at buf start + -			 * vnet hdr size, to calculate the new headroom and use -			 * it later to compute buf start in page_to_skb() -			 */ -			headroom = xdp.data - xdp.data_hard_start - metasize; - -			/* We can only create skb based on xdp_page. 
*/ -			if (unlikely(xdp_page != page)) { -				rcu_read_unlock(); +			if (unlikely(xdp_page != page))  				put_page(page); -				head_skb = page_to_skb(vi, rq, xdp_page, offset, -						       len, PAGE_SIZE, false, -						       metasize, -						       headroom); -				return head_skb; -			} -			break; +			rcu_read_unlock(); +			return head_skb;  		case XDP_TX:  			stats->xdp_tx++;  			xdpf = xdp_convert_buff_to_frame(&xdp);  			if (unlikely(!xdpf)) { -				if (unlikely(xdp_page != page)) -					put_page(xdp_page); -				goto err_xdp; +				netdev_dbg(dev, "convert buff to frame failed for xdp\n"); +				goto err_xdp_frags;  			}  			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);  			if (unlikely(!err)) {  				xdp_return_frame_rx_napi(xdpf);  			} else if (unlikely(err < 0)) {  				trace_xdp_exception(vi->dev, xdp_prog, act); -				if (unlikely(xdp_page != page)) -					put_page(xdp_page); -				goto err_xdp; +				goto err_xdp_frags;  			}  			*xdp_xmit |= VIRTIO_XDP_TX;  			if (unlikely(xdp_page != page)) @@ -1098,11 +1303,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,  		case XDP_REDIRECT:  			stats->xdp_redirects++;  			err = xdp_do_redirect(dev, &xdp, xdp_prog); -			if (err) { -				if (unlikely(xdp_page != page)) -					put_page(xdp_page); -				goto err_xdp; -			} +			if (err) +				goto err_xdp_frags;  			*xdp_xmit |= VIRTIO_XDP_REDIR;  			if (unlikely(xdp_page != page))  				put_page(page); @@ -1115,16 +1317,26 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,  			trace_xdp_exception(vi->dev, xdp_prog, act);  			fallthrough;  		case XDP_DROP: -			if (unlikely(xdp_page != page)) -				__free_pages(xdp_page, 0); -			goto err_xdp; +			goto err_xdp_frags; +		} +err_xdp_frags: +		if (unlikely(xdp_page != page)) +			__free_pages(xdp_page, 0); + +		if (xdp_buff_has_frags(&xdp)) { +			shinfo = xdp_get_shared_info_from_buff(&xdp); +			for (i = 0; i < shinfo->nr_frags; i++) { +				xdp_page = skb_frag_page(&shinfo->frags[i]); +				put_page(xdp_page); +			}  		} + +		goto err_xdp;  	}  	rcu_read_unlock();  skip_xdp: -	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog, -			       metasize, headroom); +	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);  	curr_skb = head_skb;  	if (unlikely(!curr_skb)) @@ -1146,9 +1358,12 @@ skip_xdp:  		page = virt_to_head_page(buf);  		truesize = mergeable_ctx_to_truesize(ctx); -		if (unlikely(len > truesize)) { +		headroom = mergeable_ctx_to_headroom(ctx); +		tailroom = headroom ? 
sizeof(struct skb_shared_info) : 0; +		room = SKB_DATA_ALIGN(headroom + tailroom); +		if (unlikely(len > truesize - room)) {  			pr_debug("%s: rx error: len %u exceeds truesize %lu\n", -				 dev->name, len, (unsigned long)ctx); +				 dev->name, len, (unsigned long)(truesize - room));  			dev->stats.rx_length_errors++;  			goto err_skb;  		} @@ -1251,13 +1466,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,  	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {  		pr_debug("%s: short packet %i\n", dev->name, len);  		dev->stats.rx_length_errors++; -		if (vi->mergeable_rx_bufs) { -			put_page(virt_to_head_page(buf)); -		} else if (vi->big_packets) { -			give_pages(rq, buf); -		} else { -			put_page(virt_to_head_page(buf)); -		} +		virtnet_rq_free_unused_buf(rq->vq, buf);  		return;  	} @@ -1426,13 +1635,16 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,  		/* To avoid internal fragmentation, if there is very likely not  		 * enough space for another buffer, add the remaining space to  		 * the current buffer. +		 * XDP core assumes that frame_size of xdp_buff and the length +		 * of the frag are PAGE_SIZE, so we disable the hole mechanism.  		 */ -		len += hole; +		if (!headroom) +			len += hole;  		alloc_frag->offset += hole;  	}  	sg_init_one(rq->sg, buf, len); -	ctx = mergeable_len_to_ctx(len, headroom); +	ctx = mergeable_len_to_ctx(len + room, headroom);  	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);  	if (err < 0)  		put_page(virt_to_head_page(buf)); @@ -1590,52 +1802,6 @@ static int virtnet_receive(struct receive_queue *rq, int budget,  	return stats.packets;  } -static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi) -{ -	unsigned int len; -	unsigned int packets = 0; -	unsigned int bytes = 0; -	void *ptr; - -	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { -		if (likely(!is_xdp_frame(ptr))) { -			struct sk_buff *skb = ptr; - -			pr_debug("Sent skb %p\n", skb); - -			bytes += skb->len; -			napi_consume_skb(skb, in_napi); -		} else { -			struct xdp_frame *frame = ptr_to_xdp(ptr); - -			bytes += frame->len; -			xdp_return_frame(frame); -		} -		packets++; -	} - -	/* Avoid overhead when no packets have been processed -	 * happens when called speculatively from start_xmit. -	 */ -	if (!packets) -		return; - -	u64_stats_update_begin(&sq->stats.syncp); -	sq->stats.bytes += bytes; -	sq->stats.packets += packets; -	u64_stats_update_end(&sq->stats.syncp); -} - -static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q) -{ -	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs)) -		return false; -	else if (q < vi->curr_queue_pairs) -		return true; -	else -		return false; -} -  static void virtnet_poll_cleantx(struct receive_queue *rq)  {  	struct virtnet_info *vi = rq->vq->vdev->priv; @@ -1677,13 +1843,13 @@ static int virtnet_poll(struct napi_struct *napi, int budget)  	received = virtnet_receive(rq, budget, &xdp_xmit); +	if (xdp_xmit & VIRTIO_XDP_REDIR) +		xdp_do_flush(); +  	/* Out of packets? */  	if (received < budget)  		virtqueue_napi_complete(napi, rq->vq, received); -	if (xdp_xmit & VIRTIO_XDP_REDIR) -		xdp_do_flush(); -  	if (xdp_xmit & VIRTIO_XDP_TX) {  		sq = virtnet_xdp_get_sq(vi);  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { @@ -1865,28 +2031,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)  		nf_reset_ct(skb);  	} -	/* If running out of space, stop queue to avoid getting packets that we -	 * are then unable to transmit. 
-	 * An alternative would be to force queuing layer to requeue the skb by -	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be -	 * returned in a normal path of operation: it means that driver is not -	 * maintaining the TX queue stop/start state properly, and causes -	 * the stack to do a non-trivial amount of useless work. -	 * Since most packets only take 1 or 2 ring slots, stopping the queue -	 * early means 16 slots are typically wasted. -	 */ -	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { -		netif_stop_subqueue(dev, qnum); -		if (!use_napi && -		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { -			/* More just got used, free them then recheck. */ -			free_old_xmit_skbs(sq, false); -			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { -				netif_start_subqueue(dev, qnum); -				virtqueue_disable_cb(sq->vq); -			} -		} -	} +	check_sq_full_and_disable(vi, dev, sq);  	if (kick || netif_xmit_stopped(txq)) {  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { @@ -2156,9 +2301,9 @@ static int virtnet_close(struct net_device *dev)  	cancel_delayed_work_sync(&vi->refill);  	for (i = 0; i < vi->max_queue_pairs; i++) { -		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); -		napi_disable(&vi->rq[i].napi);  		virtnet_napi_tx_disable(&vi->sq[i].napi); +		napi_disable(&vi->rq[i].napi); +		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);  	}  	return 0; @@ -3078,7 +3223,9 @@ static int virtnet_restore_guest_offloads(struct virtnet_info *vi)  static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,  			   struct netlink_ext_ack *extack)  { -	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr); +	unsigned int room = SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM + +					   sizeof(struct skb_shared_info)); +	unsigned int max_sz = PAGE_SIZE - room - ETH_HLEN;  	struct virtnet_info *vi = netdev_priv(dev);  	struct bpf_prog *old_prog;  	u16 xdp_qp = 0, curr_qp; @@ -3101,9 +3248,9 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,  		return -EINVAL;  	} -	if (dev->mtu > max_sz) { -		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP"); -		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz); +	if (prog && !prog->aux->xdp_has_frags && dev->mtu > max_sz) { +		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP without frags"); +		netdev_warn(dev, "single-buffer XDP requires MTU less than %u\n", max_sz);  		return -EINVAL;  	} @@ -3155,7 +3302,10 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,  			if (i == 0 && !old_prog)  				virtnet_clear_guest_offloads(vi);  		} +		if (!old_prog) +			xdp_features_set_redirect_target(dev, true);  	} else { +		xdp_features_clear_redirect_target(dev);  		vi->xdp_enabled = false;  	} @@ -3688,6 +3838,12 @@ static int virtnet_validate(struct virtio_device *vdev)  			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);  	} +	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY) && +	    !virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { +		dev_warn(&vdev->dev, "device advertises feature VIRTIO_NET_F_STANDBY but not VIRTIO_NET_F_MAC, disabling standby"); +		__virtio_clear_bit(vdev, VIRTIO_NET_F_STANDBY); +	} +  	return 0;  } @@ -3785,6 +3941,7 @@ static int virtnet_probe(struct virtio_device *vdev)  		dev->hw_features |= NETIF_F_GRO_HW;  	dev->vlan_features = dev->features; +	dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT;  	/* MTU range: 68 - 65535 */  	dev->min_mtu = MIN_MTU; @@ -3800,6 +3957,8 @@ static int virtnet_probe(struct virtio_device *vdev)  		eth_hw_addr_set(dev, 
addr);  	} else {  		eth_hw_addr_random(dev); +		dev_info(&vdev->dev, "Assigned random MAC address %pM\n", +			 dev->dev_addr);  	}  	/* Set up our device-specific information */ @@ -3811,8 +3970,10 @@ static int virtnet_probe(struct virtio_device *vdev)  	INIT_WORK(&vi->config_work, virtnet_config_changed_work);  	spin_lock_init(&vi->refill_lock); -	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) +	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {  		vi->mergeable_rx_bufs = true; +		dev->xdp_features |= NETDEV_XDP_ACT_RX_SG; +	}  	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) {  		vi->rx_usecs = 0; @@ -3927,6 +4088,24 @@ static int virtnet_probe(struct virtio_device *vdev)  	virtio_device_ready(vdev); +	/* a random MAC address has been assigned, notify the device. +	 * We don't fail probe if VIRTIO_NET_F_CTRL_MAC_ADDR is not there +	 * because many devices work fine without getting MAC explicitly +	 */ +	if (!virtio_has_feature(vdev, VIRTIO_NET_F_MAC) && +	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { +		struct scatterlist sg; + +		sg_init_one(&sg, dev->dev_addr, dev->addr_len); +		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, +					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) { +			pr_debug("virtio_net: setting MAC address failed\n"); +			rtnl_unlock(); +			err = -EINVAL; +			goto free_unregister_netdev; +		} +	} +  	rtnl_unlock();  	err = virtnet_cpu_notif_add(vi); |