Diffstat (limited to 'io_uring/net.c')
| -rw-r--r-- | io_uring/net.c | 875 | 
1 file changed, 513 insertions, 362 deletions
diff --git a/io_uring/net.c b/io_uring/net.c index 1e7665ff6ef7..0a48596429d9 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -28,6 +28,7 @@ struct io_accept {  	struct sockaddr __user		*addr;  	int __user			*addr_len;  	int				flags; +	int				iou_flags;  	u32				file_slot;  	unsigned long			nofile;  }; @@ -57,7 +58,7 @@ struct io_sr_msg {  		struct user_msghdr __user	*umsg;  		void __user			*buf;  	}; -	unsigned			len; +	int				len;  	unsigned			done_io;  	unsigned			msg_flags;  	unsigned			nr_multishot_loops; @@ -115,80 +116,85 @@ static bool io_net_retry(struct socket *sock, int flags)  	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;  } +static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg) +{ +	if (kmsg->free_iov) { +		kfree(kmsg->free_iov); +		kmsg->free_iov_nr = 0; +		kmsg->free_iov = NULL; +	} +} +  static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_async_msghdr *hdr = req->async_data; +	struct iovec *iov; -	if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED) +	/* can't recycle, ensure we free the iovec if we have one */ +	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { +		io_netmsg_iovec_free(hdr);  		return; +	}  	/* Let normal cleanup path reap it if we fail adding to the cache */ -	if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) { +	iov = hdr->free_iov; +	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { +		if (iov) +			kasan_mempool_poison_object(iov);  		req->async_data = NULL;  		req->flags &= ~REQ_F_ASYNC_DATA;  	}  } -static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req, -						  unsigned int issue_flags) +static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)  {  	struct io_ring_ctx *ctx = req->ctx; -	struct io_cache_entry *entry;  	struct io_async_msghdr *hdr; -	if (!(issue_flags & IO_URING_F_UNLOCKED)) { -		entry = io_alloc_cache_get(&ctx->netmsg_cache); -		if (entry) { -			hdr = container_of(entry, struct io_async_msghdr, cache); -			hdr->free_iov = NULL; -			req->flags |= REQ_F_ASYNC_DATA; -			req->async_data = hdr; -			return hdr; +	hdr = io_alloc_cache_get(&ctx->netmsg_cache); +	if (hdr) { +		if (hdr->free_iov) { +			kasan_mempool_unpoison_object(hdr->free_iov, +				hdr->free_iov_nr * sizeof(struct iovec)); +			req->flags |= REQ_F_NEED_CLEANUP;  		} +		req->flags |= REQ_F_ASYNC_DATA; +		req->async_data = hdr; +		return hdr;  	}  	if (!io_alloc_async_data(req)) {  		hdr = req->async_data; +		hdr->free_iov_nr = 0;  		hdr->free_iov = NULL;  		return hdr;  	}  	return NULL;  } -static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req) +/* assign new iovec to kmsg, if we need to */ +static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, +			     struct iovec *iov)  { -	/* ->prep_async is always called from the submission context */ -	return io_msg_alloc_async(req, 0); +	if (iov) { +		req->flags |= REQ_F_NEED_CLEANUP; +		kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs; +		if (kmsg->free_iov) +			kfree(kmsg->free_iov); +		kmsg->free_iov = iov; +	} +	return 0;  } -static int io_setup_async_msg(struct io_kiocb *req, -			      struct io_async_msghdr *kmsg, -			      unsigned int issue_flags) +static inline void io_mshot_prep_retry(struct io_kiocb *req, +				       struct io_async_msghdr *kmsg)  { -	struct io_async_msghdr *async_msg; - -	if (req_has_async_data(req)) -		return -EAGAIN; -	async_msg = io_msg_alloc_async(req, issue_flags); -	if (!async_msg) { -		kfree(kmsg->free_iov); -		
return -ENOMEM; -	} -	req->flags |= REQ_F_NEED_CLEANUP; -	memcpy(async_msg, kmsg, sizeof(*kmsg)); -	if (async_msg->msg.msg_name) -		async_msg->msg.msg_name = &async_msg->addr; - -	if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs) -		return -EAGAIN; - -	/* if were using fast_iov, set it to the new one */ -	if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { -		size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov; -		async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx]; -	} +	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); -	return -EAGAIN; +	req->flags &= ~REQ_F_BL_EMPTY; +	sr->done_io = 0; +	sr->len = 0; /* get from the provided buffer */ +	req->buf_index = sr->buf_group;  }  #ifdef CONFIG_COMPAT @@ -198,7 +204,16 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,  {  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);  	struct compat_iovec __user *uiov; -	int ret; +	struct iovec *iov; +	int ret, nr_segs; + +	if (iomsg->free_iov) { +		nr_segs = iomsg->free_iov_nr; +		iov = iomsg->free_iov; +	} else { +		iov = &iomsg->fast_iov; +		nr_segs = 1; +	}  	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))  		return -EFAULT; @@ -207,9 +222,9 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,  	if (req->flags & REQ_F_BUFFER_SELECT) {  		compat_ssize_t clen; -		iomsg->free_iov = NULL;  		if (msg->msg_iovlen == 0) { -			sr->len = 0; +			sr->len = iov->iov_len = 0; +			iov->iov_base = NULL;  		} else if (msg->msg_iovlen > 1) {  			return -EINVAL;  		} else { @@ -225,14 +240,12 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,  		return 0;  	} -	iomsg->free_iov = iomsg->fast_iov;  	ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, -				UIO_FASTIOV, &iomsg->free_iov, -				&iomsg->msg.msg_iter, true); +				nr_segs, &iov, &iomsg->msg.msg_iter, true);  	if (unlikely(ret < 0))  		return ret; -	return 0; +	return io_net_vec_assign(req, iomsg, iov);  }  #endif @@ -240,7 +253,16 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,  			   struct user_msghdr *msg, int ddir)  {  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); -	int ret; +	struct iovec *iov; +	int ret, nr_segs; + +	if (iomsg->free_iov) { +		nr_segs = iomsg->free_iov_nr; +		iov = iomsg->free_iov; +	} else { +		iov = &iomsg->fast_iov; +		nr_segs = 1; +	}  	if (!user_access_begin(sr->umsg, sizeof(*sr->umsg)))  		return -EFAULT; @@ -256,9 +278,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,  	if (req->flags & REQ_F_BUFFER_SELECT) {  		if (msg->msg_iovlen == 0) { -			sr->len = iomsg->fast_iov[0].iov_len = 0; -			iomsg->fast_iov[0].iov_base = NULL; -			iomsg->free_iov = NULL; +			sr->len = iov->iov_len = 0; +			iov->iov_base = NULL;  		} else if (msg->msg_iovlen > 1) {  			ret = -EINVAL;  			goto ua_end; @@ -266,10 +287,9 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,  			/* we only need the length for provided buffers */  			if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t)))  				goto ua_end; -			unsafe_get_user(iomsg->fast_iov[0].iov_len, -					&msg->msg_iov[0].iov_len, ua_end); -			sr->len = iomsg->fast_iov[0].iov_len; -			iomsg->free_iov = NULL; +			unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len, +					ua_end); +			sr->len = iov->iov_len;  		}  		ret = 0;  ua_end: @@ -278,13 +298,12 @@ ua_end:  	}  	user_access_end(); -	iomsg->free_iov = iomsg->fast_iov; -	ret = 
__import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV, -				&iomsg->free_iov, &iomsg->msg.msg_iter, false); +	ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, +				&iov, &iomsg->msg.msg_iter, false);  	if (unlikely(ret < 0))  		return ret; -	return 0; +	return io_net_vec_assign(req, iomsg, iov);  }  static int io_sendmsg_copy_hdr(struct io_kiocb *req, @@ -320,60 +339,58 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,  	return ret;  } -int io_send_prep_async(struct io_kiocb *req) +void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)  { -	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); -	struct io_async_msghdr *io; -	int ret; +	struct io_async_msghdr *io = req->async_data; -	if (req_has_async_data(req)) -		return 0; -	zc->done_io = 0; -	if (!zc->addr) -		return 0; -	io = io_msg_alloc_async_prep(req); -	if (!io) -		return -ENOMEM; -	ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr); -	return ret; +	io_netmsg_iovec_free(io);  } -static int io_setup_async_addr(struct io_kiocb *req, -			      struct sockaddr_storage *addr_storage, -			      unsigned int issue_flags) +static int io_send_setup(struct io_kiocb *req)  {  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); -	struct io_async_msghdr *io; +	struct io_async_msghdr *kmsg = req->async_data; +	int ret; -	if (!sr->addr || req_has_async_data(req)) -		return -EAGAIN; -	io = io_msg_alloc_async(req, issue_flags); -	if (!io) -		return -ENOMEM; -	memcpy(&io->addr, addr_storage, sizeof(io->addr)); -	return -EAGAIN; +	kmsg->msg.msg_name = NULL; +	kmsg->msg.msg_namelen = 0; +	kmsg->msg.msg_control = NULL; +	kmsg->msg.msg_controllen = 0; +	kmsg->msg.msg_ubuf = NULL; + +	if (sr->addr) { +		ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr); +		if (unlikely(ret < 0)) +			return ret; +		kmsg->msg.msg_name = &kmsg->addr; +		kmsg->msg.msg_namelen = sr->addr_len; +	} +	if (!io_do_buffer_select(req)) { +		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, +				  &kmsg->msg.msg_iter); +		if (unlikely(ret < 0)) +			return ret; +	} +	return 0;  } -int io_sendmsg_prep_async(struct io_kiocb *req) +static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg)  { -	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); +	struct io_async_msghdr *kmsg;  	int ret; -	sr->done_io = 0; -	if (!io_msg_alloc_async_prep(req)) +	kmsg = io_msg_alloc_async(req); +	if (unlikely(!kmsg))  		return -ENOMEM; -	ret = io_sendmsg_copy_hdr(req, req->async_data); +	if (!is_msg) +		return io_send_setup(req); +	ret = io_sendmsg_copy_hdr(req, kmsg);  	if (!ret)  		req->flags |= REQ_F_NEED_CLEANUP;  	return ret;  } -void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) -{ -	struct io_async_msghdr *io = req->async_data; - -	kfree(io->free_iov); -} +#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)  int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  { @@ -393,34 +410,114 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));  	sr->len = READ_ONCE(sqe->len);  	sr->flags = READ_ONCE(sqe->ioprio); -	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) +	if (sr->flags & ~SENDMSG_FLAGS)  		return -EINVAL;  	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;  	if (sr->msg_flags & MSG_DONTWAIT)  		req->flags |= REQ_F_NOWAIT; +	if (sr->flags & IORING_RECVSEND_BUNDLE) { +		if (req->opcode == IORING_OP_SENDMSG) +			return -EINVAL; +		if (!(req->flags & REQ_F_BUFFER_SELECT)) +			return 
-EINVAL; +		sr->msg_flags |= MSG_WAITALL; +		sr->buf_group = req->buf_index; +		req->buf_list = NULL; +	} +	if (req->flags & REQ_F_BUFFER_SELECT && sr->len) +		return -EINVAL;  #ifdef CONFIG_COMPAT  	if (req->ctx->compat)  		sr->msg_flags |= MSG_CMSG_COMPAT;  #endif -	return 0; +	return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG);  }  static void io_req_msg_cleanup(struct io_kiocb *req, -			       struct io_async_msghdr *kmsg,  			       unsigned int issue_flags)  {  	req->flags &= ~REQ_F_NEED_CLEANUP;  -	/* fast path, check for non-NULL to avoid function call */ -	if (kmsg->free_iov) -		kfree(kmsg->free_iov);  	io_netmsg_recycle(req, issue_flags);  }  +/* + * For bundle completions, we need to figure out how many segments we consumed. + * A bundle could be using a single ITER_UBUF if that's all we mapped, or it + * could be using an ITER_IOVEC. If the latter, then if we consumed all of + * the segments, then it's a trivial question to answer. If we have residual + * data in the iter, then loop the segments to figure out how much we + * transferred. + */ +static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) +{ +	struct iovec *iov; +	int nbufs; + +	/* no data is always zero segments, and a ubuf is always 1 segment */ +	if (ret <= 0) +		return 0; +	if (iter_is_ubuf(&kmsg->msg.msg_iter)) +		return 1; + +	iov = kmsg->free_iov; +	if (!iov) +		iov = &kmsg->fast_iov; + +	/* if all data was transferred, it's basic pointer math */ +	if (!iov_iter_count(&kmsg->msg.msg_iter)) +		return iter_iov(&kmsg->msg.msg_iter) - iov; + +	/* short transfer, count segments */ +	nbufs = 0; +	do { +		int this_len = min_t(int, iov[nbufs].iov_len, ret); + +		nbufs++; +		ret -= this_len; +	} while (ret); + +	return nbufs; +} + +static inline bool io_send_finish(struct io_kiocb *req, int *ret, +				  struct io_async_msghdr *kmsg, +				  unsigned issue_flags) +{ +	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); +	bool bundle_finished = *ret <= 0; +	unsigned int cflags; + +	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { +		cflags = io_put_kbuf(req, issue_flags); +		goto finish; +	} + +	cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags); + +	if (bundle_finished || req->flags & REQ_F_BL_EMPTY) +		goto finish; + +	/* +	 * Fill CQE for this receive and see if we should keep trying to +	 * receive from this socket. +	 */ +	if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { +		io_mshot_prep_retry(req, kmsg); +		return false; +	} + +	/* Otherwise stop bundle and use the current result. 
*/ +finish: +	io_req_set_res(req, *ret, cflags); +	*ret = IOU_OK; +	return true; +} +  int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); -	struct io_async_msghdr iomsg, *kmsg; +	struct io_async_msghdr *kmsg = req->async_data;  	struct socket *sock;  	unsigned flags;  	int min_ret = 0; @@ -430,19 +527,9 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)  	if (unlikely(!sock))  		return -ENOTSOCK; -	if (req_has_async_data(req)) { -		kmsg = req->async_data; -		kmsg->msg.msg_control_user = sr->msg_control; -	} else { -		ret = io_sendmsg_copy_hdr(req, &iomsg); -		if (ret) -			return ret; -		kmsg = &iomsg; -	} -  	if (!(req->flags & REQ_F_POLLED) &&  	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) -		return io_setup_async_msg(req, kmsg, issue_flags); +		return -EAGAIN;  	flags = sr->msg_flags;  	if (issue_flags & IO_URING_F_NONBLOCK) @@ -450,23 +537,25 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)  	if (flags & MSG_WAITALL)  		min_ret = iov_iter_count(&kmsg->msg.msg_iter); +	kmsg->msg.msg_control_user = sr->msg_control; +  	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);  	if (ret < min_ret) {  		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) -			return io_setup_async_msg(req, kmsg, issue_flags); +			return -EAGAIN;  		if (ret > 0 && io_net_retry(sock, flags)) {  			kmsg->msg.msg_controllen = 0;  			kmsg->msg.msg_control = NULL;  			sr->done_io += ret;  			req->flags |= REQ_F_BL_NO_RECYCLE; -			return io_setup_async_msg(req, kmsg, issue_flags); +			return -EAGAIN;  		}  		if (ret == -ERESTARTSYS)  			ret = -EINTR;  		req_set_fail(req);  	} -	io_req_msg_cleanup(req, kmsg, issue_flags); +	io_req_msg_cleanup(req, issue_flags);  	if (ret >= 0)  		ret += sr->done_io;  	else if (sr->done_io) @@ -477,65 +566,77 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)  int io_send(struct io_kiocb *req, unsigned int issue_flags)  { -	struct sockaddr_storage __address;  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); -	struct msghdr msg; +	struct io_async_msghdr *kmsg = req->async_data;  	struct socket *sock;  	unsigned flags;  	int min_ret = 0;  	int ret; -	msg.msg_name = NULL; -	msg.msg_control = NULL; -	msg.msg_controllen = 0; -	msg.msg_namelen = 0; -	msg.msg_ubuf = NULL; - -	if (sr->addr) { -		if (req_has_async_data(req)) { -			struct io_async_msghdr *io = req->async_data; - -			msg.msg_name = &io->addr; -		} else { -			ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address); -			if (unlikely(ret < 0)) -				return ret; -			msg.msg_name = (struct sockaddr *)&__address; -		} -		msg.msg_namelen = sr->addr_len; -	} - -	if (!(req->flags & REQ_F_POLLED) && -	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) -		return io_setup_async_addr(req, &__address, issue_flags); -  	sock = sock_from_file(req->file);  	if (unlikely(!sock))  		return -ENOTSOCK; -	ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter); -	if (unlikely(ret)) -		return ret; +	if (!(req->flags & REQ_F_POLLED) && +	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) +		return -EAGAIN;  	flags = sr->msg_flags;  	if (issue_flags & IO_URING_F_NONBLOCK)  		flags |= MSG_DONTWAIT; -	if (flags & MSG_WAITALL) -		min_ret = iov_iter_count(&msg.msg_iter); + +retry_bundle: +	if (io_do_buffer_select(req)) { +		struct buf_sel_arg arg = { +			.iovs = &kmsg->fast_iov, +			.max_len = INT_MAX, +			.nr_iovs = 1, +			.mode = KBUF_MODE_EXPAND, +		}; + +		if (kmsg->free_iov) { +			arg.nr_iovs = 
kmsg->free_iov_nr; +			arg.iovs = kmsg->free_iov; +			arg.mode |= KBUF_MODE_FREE; +		} + +		if (!(sr->flags & IORING_RECVSEND_BUNDLE)) +			arg.nr_iovs = 1; + +		ret = io_buffers_select(req, &arg, issue_flags); +		if (unlikely(ret < 0)) +			return ret; + +		sr->len = arg.out_len; +		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret, +				arg.out_len); +		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { +			kmsg->free_iov_nr = ret; +			kmsg->free_iov = arg.iovs; +		} +	} + +	/* +	 * If MSG_WAITALL is set, or this is a bundle send, then we need +	 * the full amount. If just bundle is set, if we do a short send +	 * then we complete the bundle sequence rather than continue on. +	 */ +	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE) +		min_ret = iov_iter_count(&kmsg->msg.msg_iter);  	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; -	msg.msg_flags = flags; -	ret = sock_sendmsg(sock, &msg); +	kmsg->msg.msg_flags = flags; +	ret = sock_sendmsg(sock, &kmsg->msg);  	if (ret < min_ret) {  		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) -			return io_setup_async_addr(req, &__address, issue_flags); +			return -EAGAIN;  		if (ret > 0 && io_net_retry(sock, flags)) {  			sr->len -= ret;  			sr->buf += ret;  			sr->done_io += ret;  			req->flags |= REQ_F_BL_NO_RECYCLE; -			return io_setup_async_addr(req, &__address, issue_flags); +			return -EAGAIN;  		}  		if (ret == -ERESTARTSYS)  			ret = -EINTR; @@ -545,8 +646,12 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)  		ret += sr->done_io;  	else if (sr->done_io)  		ret = sr->done_io; -	io_req_set_res(req, ret, 0); -	return IOU_OK; + +	if (!io_send_finish(req, &ret, kmsg, issue_flags)) +		goto retry_bundle; + +	io_req_msg_cleanup(req, issue_flags); +	return ret;  }  static int io_recvmsg_mshot_prep(struct io_kiocb *req, @@ -611,23 +716,42 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,  					msg.msg_controllen);  } -int io_recvmsg_prep_async(struct io_kiocb *req) +static int io_recvmsg_prep_setup(struct io_kiocb *req)  {  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); -	struct io_async_msghdr *iomsg; +	struct io_async_msghdr *kmsg;  	int ret; -	sr->done_io = 0; -	if (!io_msg_alloc_async_prep(req)) +	kmsg = io_msg_alloc_async(req); +	if (unlikely(!kmsg))  		return -ENOMEM; -	iomsg = req->async_data; -	ret = io_recvmsg_copy_hdr(req, iomsg); + +	if (req->opcode == IORING_OP_RECV) { +		kmsg->msg.msg_name = NULL; +		kmsg->msg.msg_namelen = 0; +		kmsg->msg.msg_control = NULL; +		kmsg->msg.msg_get_inq = 1; +		kmsg->msg.msg_controllen = 0; +		kmsg->msg.msg_iocb = NULL; +		kmsg->msg.msg_ubuf = NULL; + +		if (!io_do_buffer_select(req)) { +			ret = import_ubuf(ITER_DEST, sr->buf, sr->len, +					  &kmsg->msg.msg_iter); +			if (unlikely(ret)) +				return ret; +		} +		return 0; +	} + +	ret = io_recvmsg_copy_hdr(req, kmsg);  	if (!ret)  		req->flags |= REQ_F_NEED_CLEANUP;  	return ret;  } -#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT) +#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ +			IORING_RECVSEND_BUNDLE)  int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  { @@ -641,21 +765,14 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));  	sr->len = READ_ONCE(sqe->len);  	sr->flags = READ_ONCE(sqe->ioprio); -	if (sr->flags & ~(RECVMSG_FLAGS)) +	if (sr->flags & ~RECVMSG_FLAGS)  		return -EINVAL;  	sr->msg_flags = READ_ONCE(sqe->msg_flags); 
 	if (sr->msg_flags & MSG_DONTWAIT)  		req->flags |= REQ_F_NOWAIT;  	if (sr->msg_flags & MSG_ERRQUEUE)  		req->flags |= REQ_F_CLEAR_POLLIN; -	if (sr->flags & IORING_RECV_MULTISHOT) { -		if (!(req->flags & REQ_F_BUFFER_SELECT)) -			return -EINVAL; -		if (sr->msg_flags & MSG_WAITALL) -			return -EINVAL; -		if (req->opcode == IORING_OP_RECV && sr->len) -			return -EINVAL; -		req->flags |= REQ_F_APOLL_MULTISHOT; +	if (req->flags & REQ_F_BUFFER_SELECT) {  		/*  		 * Store the buffer group for this multishot receive separately,  		 * as if we end up doing an io-wq based issue that selects a @@ -665,6 +782,20 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  		 * restore it.  		 */  		sr->buf_group = req->buf_index; +		req->buf_list = NULL; +	} +	if (sr->flags & IORING_RECV_MULTISHOT) { +		if (!(req->flags & REQ_F_BUFFER_SELECT)) +			return -EINVAL; +		if (sr->msg_flags & MSG_WAITALL) +			return -EINVAL; +		if (req->opcode == IORING_OP_RECV && sr->len) +			return -EINVAL; +		req->flags |= REQ_F_APOLL_MULTISHOT; +	} +	if (sr->flags & IORING_RECVSEND_BUNDLE) { +		if (req->opcode == IORING_OP_RECVMSG) +			return -EINVAL;  	}  #ifdef CONFIG_COMPAT @@ -672,17 +803,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  		sr->msg_flags |= MSG_CMSG_COMPAT;  #endif  	sr->nr_multishot_loops = 0; -	return 0; -} - -static inline void io_recv_prep_retry(struct io_kiocb *req) -{ -	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - -	req->flags &= ~REQ_F_BL_EMPTY; -	sr->done_io = 0; -	sr->len = 0; /* get from the provided buffer */ -	req->buf_index = sr->buf_group; +	return io_recvmsg_prep_setup(req);  }  /* @@ -692,28 +813,36 @@ static inline void io_recv_prep_retry(struct io_kiocb *req)   * again (for multishot).   */  static inline bool io_recv_finish(struct io_kiocb *req, int *ret, -				  struct msghdr *msg, bool mshot_finished, -				  unsigned issue_flags) +				  struct io_async_msghdr *kmsg, +				  bool mshot_finished, unsigned issue_flags)  { +	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);  	unsigned int cflags; -	cflags = io_put_kbuf(req, issue_flags); -	if (msg->msg_inq > 0) +	if (sr->flags & IORING_RECVSEND_BUNDLE) +		cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), +				      issue_flags); +	else +		cflags = io_put_kbuf(req, issue_flags); + +	if (kmsg->msg.msg_inq > 0)  		cflags |= IORING_CQE_F_SOCK_NONEMPTY; +	/* bundle with no more immediate buffers, we're done */ +	if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY) +		goto finish; +  	/*  	 * Fill CQE for this receive and see if we should keep trying to  	 * receive from this socket.  	 
*/  	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && -	    io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, -				*ret, cflags | IORING_CQE_F_MORE)) { -		struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); +	    io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {  		int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; -		io_recv_prep_retry(req); +		io_mshot_prep_retry(req, kmsg);  		/* Known not-empty or unknown state, retry */ -		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq < 0) { +		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {  			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)  				return false;  			/* mshot retries exceeded, force a requeue */ @@ -728,12 +857,14 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,  	}  	/* Finish the request / stop multishot. */ +finish:  	io_req_set_res(req, *ret, cflags);  	if (issue_flags & IO_URING_F_MULTISHOT)  		*ret = IOU_STOP_MULTISHOT;  	else  		*ret = IOU_OK; +	io_req_msg_cleanup(req, issue_flags);  	return true;  } @@ -824,7 +955,7 @@ static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,  int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); -	struct io_async_msghdr iomsg, *kmsg; +	struct io_async_msghdr *kmsg = req->async_data;  	struct socket *sock;  	unsigned flags;  	int ret, min_ret = 0; @@ -835,18 +966,9 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)  	if (unlikely(!sock))  		return -ENOTSOCK; -	if (req_has_async_data(req)) { -		kmsg = req->async_data; -	} else { -		ret = io_recvmsg_copy_hdr(req, &iomsg); -		if (ret) -			return ret; -		kmsg = &iomsg; -	} -  	if (!(req->flags & REQ_F_POLLED) &&  	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) -		return io_setup_async_msg(req, kmsg, issue_flags); +		return -EAGAIN;  	flags = sr->msg_flags;  	if (force_nonblock) @@ -888,17 +1010,16 @@ retry_multishot:  	if (ret < min_ret) {  		if (ret == -EAGAIN && force_nonblock) { -			ret = io_setup_async_msg(req, kmsg, issue_flags); -			if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) { +			if (issue_flags & IO_URING_F_MULTISHOT) {  				io_kbuf_recycle(req, issue_flags);  				return IOU_ISSUE_SKIP_COMPLETE;  			} -			return ret; +			return -EAGAIN;  		}  		if (ret > 0 && io_net_retry(sock, flags)) {  			sr->done_io += ret;  			req->flags |= REQ_F_BL_NO_RECYCLE; -			return io_setup_async_msg(req, kmsg, issue_flags); +			return -EAGAIN;  		}  		if (ret == -ERESTARTSYS)  			ret = -EINTR; @@ -914,21 +1035,79 @@ retry_multishot:  	else  		io_kbuf_recycle(req, issue_flags); -	if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags)) +	if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))  		goto retry_multishot; -	if (mshot_finished) -		io_req_msg_cleanup(req, kmsg, issue_flags); -	else if (ret == -EAGAIN) -		return io_setup_async_msg(req, kmsg, issue_flags); -  	return ret;  } +static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg, +			      size_t *len, unsigned int issue_flags) +{ +	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); +	int ret; + +	/* +	 * If the ring isn't locked, then don't use the peek interface +	 * to grab multiple buffers as we will lock/unlock between +	 * this selection and posting the buffers. 
+	 */ +	if (!(issue_flags & IO_URING_F_UNLOCKED) && +	    sr->flags & IORING_RECVSEND_BUNDLE) { +		struct buf_sel_arg arg = { +			.iovs = &kmsg->fast_iov, +			.nr_iovs = 1, +			.mode = KBUF_MODE_EXPAND, +		}; + +		if (kmsg->free_iov) { +			arg.nr_iovs = kmsg->free_iov_nr; +			arg.iovs = kmsg->free_iov; +			arg.mode |= KBUF_MODE_FREE; +		} + +		if (kmsg->msg.msg_inq > 0) +			arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); + +		ret = io_buffers_peek(req, &arg); +		if (unlikely(ret < 0)) +			return ret; + +		/* special case 1 vec, can be a fast path */ +		if (ret == 1) { +			sr->buf = arg.iovs[0].iov_base; +			sr->len = arg.iovs[0].iov_len; +			goto map_ubuf; +		} +		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, +				arg.out_len); +		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { +			kmsg->free_iov_nr = ret; +			kmsg->free_iov = arg.iovs; +		} +	} else { +		void __user *buf; + +		*len = sr->len; +		buf = io_buffer_select(req, len, issue_flags); +		if (!buf) +			return -ENOBUFS; +		sr->buf = buf; +		sr->len = *len; +map_ubuf: +		ret = import_ubuf(ITER_DEST, sr->buf, sr->len, +				  &kmsg->msg.msg_iter); +		if (unlikely(ret)) +			return ret; +	} + +	return 0; +} +  int io_recv(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); -	struct msghdr msg; +	struct io_async_msghdr *kmsg = req->async_data;  	struct socket *sock;  	unsigned flags;  	int ret, min_ret = 0; @@ -943,40 +1122,25 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)  	if (unlikely(!sock))  		return -ENOTSOCK; -	msg.msg_name = NULL; -	msg.msg_namelen = 0; -	msg.msg_control = NULL; -	msg.msg_get_inq = 1; -	msg.msg_controllen = 0; -	msg.msg_iocb = NULL; -	msg.msg_ubuf = NULL; -  	flags = sr->msg_flags;  	if (force_nonblock)  		flags |= MSG_DONTWAIT;  retry_multishot:  	if (io_do_buffer_select(req)) { -		void __user *buf; - -		buf = io_buffer_select(req, &len, issue_flags); -		if (!buf) -			return -ENOBUFS; -		sr->buf = buf; -		sr->len = len; +		ret = io_recv_buf_select(req, kmsg, &len, issue_flags); +		if (unlikely(ret)) +			goto out_free; +		sr->buf = NULL;  	} -	ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter); -	if (unlikely(ret)) -		goto out_free; - -	msg.msg_inq = -1; -	msg.msg_flags = 0; +	kmsg->msg.msg_inq = -1; +	kmsg->msg.msg_flags = 0;  	if (flags & MSG_WAITALL) -		min_ret = iov_iter_count(&msg.msg_iter); +		min_ret = iov_iter_count(&kmsg->msg.msg_iter); -	ret = sock_recvmsg(sock, &msg, flags); +	ret = sock_recvmsg(sock, &kmsg->msg, flags);  	if (ret < min_ret) {  		if (ret == -EAGAIN && force_nonblock) {  			if (issue_flags & IO_URING_F_MULTISHOT) { @@ -996,7 +1160,7 @@ retry_multishot:  		if (ret == -ERESTARTSYS)  			ret = -EINTR;  		req_set_fail(req); -	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { +	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {  out_free:  		req_set_fail(req);  	} @@ -1008,7 +1172,7 @@ out_free:  	else  		io_kbuf_recycle(req, issue_flags); -	if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags)) +	if (!io_recv_finish(req, &ret, kmsg, ret <= 0, issue_flags))  		goto retry_multishot;  	return ret; @@ -1017,14 +1181,10 @@ out_free:  void io_send_zc_cleanup(struct io_kiocb *req)  {  	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); -	struct io_async_msghdr *io; +	struct io_async_msghdr *io = req->async_data; -	if (req_has_async_data(req)) { -		io = req->async_data; -		/* might be 
->fast_iov if *msg_copy_hdr failed */ -		if (io->free_iov != io->fast_iov) -			kfree(io->free_iov); -	} +	if (req_has_async_data(req)) +		io_netmsg_iovec_free(io);  	if (zc->notif) {  		io_notif_flush(zc->notif);  		zc->notif = NULL; @@ -1041,6 +1201,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	struct io_kiocb *notif;  	zc->done_io = 0; +	req->flags |= REQ_F_POLL_NO_LAZY;  	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))  		return -EINVAL; @@ -1061,8 +1222,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  		if (zc->flags & ~IO_ZC_FLAGS_VALID)  			return -EINVAL;  		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { -			io_notif_set_extended(notif); -			io_notif_to_data(notif)->zc_report = true; +			struct io_notif_data *nd = io_notif_to_data(notif); + +			nd->zc_report = true; +			nd->zc_used = false; +			nd->zc_copied = false;  		}  	} @@ -1090,7 +1254,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));  	zc->len = READ_ONCE(sqe->len); -	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; +	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;  	if (zc->msg_flags & MSG_DONTWAIT)  		req->flags |= REQ_F_NOWAIT; @@ -1098,7 +1262,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	if (req->ctx->compat)  		zc->msg_flags |= MSG_CMSG_COMPAT;  #endif -	return 0; +	return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC);  }  static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb, @@ -1159,11 +1323,34 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,  	return ret;  } +static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg) +{ +	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); +	int ret; + +	if (sr->flags & IORING_RECVSEND_FIXED_BUF) { +		ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu, +					(u64)(uintptr_t)sr->buf, sr->len); +		if (unlikely(ret)) +			return ret; +		kmsg->msg.sg_from_iter = io_sg_from_iter; +	} else { +		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); +		if (unlikely(ret)) +			return ret; +		ret = io_notif_account_mem(sr->notif, sr->len); +		if (unlikely(ret)) +			return ret; +		kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; +	} + +	return ret; +} +  int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)  { -	struct sockaddr_storage __address;  	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); -	struct msghdr msg; +	struct io_async_msghdr *kmsg = req->async_data;  	struct socket *sock;  	unsigned msg_flags;  	int ret, min_ret = 0; @@ -1174,67 +1361,37 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)  	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))  		return -EOPNOTSUPP; -	msg.msg_name = NULL; -	msg.msg_control = NULL; -	msg.msg_controllen = 0; -	msg.msg_namelen = 0; - -	if (zc->addr) { -		if (req_has_async_data(req)) { -			struct io_async_msghdr *io = req->async_data; - -			msg.msg_name = &io->addr; -		} else { -			ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address); -			if (unlikely(ret < 0)) -				return ret; -			msg.msg_name = (struct sockaddr *)&__address; -		} -		msg.msg_namelen = zc->addr_len; -	} -  	if (!(req->flags & REQ_F_POLLED) &&  	    (zc->flags & IORING_RECVSEND_POLL_FIRST)) -		return io_setup_async_addr(req, &__address, issue_flags); +		return -EAGAIN; -	if (zc->flags & 
IORING_RECVSEND_FIXED_BUF) { -		ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu, -					(u64)(uintptr_t)zc->buf, zc->len); -		if (unlikely(ret)) -			return ret; -		msg.sg_from_iter = io_sg_from_iter; -	} else { -		io_notif_set_extended(zc->notif); -		ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter); -		if (unlikely(ret)) -			return ret; -		ret = io_notif_account_mem(zc->notif, zc->len); +	if (!zc->done_io) { +		ret = io_send_zc_import(req, kmsg);  		if (unlikely(ret))  			return ret; -		msg.sg_from_iter = io_sg_from_iter_iovec;  	} -	msg_flags = zc->msg_flags | MSG_ZEROCOPY; +	msg_flags = zc->msg_flags;  	if (issue_flags & IO_URING_F_NONBLOCK)  		msg_flags |= MSG_DONTWAIT;  	if (msg_flags & MSG_WAITALL) -		min_ret = iov_iter_count(&msg.msg_iter); +		min_ret = iov_iter_count(&kmsg->msg.msg_iter);  	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; -	msg.msg_flags = msg_flags; -	msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; -	ret = sock_sendmsg(sock, &msg); +	kmsg->msg.msg_flags = msg_flags; +	kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; +	ret = sock_sendmsg(sock, &kmsg->msg);  	if (unlikely(ret < min_ret)) {  		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) -			return io_setup_async_addr(req, &__address, issue_flags); +			return -EAGAIN; -		if (ret > 0 && io_net_retry(sock, msg.msg_flags)) { +		if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {  			zc->len -= ret;  			zc->buf += ret;  			zc->done_io += ret;  			req->flags |= REQ_F_BL_NO_RECYCLE; -			return io_setup_async_addr(req, &__address, issue_flags); +			return -EAGAIN;  		}  		if (ret == -ERESTARTSYS)  			ret = -EINTR; @@ -1252,7 +1409,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)  	 */  	if (!(issue_flags & IO_URING_F_UNLOCKED)) {  		io_notif_flush(zc->notif); -		req->flags &= ~REQ_F_NEED_CLEANUP; +		io_req_msg_cleanup(req, 0);  	}  	io_req_set_res(req, ret, IORING_CQE_F_MORE);  	return IOU_OK; @@ -1261,62 +1418,46 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)  int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); -	struct io_async_msghdr iomsg, *kmsg; +	struct io_async_msghdr *kmsg = req->async_data;  	struct socket *sock;  	unsigned flags;  	int ret, min_ret = 0; -	io_notif_set_extended(sr->notif); -  	sock = sock_from_file(req->file);  	if (unlikely(!sock))  		return -ENOTSOCK;  	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))  		return -EOPNOTSUPP; -	if (req_has_async_data(req)) { -		kmsg = req->async_data; -	} else { -		ret = io_sendmsg_copy_hdr(req, &iomsg); -		if (ret) -			return ret; -		kmsg = &iomsg; -	} -  	if (!(req->flags & REQ_F_POLLED) &&  	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) -		return io_setup_async_msg(req, kmsg, issue_flags); +		return -EAGAIN; -	flags = sr->msg_flags | MSG_ZEROCOPY; +	flags = sr->msg_flags;  	if (issue_flags & IO_URING_F_NONBLOCK)  		flags |= MSG_DONTWAIT;  	if (flags & MSG_WAITALL)  		min_ret = iov_iter_count(&kmsg->msg.msg_iter); +	kmsg->msg.msg_control_user = sr->msg_control;  	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;  	kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;  	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);  	if (unlikely(ret < min_ret)) {  		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) -			return io_setup_async_msg(req, kmsg, issue_flags); +			return -EAGAIN;  		if (ret > 0 && io_net_retry(sock, flags)) {  			sr->done_io += ret;  			req->flags |= REQ_F_BL_NO_RECYCLE; -			
return io_setup_async_msg(req, kmsg, issue_flags); +			return -EAGAIN;  		}  		if (ret == -ERESTARTSYS)  			ret = -EINTR;  		req_set_fail(req);  	} -	/* fast path, check for non-NULL to avoid function call */ -	if (kmsg->free_iov) { -		kfree(kmsg->free_iov); -		kmsg->free_iov = NULL; -	} -	io_netmsg_recycle(req, issue_flags);  	if (ret >= 0)  		ret += sr->done_io;  	else if (sr->done_io) @@ -1328,7 +1469,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)  	 */  	if (!(issue_flags & IO_URING_F_UNLOCKED)) {  		io_notif_flush(sr->notif); -		req->flags &= ~REQ_F_NEED_CLEANUP; +		io_req_msg_cleanup(req, 0);  	}  	io_req_set_res(req, ret, IORING_CQE_F_MORE);  	return IOU_OK; @@ -1346,10 +1487,12 @@ void io_sendrecv_fail(struct io_kiocb *req)  		req->cqe.flags |= IORING_CQE_F_MORE;  } +#define ACCEPT_FLAGS	(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \ +			 IORING_ACCEPT_POLL_FIRST) +  int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); -	unsigned flags;  	if (sqe->len || sqe->buf_index)  		return -EINVAL; @@ -1358,15 +1501,15 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));  	accept->flags = READ_ONCE(sqe->accept_flags);  	accept->nofile = rlimit(RLIMIT_NOFILE); -	flags = READ_ONCE(sqe->ioprio); -	if (flags & ~IORING_ACCEPT_MULTISHOT) +	accept->iou_flags = READ_ONCE(sqe->ioprio); +	if (accept->iou_flags & ~ACCEPT_FLAGS)  		return -EINVAL;  	accept->file_slot = READ_ONCE(sqe->file_index);  	if (accept->file_slot) {  		if (accept->flags & SOCK_CLOEXEC)  			return -EINVAL; -		if (flags & IORING_ACCEPT_MULTISHOT && +		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&  		    accept->file_slot != IORING_FILE_INDEX_ALLOC)  			return -EINVAL;  	} @@ -1374,8 +1517,10 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  		return -EINVAL;  	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))  		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; -	if (flags & IORING_ACCEPT_MULTISHOT) +	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)  		req->flags |= REQ_F_APOLL_MULTISHOT; +	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT) +		req->flags |= REQ_F_NOWAIT;  	return 0;  } @@ -1383,24 +1528,34 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; -	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;  	bool fixed = !!accept->file_slot; +	struct proto_accept_arg arg = { +		.flags = force_nonblock ? 
O_NONBLOCK : 0, +	};  	struct file *file; +	unsigned cflags;  	int ret, fd; +	if (!(req->flags & REQ_F_POLLED) && +	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST) +		return -EAGAIN; +  retry:  	if (!fixed) {  		fd = __get_unused_fd_flags(accept->flags, accept->nofile);  		if (unlikely(fd < 0))  			return fd;  	} -	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, +	arg.err = 0; +	arg.is_empty = -1; +	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,  			 accept->flags);  	if (IS_ERR(file)) {  		if (!fixed)  			put_unused_fd(fd);  		ret = PTR_ERR(file); -		if (ret == -EAGAIN && force_nonblock) { +		if (ret == -EAGAIN && force_nonblock && +		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) {  			/*  			 * if it's multishot and polled, we don't need to  			 * return EAGAIN to arm the poll infra since it @@ -1421,18 +1576,26 @@ retry:  						accept->file_slot);  	} +	cflags = 0; +	if (!arg.is_empty) +		cflags |= IORING_CQE_F_SOCK_NONEMPTY; +  	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { -		io_req_set_res(req, ret, 0); +		io_req_set_res(req, ret, cflags);  		return IOU_OK;  	}  	if (ret < 0)  		return ret; -	if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, -				ret, IORING_CQE_F_MORE)) -		goto retry; +	if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { +		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) +			goto retry; +		if (issue_flags & IO_URING_F_MULTISHOT) +			return IOU_ISSUE_SKIP_COMPLETE; +		return -EAGAIN; +	} -	io_req_set_res(req, ret, 0); +	io_req_set_res(req, ret, cflags);  	return IOU_STOP_MULTISHOT;  } @@ -1490,17 +1653,10 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags)  	return IOU_OK;  } -int io_connect_prep_async(struct io_kiocb *req) -{ -	struct io_async_connect *io = req->async_data; -	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); - -	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); -} -  int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); +	struct io_async_msghdr *io;  	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)  		return -EINVAL; @@ -1508,32 +1664,26 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));  	conn->addr_len =  READ_ONCE(sqe->addr2);  	conn->in_progress = conn->seen_econnaborted = false; -	return 0; + +	io = io_msg_alloc_async(req); +	if (unlikely(!io)) +		return -ENOMEM; + +	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);  }  int io_connect(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect); -	struct io_async_connect __io, *io; +	struct io_async_msghdr *io = req->async_data;  	unsigned file_flags;  	int ret;  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; -	if (req_has_async_data(req)) { -		io = req->async_data; -	} else { -		ret = move_addr_to_kernel(connect->addr, -						connect->addr_len, -						&__io.address); -		if (ret) -			goto out; -		io = &__io; -	} -  	file_flags = force_nonblock ? 
O_NONBLOCK : 0; -	ret = __sys_connect_file(req->file, &io->address, -					connect->addr_len, file_flags); +	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len, +				 file_flags);  	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)  	    && force_nonblock) {  		if (ret == -EINPROGRESS) { @@ -1543,13 +1693,6 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)  				goto out;  			connect->seen_econnaborted = true;  		} -		if (req_has_async_data(req)) -			return -EAGAIN; -		if (io_alloc_async_data(req)) { -			ret = -ENOMEM; -			goto out; -		} -		memcpy(req->async_data, &__io, sizeof(__io));  		return -EAGAIN;  	}  	if (connect->in_progress) { @@ -1567,12 +1710,20 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)  out:  	if (ret < 0)  		req_set_fail(req); +	io_req_msg_cleanup(req, issue_flags);  	io_req_set_res(req, ret, 0);  	return IOU_OK;  }  -void io_netmsg_cache_free(struct io_cache_entry *entry) +void io_netmsg_cache_free(const void *entry)  { -	kfree(container_of(entry, struct io_async_msghdr, cache)); +	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; + +	if (kmsg->free_iov) { +		kasan_mempool_unpoison_object(kmsg->free_iov, +				kmsg->free_iov_nr * sizeof(struct iovec)); +		io_netmsg_iovec_free(kmsg); +	} +	kfree(kmsg);  }  #endif
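
A note on the caching change that runs through the whole diff: io_async_msghdr now owns its imported iovec via free_iov/free_iov_nr, and io_netmsg_recycle() parks the header, iovec included, in ctx->netmsg_cache instead of freeing it, with kasan_mempool_poison_object()/kasan_mempool_unpoison_object() guarding the memory while it sits idle. Below is a rough userspace model of that recycling scheme, with a single-slot cache, the KASAN calls reduced to comments, and every name invented for the sketch:

#include <assert.h>
#include <stdlib.h>
#include <sys/uio.h>

struct async_msghdr {
	struct iovec *free_iov;	/* grown iovec, kept across requests */
	int free_iov_nr;	/* capacity of free_iov, in segments */
};

static struct async_msghdr *cache_slot;	/* single-entry stand-in for netmsg_cache */

static struct async_msghdr *msg_alloc(void)
{
	struct async_msghdr *hdr = cache_slot;

	if (hdr) {	/* cache hit: reuse the header and its iovec */
		cache_slot = NULL;
		/* kernel: kasan_mempool_unpoison_object(hdr->free_iov, ...) */
		return hdr;
	}
	return calloc(1, sizeof(*hdr));
}

static void msg_recycle(struct async_msghdr *hdr)
{
	if (!cache_slot) {	/* park it, iovec and all */
		/* kernel: kasan_mempool_poison_object(hdr->free_iov) */
		cache_slot = hdr;
		return;
	}
	/* cache full: really free, as io_netmsg_iovec_free() would */
	free(hdr->free_iov);
	free(hdr);
}

/* adopt a freshly imported iovec, mirroring io_net_vec_assign() */
static void msg_assign_iov(struct async_msghdr *hdr, struct iovec *iov, int nr)
{
	free(hdr->free_iov);
	hdr->free_iov = iov;
	hdr->free_iov_nr = nr;
}

int main(void)
{
	struct async_msghdr *hdr = msg_alloc();

	msg_assign_iov(hdr, calloc(8, sizeof(struct iovec)), 8);
	msg_recycle(hdr);

	/* the next request starts with the 8-segment array already in place */
	hdr = msg_alloc();
	assert(hdr->free_iov_nr == 8);
	msg_recycle(hdr);
	return 0;
}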
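The short-transfer walk in io_bundle_nbufs() is self-contained enough to check in userspace: given the iovec array a bundle was built from and the byte count the socket actually moved, it counts how many provided buffers were touched so io_put_kbufs() can release exactly that many. A minimal model of that walk follows; bundle_nbufs here is an illustrative stand-in, not the kernel function:

#include <assert.h>
#include <stddef.h>
#include <sys/uio.h>

/*
 * Charge 'bytes' against successive segments and report how many
 * buffers were fully or partially consumed.
 */
static int bundle_nbufs(const struct iovec *iov, int bytes)
{
	int nbufs = 0;

	if (bytes <= 0)
		return 0;
	do {
		int this_len = iov[nbufs].iov_len < (size_t)bytes ?
				(int)iov[nbufs].iov_len : bytes;

		nbufs++;
		bytes -= this_len;
	} while (bytes);
	return nbufs;
}

int main(void)
{
	struct iovec iov[3] = {
		{ .iov_base = 0, .iov_len = 4096 },
		{ .iov_base = 0, .iov_len = 4096 },
		{ .iov_base = 0, .iov_len = 4096 },
	};

	assert(bundle_nbufs(iov, 4096) == 1);	/* exactly one buffer */
	assert(bundle_nbufs(iov, 5000) == 2);	/* second buffer partially used */
	assert(bundle_nbufs(iov, 12288) == 3);	/* all three consumed */
	return 0;
}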
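Seen from userspace, IORING_RECVSEND_BUNDLE rides in sqe->ioprio next to buffer selection, and a single CQE can then cover several provided buffers, with the id of the first one reported in cqe->flags. Here is a sketch of a bundle receive using liburing's buffer-ring helpers; it assumes liburing 2.6+ headers that define the flag, a kernel carrying this series, and trims error handling to keep the shape visible:

#include <liburing.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#define NBUFS	8
#define BUFSZ	256

int main(void)
{
	static char bufs[NBUFS][BUFSZ], payload[3 * BUFSZ];
	struct io_uring_buf_ring *br;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	int fds[2], i, ret;

	socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
	io_uring_queue_init(8, &ring, 0);

	/* provide NBUFS buffers in group 0 through a buffer ring */
	br = io_uring_setup_buf_ring(&ring, NBUFS, 0, 0, &ret);
	for (i = 0; i < NBUFS; i++)
		io_uring_buf_ring_add(br, bufs[i], BUFSZ, i,
				      io_uring_buf_ring_mask(NBUFS), i);
	io_uring_buf_ring_advance(br, NBUFS);

	/* queue up more data than a single buffer can hold */
	memset(payload, 'x', sizeof(payload));
	write(fds[1], payload, sizeof(payload));

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_recv(sqe, fds[0], NULL, 0, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = 0;
	sqe->ioprio |= IORING_RECVSEND_BUNDLE;	/* one CQE, several buffers */

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	/* res spans the buffers used, starting from the bid in cqe->flags */
	printf("got %d bytes, first bid %u\n", cqe->res,
	       cqe->flags >> IORING_CQE_BUFFER_SHIFT);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}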
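On the accept side, IORING_ACCEPT_DONTWAIT and IORING_ACCEPT_POLL_FIRST join IORING_ACCEPT_MULTISHOT in sqe->ioprio, and do_accept() now reports backlog state through struct proto_accept_arg so completions can carry IORING_CQE_F_SOCK_NONEMPTY. A hedged liburing fragment follows, not a full program: listen_fd is assumed to be a listening socket created elsewhere, and the flag names require similarly new headers.

#include <liburing.h>
#include <stdio.h>

/*
 * Submit one multishot accept and drain its CQEs. Each CQE with
 * IORING_CQE_F_MORE set delivers one accepted fd and leaves the
 * request armed; IORING_CQE_F_SOCK_NONEMPTY hints that more
 * connections are already queued.
 */
static int accept_loop(struct io_uring *ring, int listen_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;

	io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
	sqe->ioprio |= IORING_ACCEPT_POLL_FIRST;	/* skip the initial inline attempt */
	io_uring_submit(ring);

	for (;;) {
		unsigned flags;
		int res;

		if (io_uring_wait_cqe(ring, &cqe))
			return -1;
		res = cqe->res;
		flags = cqe->flags;
		io_uring_cqe_seen(ring, cqe);

		if (res < 0)
			return res;	/* multishot terminated with an error */
		printf("accepted fd %d%s\n", res,
		       (flags & IORING_CQE_F_SOCK_NONEMPTY) ?
		       " (backlog not empty)" : "");
		if (!(flags & IORING_CQE_F_MORE))
			return 0;	/* kernel stopped the multishot; re-arm if desired */
	}
}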