Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--   fs/io_uring.c | 2401
1 file changed, 1594 insertions(+), 807 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index fb2a0cb4aaf8..496a2af7d12c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -57,12 +57,13 @@  #include <linux/mman.h>  #include <linux/percpu.h>  #include <linux/slab.h> -#include <linux/blkdev.h> +#include <linux/blk-mq.h>  #include <linux/bvec.h>  #include <linux/net.h>  #include <net/sock.h>  #include <net/af_unix.h>  #include <net/scm.h> +#include <net/busy_poll.h>  #include <linux/anon_inodes.h>  #include <linux/sched/mm.h>  #include <linux/uaccess.h> @@ -108,7 +109,8 @@  #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \  			  IOSQE_IO_HARDLINK | IOSQE_ASYNC) -#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN) +#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ +			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)  #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \  				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ @@ -262,11 +264,18 @@ struct io_rsrc_data {  	bool				quiesce;  }; +struct io_buffer_list { +	struct list_head list; +	struct list_head buf_list; +	__u16 bgid; +}; +  struct io_buffer {  	struct list_head list;  	__u64 addr;  	__u32 len;  	__u16 bid; +	__u16 bgid;  };  struct io_restriction { @@ -320,10 +329,19 @@ struct io_submit_state {  	bool			plug_started;  	bool			need_plug; +	bool			flush_cqes;  	unsigned short		submit_nr;  	struct blk_plug		plug;  }; +struct io_ev_fd { +	struct eventfd_ctx	*cq_ev_fd; +	unsigned int		eventfd_async: 1; +	struct rcu_head		rcu; +}; + +#define IO_BUFFERS_HASH_BITS	5 +  struct io_ring_ctx {  	/* const or read-mostly hot data */  	struct { @@ -333,10 +351,11 @@ struct io_ring_ctx {  		unsigned int		flags;  		unsigned int		compat: 1;  		unsigned int		drain_next: 1; -		unsigned int		eventfd_async: 1;  		unsigned int		restricted: 1;  		unsigned int		off_timeout_used: 1;  		unsigned int		drain_active: 1; +		unsigned int		drain_disabled: 1; +		unsigned int		has_evfd: 1;  	} ____cacheline_aligned_in_smp;  	/* submission data */ @@ -375,7 +394,9 @@ struct io_ring_ctx {  		struct list_head	timeout_list;  		struct list_head	ltimeout_list;  		struct list_head	cq_overflow_list; -		struct xarray		io_buffers; +		struct list_head	*io_buffers; +		struct list_head	io_buffers_cache; +		struct list_head	apoll_cache;  		struct xarray		personalities;  		u32			pers_next;  		unsigned		sq_thread_idle; @@ -392,11 +413,16 @@ struct io_ring_ctx {  	struct list_head	sqd_list;  	unsigned long		check_cq_overflow; +#ifdef CONFIG_NET_RX_BUSY_POLL +	/* used to track busy poll napi_id */ +	struct list_head	napi_list; +	spinlock_t		napi_lock;	/* napi_list lock */ +#endif  	struct {  		unsigned		cached_cq_tail;  		unsigned		cq_entries; -		struct eventfd_ctx	*cq_ev_fd; +		struct io_ev_fd	__rcu	*io_ev_fd;  		struct wait_queue_head	cq_wait;  		unsigned		cq_extra;  		atomic_t		cq_timeouts; @@ -418,6 +444,8 @@ struct io_ring_ctx {  		struct hlist_head	*cancel_hash;  		unsigned		cancel_hash_bits;  		bool			poll_multi_queue; + +		struct list_head	io_buffers_comp;  	} ____cacheline_aligned_in_smp;  	struct io_restriction		restrictions; @@ -433,6 +461,8 @@ struct io_ring_ctx {  		struct llist_head		rsrc_put_llist;  		struct list_head		rsrc_ref_list;  		spinlock_t			rsrc_ref_lock; + +		struct list_head	io_buffers_pages;  	};  	/* Keep this last, we don't need it for the fast path */ @@ -458,6 +488,11 @@ struct io_ring_ctx {  	};  }; +/* + * Arbitrary limit, can be raised if need be + */ +#define IO_RINGFD_REG_MAX 16 +  struct io_uring_task {  	/* submission side 
*/  	int			cached_refs; @@ -471,7 +506,9 @@ struct io_uring_task {  	spinlock_t		task_lock;  	struct io_wq_work_list	task_list; +	struct io_wq_work_list	prior_task_list;  	struct callback_head	task_work; +	struct file		**registered_rings;  	bool			task_running;  }; @@ -483,8 +520,6 @@ struct io_poll_iocb {  	struct file			*file;  	struct wait_queue_head		*head;  	__poll_t			events; -	bool				done; -	bool				canceled;  	struct wait_queue_entry		wait;  }; @@ -640,7 +675,7 @@ struct io_statx {  	int				dfd;  	unsigned int			mask;  	unsigned int			flags; -	const char __user		*filename; +	struct filename			*filename;  	struct statx __user		*buffer;  }; @@ -688,6 +723,12 @@ struct io_hardlink {  	int				flags;  }; +struct io_msg { +	struct file			*file; +	u64 user_data; +	u32 len; +}; +  struct io_async_connect {  	struct sockaddr_storage		address;  }; @@ -721,6 +762,7 @@ enum {  	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,  	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,  	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT, +	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,  	/* first byte is taken by user flags, shift it to not overlap */  	REQ_F_FAIL_BIT		= 8, @@ -737,6 +779,9 @@ enum {  	REQ_F_REFCOUNT_BIT,  	REQ_F_ARM_LTIMEOUT_BIT,  	REQ_F_ASYNC_DATA_BIT, +	REQ_F_SKIP_LINK_CQES_BIT, +	REQ_F_SINGLE_POLL_BIT, +	REQ_F_DOUBLE_POLL_BIT,  	/* keep async read/write and isreg together and in order */  	REQ_F_SUPPORT_NOWAIT_BIT,  	REQ_F_ISREG_BIT, @@ -758,6 +803,8 @@ enum {  	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),  	/* IOSQE_BUFFER_SELECT */  	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT), +	/* IOSQE_CQE_SKIP_SUCCESS */ +	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),  	/* fail rest of links */  	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT), @@ -791,6 +838,12 @@ enum {  	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),  	/* ->async_data allocated */  	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT), +	/* don't post CQEs while failing linked requests */ +	REQ_F_SKIP_LINK_CQES	= BIT(REQ_F_SKIP_LINK_CQES_BIT), +	/* single poll may be active */ +	REQ_F_SINGLE_POLL	= BIT(REQ_F_SINGLE_POLL_BIT), +	/* double poll may active */ +	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),  };  struct async_poll { @@ -817,7 +870,7 @@ enum {   * NOTE! Each of the iocb union members has the file pointer   * as the first entry in their struct definition. So you can   * access the file pointer through any of the sub-structs, - * or directly as just 'ki_filp' in this struct. + * or directly as just 'file' in this struct.   */  struct io_kiocb {  	union { @@ -847,6 +900,7 @@ struct io_kiocb {  		struct io_mkdir		mkdir;  		struct io_symlink	symlink;  		struct io_hardlink	hardlink; +		struct io_msg		msg;  	};  	u8				opcode; @@ -869,6 +923,7 @@ struct io_kiocb {  	/* used by request caches, completion batching and iopoll */  	struct io_wq_work_node		comp_list;  	atomic_t			refs; +	atomic_t			poll_refs;  	struct io_kiocb			*link;  	struct io_task_work		io_task_work;  	/* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ @@ -877,11 +932,11 @@ struct io_kiocb {  	struct async_poll		*apoll;  	/* opcode allocated if it needs to store data for async defer */  	void				*async_data; -	struct io_wq_work		work;  	/* custom credentials, valid IFF REQ_F_CREDS is set */ -	const struct cred		*creds;  	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */  	struct io_buffer		*kbuf; +	const struct cred		*creds; +	struct io_wq_work		work;  };  struct io_tctx_node { @@ -1096,6 +1151,9 @@ static const struct io_op_def io_op_defs[] = {  	[IORING_OP_MKDIRAT] = {},  	[IORING_OP_SYMLINKAT] = {},  	[IORING_OP_LINKAT] = {}, +	[IORING_OP_MSG_RING] = { +		.needs_file		= 1, +	},  };  /* requests with any of those set should undergo io_disarm_next() */ @@ -1108,8 +1166,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,  					 bool cancel_all);  static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, -				 s32 res, u32 cflags); +static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); +  static void io_put_req(struct io_kiocb *req);  static void io_put_req_deferred(struct io_kiocb *req);  static void io_dismantle_req(struct io_kiocb *req); @@ -1132,6 +1190,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,  static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);  static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); +static void io_eventfd_signal(struct io_ring_ctx *ctx);  static struct kmem_cache *req_cachep; @@ -1183,12 +1242,6 @@ static inline bool req_ref_put_and_test(struct io_kiocb *req)  	return atomic_dec_and_test(&req->refs);  } -static inline void req_ref_put(struct io_kiocb *req) -{ -	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); -	WARN_ON_ONCE(req_ref_put_and_test(req)); -} -  static inline void req_ref_get(struct io_kiocb *req)  {  	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); @@ -1264,16 +1317,88 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,  	}  } -static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) +static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)  { -	bool got = percpu_ref_tryget(ref); +	struct io_buffer *kbuf = req->kbuf; +	unsigned int cflags; -	/* already at zero, wait for ->release() */ -	if (!got) -		wait_for_completion(compl); -	percpu_ref_resurrect(ref); -	if (got) -		percpu_ref_put(ref); +	cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT); +	req->flags &= ~REQ_F_BUFFER_SELECTED; +	list_add(&kbuf->list, list); +	req->kbuf = NULL; +	return cflags; +} + +static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) +{ +	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) +		return 0; +	return __io_put_kbuf(req, &req->ctx->io_buffers_comp); +} + +static inline unsigned int io_put_kbuf(struct io_kiocb *req, +				       unsigned issue_flags) +{ +	unsigned int cflags; + +	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) +		return 0; + +	/* +	 * We can add this buffer back to two lists: +	 * +	 * 1) The io_buffers_cache list. This one is protected by the +	 *    ctx->uring_lock. If we already hold this lock, add back to this +	 *    list as we can grab it from issue as well. +	 * 2) The io_buffers_comp list. This one is protected by the +	 *    ctx->completion_lock. +	 * +	 * We migrate buffers from the comp_list to the issue cache list +	 * when we need one. 
+	 */ +	if (issue_flags & IO_URING_F_UNLOCKED) { +		struct io_ring_ctx *ctx = req->ctx; + +		spin_lock(&ctx->completion_lock); +		cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); +		spin_unlock(&ctx->completion_lock); +	} else { +		cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); +	} + +	return cflags; +} + +static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, +						 unsigned int bgid) +{ +	struct list_head *hash_list; +	struct io_buffer_list *bl; + +	hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; +	list_for_each_entry(bl, hash_list, list) +		if (bl->bgid == bgid || bgid == -1U) +			return bl; + +	return NULL; +} + +static void io_kbuf_recycle(struct io_kiocb *req) +{ +	struct io_ring_ctx *ctx = req->ctx; +	struct io_buffer_list *bl; +	struct io_buffer *buf; + +	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) +		return; + +	lockdep_assert_held(&ctx->uring_lock); + +	buf = req->kbuf; +	bl = io_buffer_get_list(ctx, buf->bgid); +	list_add(&buf->list, &bl->buf_list); +	req->flags &= ~REQ_F_BUFFER_SELECTED; +	req->kbuf = NULL;  }  static bool io_match_task(struct io_kiocb *head, struct task_struct *task, @@ -1340,6 +1465,10 @@ static inline bool req_has_async_data(struct io_kiocb *req)  static inline void req_set_fail(struct io_kiocb *req)  {  	req->flags |= REQ_F_FAIL; +	if (req->flags & REQ_F_CQE_SKIP) { +		req->flags &= ~REQ_F_CQE_SKIP; +		req->flags |= REQ_F_SKIP_LINK_CQES; +	}  }  static inline void req_fail_link_node(struct io_kiocb *req, int res) @@ -1382,7 +1511,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)  static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)  {  	struct io_ring_ctx *ctx; -	int hash_bits; +	int i, hash_bits;  	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);  	if (!ctx) @@ -1409,6 +1538,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)  	/* set invalid range, so io_import_fixed() fails meeting it */  	ctx->dummy_ubuf->ubuf = -1UL; +	ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS, +					sizeof(struct list_head), GFP_KERNEL); +	if (!ctx->io_buffers) +		goto err; +	for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) +		INIT_LIST_HEAD(&ctx->io_buffers[i]); +  	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,  			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))  		goto err; @@ -1417,14 +1553,17 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)  	init_waitqueue_head(&ctx->sqo_sq_wait);  	INIT_LIST_HEAD(&ctx->sqd_list);  	INIT_LIST_HEAD(&ctx->cq_overflow_list); +	INIT_LIST_HEAD(&ctx->io_buffers_cache); +	INIT_LIST_HEAD(&ctx->apoll_cache);  	init_completion(&ctx->ref_comp); -	xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);  	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);  	mutex_init(&ctx->uring_lock);  	init_waitqueue_head(&ctx->cq_wait);  	spin_lock_init(&ctx->completion_lock);  	spin_lock_init(&ctx->timeout_lock);  	INIT_WQ_LIST(&ctx->iopoll_list); +	INIT_LIST_HEAD(&ctx->io_buffers_pages); +	INIT_LIST_HEAD(&ctx->io_buffers_comp);  	INIT_LIST_HEAD(&ctx->defer_list);  	INIT_LIST_HEAD(&ctx->timeout_list);  	INIT_LIST_HEAD(&ctx->ltimeout_list); @@ -1437,10 +1576,15 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)  	INIT_WQ_LIST(&ctx->locked_free_list);  	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);  	INIT_WQ_LIST(&ctx->submit_state.compl_reqs); +#ifdef CONFIG_NET_RX_BUSY_POLL +	INIT_LIST_HEAD(&ctx->napi_list); +	spin_lock_init(&ctx->napi_lock); +#endif  	
return ctx;  err:  	kfree(ctx->dummy_ubuf);  	kfree(ctx->cancel_hash); +	kfree(ctx->io_buffers);  	kfree(ctx);  	return NULL;  } @@ -1553,8 +1697,11 @@ static void io_prep_async_link(struct io_kiocb *req)  static inline void io_req_add_compl_list(struct io_kiocb *req)  { -	struct io_submit_state *state = &req->ctx->submit_state; +	struct io_ring_ctx *ctx = req->ctx; +	struct io_submit_state *state = &ctx->submit_state; +	if (!(req->flags & REQ_F_CQE_SKIP)) +		ctx->submit_state.flush_cqes = true;  	wq_list_add_tail(&req->comp_list, &state->compl_reqs);  } @@ -1580,8 +1727,8 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)  	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))  		req->work.flags |= IO_WQ_WORK_CANCEL; -	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, -					&req->work, req->flags); +	trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags, +					&req->work, io_wq_is_hashed(&req->work));  	io_wq_enqueue(tctx->io_wq, &req->work);  	if (link)  		io_queue_linked_timeout(link); @@ -1599,7 +1746,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)  		atomic_set(&req->ctx->cq_timeouts,  			atomic_read(&req->ctx->cq_timeouts) + 1);  		list_del_init(&req->timeout.list); -		io_cqring_fill_event(req->ctx, req->user_data, status, 0); +		io_fill_cqe_req(req, status, 0);  		io_put_req_deferred(req);  	}  } @@ -1651,22 +1798,27 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)  	spin_unlock_irq(&ctx->timeout_lock);  } -static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx) -{ -	if (ctx->off_timeout_used) -		io_flush_timeouts(ctx); -	if (ctx->drain_active) -		io_queue_deferred(ctx); -} -  static inline void io_commit_cqring(struct io_ring_ctx *ctx)  { -	if (unlikely(ctx->off_timeout_used || ctx->drain_active)) -		__io_commit_cqring_flush(ctx);  	/* order cqe stores with ring update */  	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);  } +static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +{ +	if (ctx->off_timeout_used || ctx->drain_active) { +		spin_lock(&ctx->completion_lock); +		if (ctx->off_timeout_used) +			io_flush_timeouts(ctx); +		if (ctx->drain_active) +			io_queue_deferred(ctx); +		io_commit_cqring(ctx); +		spin_unlock(&ctx->completion_lock); +	} +	if (ctx->has_evfd) +		io_eventfd_signal(ctx); +} +  static inline bool io_sqring_full(struct io_ring_ctx *ctx)  {  	struct io_rings *r = ctx->rings; @@ -1696,23 +1848,34 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)  	return &rings->cqes[tail & mask];  } -static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) +static void io_eventfd_signal(struct io_ring_ctx *ctx)  { -	if (likely(!ctx->cq_ev_fd)) -		return false; +	struct io_ev_fd *ev_fd; + +	rcu_read_lock(); +	/* +	 * rcu_dereference ctx->io_ev_fd once and use it for both for checking +	 * and eventfd_signal +	 */ +	ev_fd = rcu_dereference(ctx->io_ev_fd); + +	/* +	 * Check again if ev_fd exists incase an io_eventfd_unregister call +	 * completed between the NULL check of ctx->io_ev_fd at the start of +	 * the function and rcu_read_lock. 
+	 */ +	if (unlikely(!ev_fd)) +		goto out;  	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) -		return false; -	return !ctx->eventfd_async || io_wq_current_is_worker(); +		goto out; + +	if (!ev_fd->eventfd_async || io_wq_current_is_worker()) +		eventfd_signal(ev_fd->cq_ev_fd, 1); +out: +	rcu_read_unlock();  } -/* - * This should only get called when at least one event has been posted. - * Some applications rely on the eventfd notification count only changing - * IFF a new CQE has been added to the CQ ring. There's no depedency on - * 1:1 relationship between how many times this function is called (and - * hence the eventfd count) and number of CQEs posted to the CQ ring. - */ -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +static inline void io_cqring_wake(struct io_ring_ctx *ctx)  {  	/*  	 * wake_up_all() may seem excessive, but io_wake_function() and @@ -1721,21 +1884,32 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)  	 */  	if (wq_has_sleeper(&ctx->cq_wait))  		wake_up_all(&ctx->cq_wait); -	if (io_should_trigger_evfd(ctx)) -		eventfd_signal(ctx->cq_ev_fd, 1); +} + +/* + * This should only get called when at least one event has been posted. + * Some applications rely on the eventfd notification count only changing + * IFF a new CQE has been added to the CQ ring. There's no depedency on + * 1:1 relationship between how many times this function is called (and + * hence the eventfd count) and number of CQEs posted to the CQ ring. + */ +static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) +{ +	if (unlikely(ctx->off_timeout_used || ctx->drain_active || +		     ctx->has_evfd)) +		__io_commit_cqring_flush(ctx); + +	io_cqring_wake(ctx);  }  static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)  { -	/* see waitqueue_active() comment */ -	smp_mb(); +	if (unlikely(ctx->off_timeout_used || ctx->drain_active || +		     ctx->has_evfd)) +		__io_commit_cqring_flush(ctx); -	if (ctx->flags & IORING_SETUP_SQPOLL) { -		if (waitqueue_active(&ctx->cq_wait)) -			wake_up_all(&ctx->cq_wait); -	} -	if (io_should_trigger_evfd(ctx)) -		eventfd_signal(ctx->cq_ev_fd, 1); +	if (ctx->flags & IORING_SETUP_SQPOLL) +		io_cqring_wake(ctx);  }  /* Returns true if there are no backlogged entries after the flush */ @@ -1830,6 +2004,18 @@ static inline void io_get_task_refs(int nr)  		io_task_refs_refill(tctx);  } +static __cold void io_uring_drop_tctx_refs(struct task_struct *task) +{ +	struct io_uring_task *tctx = task->io_uring; +	unsigned int refs = tctx->cached_refs; + +	if (refs) { +		tctx->cached_refs = 0; +		percpu_counter_sub(&tctx->inflight, refs); +		put_task_struct_many(task, refs); +	} +} +  static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,  				     s32 res, u32 cflags)  { @@ -1858,13 +2044,11 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,  	return true;  } -static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, -					  s32 res, u32 cflags) +static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, +				 s32 res, u32 cflags)  {  	struct io_uring_cqe *cqe; -	trace_io_uring_complete(ctx, user_data, res, cflags); -  	/*  	 * If we can't get a cq entry, userspace overflowed the  	 * submission (by quite a lot). 
Increment the overflow count in @@ -1880,20 +2064,33 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data  	return io_cqring_event_overflow(ctx, user_data, res, cflags);  } -/* not as hot to bloat with inlining */ -static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, -					  s32 res, u32 cflags) +static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)  { -	return __io_cqring_fill_event(ctx, user_data, res, cflags); +	trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); +	return __io_fill_cqe(req->ctx, req->user_data, res, cflags);  } -static void io_req_complete_post(struct io_kiocb *req, s32 res, -				 u32 cflags) +static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) +{ +	if (!(req->flags & REQ_F_CQE_SKIP)) +		__io_fill_cqe_req(req, res, cflags); +} + +static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, +				     s32 res, u32 cflags) +{ +	ctx->cq_extra++; +	trace_io_uring_complete(ctx, NULL, user_data, res, cflags); +	return __io_fill_cqe(ctx, user_data, res, cflags); +} + +static void __io_req_complete_post(struct io_kiocb *req, s32 res, +				   u32 cflags)  {  	struct io_ring_ctx *ctx = req->ctx; -	spin_lock(&ctx->completion_lock); -	__io_cqring_fill_event(ctx, req->user_data, res, cflags); +	if (!(req->flags & REQ_F_CQE_SKIP)) +		__io_fill_cqe_req(req, res, cflags);  	/*  	 * If we're the last reference to this request, add to our locked  	 * free_list cache. @@ -1913,6 +2110,15 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,  		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);  		ctx->locked_free_nr++;  	} +} + +static void io_req_complete_post(struct io_kiocb *req, s32 res, +				 u32 cflags) +{ +	struct io_ring_ctx *ctx = req->ctx; + +	spin_lock(&ctx->completion_lock); +	__io_req_complete_post(req, res, cflags);  	io_commit_cqring(ctx);  	spin_unlock(&ctx->completion_lock);  	io_cqring_ev_posted(ctx); @@ -1943,7 +2149,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res)  static void io_req_complete_failed(struct io_kiocb *req, s32 res)  {  	req_set_fail(req); -	io_req_complete_post(req, res, 0); +	io_req_complete_post(req, res, io_put_kbuf(req, 0));  }  static void io_req_complete_fail_submit(struct io_kiocb *req) @@ -2101,8 +2307,8 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)  		link->timeout.head = NULL;  		if (hrtimer_try_to_cancel(&io->timer) != -1) {  			list_del(&link->timeout.list); -			io_cqring_fill_event(link->ctx, link->user_data, -					     -ECANCELED, 0); +			/* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ +			io_fill_cqe_req(link, -ECANCELED, 0);  			io_put_req_deferred(link);  			return true;  		} @@ -2114,6 +2320,7 @@ static void io_fail_links(struct io_kiocb *req)  	__must_hold(&req->ctx->completion_lock)  {  	struct io_kiocb *nxt, *link = req->link; +	bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;  	req->link = NULL;  	while (link) { @@ -2125,8 +2332,13 @@ static void io_fail_links(struct io_kiocb *req)  		nxt = link->link;  		link->link = NULL; -		trace_io_uring_fail_link(req, link); -		io_cqring_fill_event(link->ctx, link->user_data, res, 0); +		trace_io_uring_fail_link(req->ctx, req, req->user_data, +					req->opcode, link); + +		if (!ignore_cqes) { +			link->flags &= ~REQ_F_CQE_SKIP; +			io_fill_cqe_req(link, res, 0); +		}  		io_put_req_deferred(link);  		link = nxt;  	} @@ -2143,8 +2355,8 @@ static bool io_disarm_next(struct io_kiocb *req)  		
req->flags &= ~REQ_F_ARM_LTIMEOUT;  		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {  			io_remove_next_linked(req); -			io_cqring_fill_event(link->ctx, link->user_data, -					     -ECANCELED, 0); +			/* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ +			io_fill_cqe_req(link, -ECANCELED, 0);  			io_put_req_deferred(link);  			posted = true;  		} @@ -2171,7 +2383,7 @@ static void __io_req_find_next_prep(struct io_kiocb *req)  	spin_lock(&ctx->completion_lock);  	posted = io_disarm_next(req);  	if (posted) -		io_commit_cqring(req->ctx); +		io_commit_cqring(ctx);  	spin_unlock(&ctx->completion_lock);  	if (posted)  		io_cqring_ev_posted(ctx); @@ -2208,51 +2420,109 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)  	percpu_ref_put(&ctx->refs);  } +static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx) +{ +	io_commit_cqring(ctx); +	spin_unlock(&ctx->completion_lock); +	io_cqring_ev_posted(ctx); +} + +static void handle_prev_tw_list(struct io_wq_work_node *node, +				struct io_ring_ctx **ctx, bool *uring_locked) +{ +	if (*ctx && !*uring_locked) +		spin_lock(&(*ctx)->completion_lock); + +	do { +		struct io_wq_work_node *next = node->next; +		struct io_kiocb *req = container_of(node, struct io_kiocb, +						    io_task_work.node); + +		if (req->ctx != *ctx) { +			if (unlikely(!*uring_locked && *ctx)) +				ctx_commit_and_unlock(*ctx); + +			ctx_flush_and_put(*ctx, uring_locked); +			*ctx = req->ctx; +			/* if not contended, grab and improve batching */ +			*uring_locked = mutex_trylock(&(*ctx)->uring_lock); +			percpu_ref_get(&(*ctx)->refs); +			if (unlikely(!*uring_locked)) +				spin_lock(&(*ctx)->completion_lock); +		} +		if (likely(*uring_locked)) +			req->io_task_work.func(req, uring_locked); +		else +			__io_req_complete_post(req, req->result, +						io_put_kbuf_comp(req)); +		node = next; +	} while (node); + +	if (unlikely(!*uring_locked)) +		ctx_commit_and_unlock(*ctx); +} + +static void handle_tw_list(struct io_wq_work_node *node, +			   struct io_ring_ctx **ctx, bool *locked) +{ +	do { +		struct io_wq_work_node *next = node->next; +		struct io_kiocb *req = container_of(node, struct io_kiocb, +						    io_task_work.node); + +		if (req->ctx != *ctx) { +			ctx_flush_and_put(*ctx, locked); +			*ctx = req->ctx; +			/* if not contended, grab and improve batching */ +			*locked = mutex_trylock(&(*ctx)->uring_lock); +			percpu_ref_get(&(*ctx)->refs); +		} +		req->io_task_work.func(req, locked); +		node = next; +	} while (node); +} +  static void tctx_task_work(struct callback_head *cb)  { -	bool locked = false; +	bool uring_locked = false;  	struct io_ring_ctx *ctx = NULL;  	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,  						  task_work);  	while (1) { -		struct io_wq_work_node *node; +		struct io_wq_work_node *node1, *node2; -		if (!tctx->task_list.first && locked) +		if (!tctx->task_list.first && +		    !tctx->prior_task_list.first && uring_locked)  			io_submit_flush_completions(ctx);  		spin_lock_irq(&tctx->task_lock); -		node = tctx->task_list.first; +		node1 = tctx->prior_task_list.first; +		node2 = tctx->task_list.first;  		INIT_WQ_LIST(&tctx->task_list); -		if (!node) +		INIT_WQ_LIST(&tctx->prior_task_list); +		if (!node2 && !node1)  			tctx->task_running = false;  		spin_unlock_irq(&tctx->task_lock); -		if (!node) +		if (!node2 && !node1)  			break; -		do { -			struct io_wq_work_node *next = node->next; -			struct io_kiocb *req = container_of(node, struct io_kiocb, -							    io_task_work.node); - -			if (req->ctx != ctx) { -			
	ctx_flush_and_put(ctx, &locked); -				ctx = req->ctx; -				/* if not contended, grab and improve batching */ -				locked = mutex_trylock(&ctx->uring_lock); -				percpu_ref_get(&ctx->refs); -			} -			req->io_task_work.func(req, &locked); -			node = next; -		} while (node); +		if (node1) +			handle_prev_tw_list(node1, &ctx, &uring_locked); +		if (node2) +			handle_tw_list(node2, &ctx, &uring_locked);  		cond_resched();  	} -	ctx_flush_and_put(ctx, &locked); +	ctx_flush_and_put(ctx, &uring_locked); + +	/* relaxed read is enough as only the task itself sets ->in_idle */ +	if (unlikely(atomic_read(&tctx->in_idle))) +		io_uring_drop_tctx_refs(current);  } -static void io_req_task_work_add(struct io_kiocb *req) +static void io_req_task_work_add(struct io_kiocb *req, bool priority)  {  	struct task_struct *tsk = req->task;  	struct io_uring_task *tctx = tsk->io_uring; @@ -2264,7 +2534,10 @@ static void io_req_task_work_add(struct io_kiocb *req)  	WARN_ON_ONCE(!tctx);  	spin_lock_irqsave(&tctx->task_lock, flags); -	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); +	if (priority) +		wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list); +	else +		wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);  	running = tctx->task_running;  	if (!running)  		tctx->task_running = true; @@ -2289,8 +2562,7 @@ static void io_req_task_work_add(struct io_kiocb *req)  	spin_lock_irqsave(&tctx->task_lock, flags);  	tctx->task_running = false; -	node = tctx->task_list.first; -	INIT_WQ_LIST(&tctx->task_list); +	node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);  	spin_unlock_irqrestore(&tctx->task_lock, flags);  	while (node) { @@ -2327,19 +2599,19 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret)  {  	req->result = ret;  	req->io_task_work.func = io_req_task_cancel; -	io_req_task_work_add(req); +	io_req_task_work_add(req, false);  }  static void io_req_task_queue(struct io_kiocb *req)  {  	req->io_task_work.func = io_req_task_submit; -	io_req_task_work_add(req); +	io_req_task_work_add(req, false);  }  static void io_req_task_queue_reissue(struct io_kiocb *req)  {  	req->io_task_work.func = io_queue_async_work; -	io_req_task_work_add(req); +	io_req_task_work_add(req, false);  }  static inline void io_queue_next(struct io_kiocb *req) @@ -2403,17 +2675,30 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)  	struct io_wq_work_node *node, *prev;  	struct io_submit_state *state = &ctx->submit_state; -	spin_lock(&ctx->completion_lock); -	wq_list_for_each(node, prev, &state->compl_reqs) { -		struct io_kiocb *req = container_of(node, struct io_kiocb, +	if (state->flush_cqes) { +		spin_lock(&ctx->completion_lock); +		wq_list_for_each(node, prev, &state->compl_reqs) { +			struct io_kiocb *req = container_of(node, struct io_kiocb,  						    comp_list); -		__io_cqring_fill_event(ctx, req->user_data, req->result, -					req->cflags); +			if (!(req->flags & REQ_F_CQE_SKIP)) +				__io_fill_cqe_req(req, req->result, req->cflags); +			if ((req->flags & REQ_F_POLLED) && req->apoll) { +				struct async_poll *apoll = req->apoll; + +				if (apoll->double_poll) +					kfree(apoll->double_poll); +				list_add(&apoll->poll.wait.entry, +						&ctx->apoll_cache); +				req->flags &= ~REQ_F_POLLED; +			} +		} + +		io_commit_cqring(ctx); +		spin_unlock(&ctx->completion_lock); +		io_cqring_ev_posted(ctx); +		state->flush_cqes = false;  	} -	io_commit_cqring(ctx); -	spin_unlock(&ctx->completion_lock); -	io_cqring_ev_posted(ctx);  	io_free_batch_list(ctx, 
state->compl_reqs.first);  	INIT_WQ_LIST(&state->compl_reqs); @@ -2444,7 +2729,7 @@ static inline void io_put_req_deferred(struct io_kiocb *req)  {  	if (req_ref_put_and_test(req)) {  		req->io_task_work.func = io_free_req_work; -		io_req_task_work_add(req); +		io_req_task_work_add(req, false);  	}  } @@ -2463,24 +2748,6 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)  	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;  } -static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) -{ -	unsigned int cflags; - -	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; -	cflags |= IORING_CQE_F_BUFFER; -	req->flags &= ~REQ_F_BUFFER_SELECTED; -	kfree(kbuf); -	return cflags; -} - -static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) -{ -	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) -		return 0; -	return io_put_kbuf(req, req->kbuf); -} -  static inline bool io_run_task_work(void)  {  	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { @@ -2543,8 +2810,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)  		/* order with io_complete_rw_iopoll(), e.g. ->result updates */  		if (!smp_load_acquire(&req->iopoll_completed))  			break; -		__io_cqring_fill_event(ctx, req->user_data, req->result, -					io_put_rw_kbuf(req)); +		if (unlikely(req->flags & REQ_F_CQE_SKIP)) +			continue; + +		__io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));  		nr_events++;  	} @@ -2718,25 +2987,26 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)  	return false;  } -static void io_req_task_complete(struct io_kiocb *req, bool *locked) +static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)  { -	unsigned int cflags = io_put_rw_kbuf(req);  	int res = req->result;  	if (*locked) { -		io_req_complete_state(req, res, cflags); +		io_req_complete_state(req, res, io_put_kbuf(req, 0));  		io_req_add_compl_list(req);  	} else { -		io_req_complete_post(req, res, cflags); +		io_req_complete_post(req, res, +					io_put_kbuf(req, IO_URING_F_UNLOCKED));  	}  } -static void __io_complete_rw(struct io_kiocb *req, long res, long res2, +static void __io_complete_rw(struct io_kiocb *req, long res,  			     unsigned int issue_flags)  {  	if (__io_complete_rw_common(req, res))  		return; -	__io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req)); +	__io_req_complete(req, issue_flags, req->result, +				io_put_kbuf(req, issue_flags));  }  static void io_complete_rw(struct kiocb *kiocb, long res) @@ -2747,7 +3017,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)  		return;  	req->result = res;  	req->io_task_work.func = io_req_task_complete; -	io_req_task_work_add(req); +	io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));  }  static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) @@ -2891,14 +3161,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)  		req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;  	kiocb->ki_pos = READ_ONCE(sqe->off); -	if (kiocb->ki_pos == -1) { -		if (!(file->f_mode & FMODE_STREAM)) { -			req->flags |= REQ_F_CUR_POS; -			kiocb->ki_pos = file->f_pos; -		} else { -			kiocb->ki_pos = 0; -		} -	}  	kiocb->ki_flags = iocb_flags(file);  	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));  	if (unlikely(ret)) @@ -2965,10 +3227,27 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)  	}  } -static void kiocb_done(struct kiocb *kiocb, ssize_t ret, +static inline 
loff_t *io_kiocb_update_pos(struct io_kiocb *req) +{ +	struct kiocb *kiocb = &req->rw.kiocb; +	bool is_stream = req->file->f_mode & FMODE_STREAM; + +	if (kiocb->ki_pos == -1) { +		if (!is_stream) { +			req->flags |= REQ_F_CUR_POS; +			kiocb->ki_pos = req->file->f_pos; +			return &kiocb->ki_pos; +		} else { +			kiocb->ki_pos = 0; +			return NULL; +		} +	} +	return is_stream ? NULL : &kiocb->ki_pos; +} + +static void kiocb_done(struct io_kiocb *req, ssize_t ret,  		       unsigned int issue_flags)  { -	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);  	struct io_async_rw *io = req->async_data;  	/* add previously done IO, if any */ @@ -2980,29 +3259,18 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,  	}  	if (req->flags & REQ_F_CUR_POS) -		req->file->f_pos = kiocb->ki_pos; -	if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) -		__io_complete_rw(req, ret, 0, issue_flags); +		req->file->f_pos = req->rw.kiocb.ki_pos; +	if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw)) +		__io_complete_rw(req, ret, issue_flags);  	else -		io_rw_done(kiocb, ret); +		io_rw_done(&req->rw.kiocb, ret);  	if (req->flags & REQ_F_REISSUE) {  		req->flags &= ~REQ_F_REISSUE; -		if (io_resubmit_prep(req)) { +		if (io_resubmit_prep(req))  			io_req_task_queue_reissue(req); -		} else { -			unsigned int cflags = io_put_rw_kbuf(req); -			struct io_ring_ctx *ctx = req->ctx; - -			req_set_fail(req); -			if (issue_flags & IO_URING_F_UNLOCKED) { -				mutex_lock(&ctx->uring_lock); -				__io_req_complete(req, issue_flags, ret, cflags); -				mutex_unlock(&ctx->uring_lock); -			} else { -				__io_req_complete(req, issue_flags, ret, cflags); -			} -		} +		else +			io_req_task_queue_fail(req, ret);  	}  } @@ -3100,30 +3368,36 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)  		mutex_lock(&ctx->uring_lock);  } +static void io_buffer_add_list(struct io_ring_ctx *ctx, +			       struct io_buffer_list *bl, unsigned int bgid) +{ +	struct list_head *list; + +	list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; +	INIT_LIST_HEAD(&bl->buf_list); +	bl->bgid = bgid; +	list_add(&bl->list, list); +} +  static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,  					  int bgid, unsigned int issue_flags)  {  	struct io_buffer *kbuf = req->kbuf; -	struct io_buffer *head;  	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; +	struct io_ring_ctx *ctx = req->ctx; +	struct io_buffer_list *bl;  	if (req->flags & REQ_F_BUFFER_SELECTED)  		return kbuf; -	io_ring_submit_lock(req->ctx, needs_lock); +	io_ring_submit_lock(ctx, needs_lock); -	lockdep_assert_held(&req->ctx->uring_lock); +	lockdep_assert_held(&ctx->uring_lock); -	head = xa_load(&req->ctx->io_buffers, bgid); -	if (head) { -		if (!list_empty(&head->list)) { -			kbuf = list_last_entry(&head->list, struct io_buffer, -							list); -			list_del(&kbuf->list); -		} else { -			kbuf = head; -			xa_erase(&req->ctx->io_buffers, bgid); -		} +	bl = io_buffer_get_list(ctx, bgid); +	if (bl && !list_empty(&bl->buf_list)) { +		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); +		list_del(&kbuf->list);  		if (*len > kbuf->len)  			*len = kbuf->len;  		req->flags |= REQ_F_BUFFER_SELECTED; @@ -3229,10 +3503,12 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,  	size_t sqe_len;  	ssize_t ret; -	BUILD_BUG_ON(ERR_PTR(0) != NULL); - -	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) -		return ERR_PTR(io_import_fixed(req, rw, iter)); +	if (opcode == 
IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { +		ret = io_import_fixed(req, rw, iter); +		if (ret) +			return ERR_PTR(ret); +		return NULL; +	}  	/* buffer index only valid with fixed read/write, or buffer select  */  	if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) @@ -3250,15 +3526,18 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,  		}  		ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter); -		return ERR_PTR(ret); +		if (ret) +			return ERR_PTR(ret); +		return NULL;  	}  	iovec = s->fast_iov;  	if (req->flags & REQ_F_BUFFER_SELECT) {  		ret = io_iov_buffer_select(req, iovec, issue_flags); -		if (!ret) -			iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); -		return ERR_PTR(ret); +		if (ret) +			return ERR_PTR(ret); +		iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); +		return NULL;  	}  	ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, @@ -3294,6 +3573,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)  	struct kiocb *kiocb = &req->rw.kiocb;  	struct file *file = req->file;  	ssize_t ret = 0; +	loff_t *ppos;  	/*  	 * Don't support polled IO through this interface, and we can't @@ -3306,6 +3586,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)  	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))  		return -EAGAIN; +	ppos = io_kiocb_ppos(kiocb); +  	while (iov_iter_count(iter)) {  		struct iovec iovec;  		ssize_t nr; @@ -3319,10 +3601,10 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)  		if (rw == READ) {  			nr = file->f_op->read(file, iovec.iov_base, -					      iovec.iov_len, io_kiocb_ppos(kiocb)); +					      iovec.iov_len, ppos);  		} else {  			nr = file->f_op->write(file, iovec.iov_base, -					       iovec.iov_len, io_kiocb_ppos(kiocb)); +					       iovec.iov_len, ppos);  		}  		if (nr < 0) { @@ -3330,13 +3612,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)  				ret = nr;  			break;  		} +		ret += nr;  		if (!iov_iter_is_bvec(iter)) {  			iov_iter_advance(iter, nr);  		} else { -			req->rw.len -= nr;  			req->rw.addr += nr; +			req->rw.len -= nr; +			if (!req->rw.len) +				break;  		} -		ret += nr;  		if (nr != iovec.iov_len)  			break;  	} @@ -3523,12 +3807,23 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;  	struct io_async_rw *rw;  	ssize_t ret, ret2; +	loff_t *ppos;  	if (!req_has_async_data(req)) {  		ret = io_import_iovec(READ, req, &iovec, s, issue_flags);  		if (unlikely(ret < 0))  			return ret;  	} else { +		/* +		 * Safe and required to re-import if we're using provided +		 * buffers, as we dropped the selected one before retry. 
+		 */ +		if (req->flags & REQ_F_BUFFER_SELECT) { +			ret = io_import_iovec(READ, req, &iovec, s, issue_flags); +			if (unlikely(ret < 0)) +				return ret; +		} +  		rw = req->async_data;  		s = &rw->s;  		/* @@ -3553,7 +3848,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)  		kiocb->ki_flags &= ~IOCB_NOWAIT;  	} -	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); +	ppos = io_kiocb_update_pos(req); + +	ret = rw_verify_area(READ, req->file, ppos, req->result);  	if (unlikely(ret)) {  		kfree(iovec);  		return ret; @@ -3563,6 +3860,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)  	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {  		req->flags &= ~REQ_F_REISSUE; +		/* if we can poll, just do that */ +		if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) +			return -EAGAIN;  		/* IOPOLL retry should happen for io-wq threads */  		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))  			goto done; @@ -3629,7 +3929,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)  		iov_iter_restore(&s->iter, &s->iter_state);  	} while (ret > 0);  done: -	kiocb_done(kiocb, ret, issue_flags); +	kiocb_done(req, ret, issue_flags);  out_free:  	/* it's faster to check here then delegate to kfree */  	if (iovec) @@ -3652,6 +3952,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)  	struct kiocb *kiocb = &req->rw.kiocb;  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;  	ssize_t ret, ret2; +	loff_t *ppos;  	if (!req_has_async_data(req)) {  		ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); @@ -3682,7 +3983,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)  		kiocb->ki_flags &= ~IOCB_NOWAIT;  	} -	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); +	ppos = io_kiocb_update_pos(req); + +	ret = rw_verify_area(WRITE, req->file, ppos, req->result);  	if (unlikely(ret))  		goto out_free; @@ -3726,7 +4029,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)  		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))  			goto copy_iov;  done: -		kiocb_done(kiocb, ret2, issue_flags); +		kiocb_done(req, ret2, issue_flags);  	} else {  copy_iov:  		iov_iter_restore(&s->iter, &s->iter_state); @@ -4129,6 +4432,45 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags)  	return 0;  } +static int io_msg_ring_prep(struct io_kiocb *req, +			    const struct io_uring_sqe *sqe) +{ +	if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags || +		     sqe->splice_fd_in || sqe->buf_index || sqe->personality)) +		return -EINVAL; + +	if (req->file->f_op != &io_uring_fops) +		return -EBADFD; + +	req->msg.user_data = READ_ONCE(sqe->off); +	req->msg.len = READ_ONCE(sqe->len); +	return 0; +} + +static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) +{ +	struct io_ring_ctx *target_ctx; +	struct io_msg *msg = &req->msg; +	int ret = -EOVERFLOW; +	bool filled; + +	target_ctx = req->file->private_data; + +	spin_lock(&target_ctx->completion_lock); +	filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, +					IORING_CQE_F_MSG); +	io_commit_cqring(target_ctx); +	spin_unlock(&target_ctx->completion_lock); + +	if (filled) { +		io_cqring_ev_posted(target_ctx); +		ret = 0; +	} + +	__io_req_complete(req, issue_flags, ret, 0); +	return 0; +} +  static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  	struct io_ring_ctx *ctx = req->ctx; @@ -4352,8 +4694,8 @@ static 
int io_remove_buffers_prep(struct io_kiocb *req,  	return 0;  } -static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, -			       int bgid, unsigned nbufs) +static int __io_remove_buffers(struct io_ring_ctx *ctx, +			       struct io_buffer_list *bl, unsigned nbufs)  {  	unsigned i = 0; @@ -4362,19 +4704,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,  		return 0;  	/* the head kbuf is the list itself */ -	while (!list_empty(&buf->list)) { +	while (!list_empty(&bl->buf_list)) {  		struct io_buffer *nxt; -		nxt = list_first_entry(&buf->list, struct io_buffer, list); +		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);  		list_del(&nxt->list); -		kfree(nxt);  		if (++i == nbufs)  			return i;  		cond_resched();  	}  	i++; -	kfree(buf); -	xa_erase(&ctx->io_buffers, bgid);  	return i;  } @@ -4383,7 +4722,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_provide_buf *p = &req->pbuf;  	struct io_ring_ctx *ctx = req->ctx; -	struct io_buffer *head; +	struct io_buffer_list *bl;  	int ret = 0;  	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; @@ -4392,9 +4731,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)  	lockdep_assert_held(&ctx->uring_lock);  	ret = -ENOENT; -	head = xa_load(&ctx->io_buffers, p->bgid); -	if (head) -		ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); +	bl = io_buffer_get_list(ctx, p->bgid); +	if (bl) +		ret = __io_remove_buffers(ctx, bl, p->nbufs);  	if (ret < 0)  		req_set_fail(req); @@ -4439,38 +4778,80 @@ static int io_provide_buffers_prep(struct io_kiocb *req,  	return 0;  } -static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) +static int io_refill_buffer_cache(struct io_ring_ctx *ctx) +{ +	struct io_buffer *buf; +	struct page *page; +	int bufs_in_page; + +	/* +	 * Completions that don't happen inline (eg not under uring_lock) will +	 * add to ->io_buffers_comp. If we don't have any free buffers, check +	 * the completion list and splice those entries first. +	 */ +	if (!list_empty_careful(&ctx->io_buffers_comp)) { +		spin_lock(&ctx->completion_lock); +		if (!list_empty(&ctx->io_buffers_comp)) { +			list_splice_init(&ctx->io_buffers_comp, +						&ctx->io_buffers_cache); +			spin_unlock(&ctx->completion_lock); +			return 0; +		} +		spin_unlock(&ctx->completion_lock); +	} + +	/* +	 * No free buffers and no completion entries either. Allocate a new +	 * page worth of buffer entries and add those to our freelist. 
+	 */ +	page = alloc_page(GFP_KERNEL_ACCOUNT); +	if (!page) +		return -ENOMEM; + +	list_add(&page->lru, &ctx->io_buffers_pages); + +	buf = page_address(page); +	bufs_in_page = PAGE_SIZE / sizeof(*buf); +	while (bufs_in_page) { +		list_add_tail(&buf->list, &ctx->io_buffers_cache); +		buf++; +		bufs_in_page--; +	} + +	return 0; +} + +static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, +			  struct io_buffer_list *bl)  {  	struct io_buffer *buf;  	u64 addr = pbuf->addr;  	int i, bid = pbuf->bid;  	for (i = 0; i < pbuf->nbufs; i++) { -		buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); -		if (!buf) +		if (list_empty(&ctx->io_buffers_cache) && +		    io_refill_buffer_cache(ctx))  			break; - +		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, +					list); +		list_move_tail(&buf->list, &bl->buf_list);  		buf->addr = addr;  		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);  		buf->bid = bid; +		buf->bgid = pbuf->bgid;  		addr += pbuf->len;  		bid++; -		if (!*head) { -			INIT_LIST_HEAD(&buf->list); -			*head = buf; -		} else { -			list_add_tail(&buf->list, &(*head)->list); -		} +		cond_resched();  	} -	return i ? i : -ENOMEM; +	return i ? 0 : -ENOMEM;  }  static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_provide_buf *p = &req->pbuf;  	struct io_ring_ctx *ctx = req->ctx; -	struct io_buffer *head, *list; +	struct io_buffer_list *bl;  	int ret = 0;  	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; @@ -4478,14 +4859,18 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)  	lockdep_assert_held(&ctx->uring_lock); -	list = head = xa_load(&ctx->io_buffers, p->bgid); - -	ret = io_add_buffers(p, &head); -	if (ret >= 0 && !list) { -		ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); -		if (ret < 0) -			__io_remove_buffers(ctx, head, p->bgid, -1U); +	bl = io_buffer_get_list(ctx, p->bgid); +	if (unlikely(!bl)) { +		bl = kmalloc(sizeof(*bl), GFP_KERNEL); +		if (!bl) { +			ret = -ENOMEM; +			goto err; +		} +		io_buffer_add_list(ctx, bl, p->bgid);  	} + +	ret = io_add_buffers(ctx, p, bl); +err:  	if (ret < 0)  		req_set_fail(req);  	/* complete before unlock, IOPOLL may need the lock */ @@ -4615,6 +5000,8 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)  static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  { +	const char __user *path; +  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))  		return -EINVAL;  	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) @@ -4624,10 +5011,22 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	req->statx.dfd = READ_ONCE(sqe->fd);  	req->statx.mask = READ_ONCE(sqe->len); -	req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr)); +	path = u64_to_user_ptr(READ_ONCE(sqe->addr));  	req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));  	req->statx.flags = READ_ONCE(sqe->statx_flags); +	req->statx.filename = getname_flags(path, +					getname_statx_lookup_flags(req->statx.flags), +					NULL); + +	if (IS_ERR(req->statx.filename)) { +		int ret = PTR_ERR(req->statx.filename); + +		req->statx.filename = NULL; +		return ret; +	} + +	req->flags |= REQ_F_NEED_CLEANUP;  	return 0;  } @@ -4839,17 +5238,18 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)  		min_ret = iov_iter_count(&kmsg->msg.msg_iter);  	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); -	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) -		return 
io_setup_async_msg(req, kmsg); -	if (ret == -ERESTARTSYS) -		ret = -EINTR; +	if (ret < min_ret) { +		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) +			return io_setup_async_msg(req, kmsg); +		if (ret == -ERESTARTSYS) +			ret = -EINTR; +		req_set_fail(req); +	}  	/* fast path, check for non-NULL to avoid function call */  	if (kmsg->free_iov)  		kfree(kmsg->free_iov);  	req->flags &= ~REQ_F_NEED_CLEANUP; -	if (ret < min_ret) -		req_set_fail(req);  	__io_req_complete(req, issue_flags, ret, 0);  	return 0;  } @@ -4885,13 +5285,13 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)  	msg.msg_flags = flags;  	ret = sock_sendmsg(sock, &msg); -	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) -		return -EAGAIN; -	if (ret == -ERESTARTSYS) -		ret = -EINTR; - -	if (ret < min_ret) +	if (ret < min_ret) { +		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) +			return -EAGAIN; +		if (ret == -ERESTARTSYS) +			ret = -EINTR;  		req_set_fail(req); +	}  	__io_req_complete(req, issue_flags, ret, 0);  	return 0;  } @@ -4991,11 +5391,6 @@ static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,  	return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);  } -static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) -{ -	return io_put_kbuf(req, req->kbuf); -} -  static int io_recvmsg_prep_async(struct io_kiocb *req)  {  	int ret; @@ -5033,8 +5428,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)  	struct socket *sock;  	struct io_buffer *kbuf;  	unsigned flags; -	int min_ret = 0; -	int ret, cflags = 0; +	int ret, min_ret = 0;  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;  	sock = sock_from_file(req->file); @@ -5068,20 +5462,21 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)  	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,  					kmsg->uaddr, flags); -	if (force_nonblock && ret == -EAGAIN) -		return io_setup_async_msg(req, kmsg); -	if (ret == -ERESTARTSYS) -		ret = -EINTR; +	if (ret < min_ret) { +		if (ret == -EAGAIN && force_nonblock) +			return io_setup_async_msg(req, kmsg); +		if (ret == -ERESTARTSYS) +			ret = -EINTR; +		req_set_fail(req); +	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { +		req_set_fail(req); +	} -	if (req->flags & REQ_F_BUFFER_SELECTED) -		cflags = io_put_recv_kbuf(req);  	/* fast path, check for non-NULL to avoid function call */  	if (kmsg->free_iov)  		kfree(kmsg->free_iov);  	req->flags &= ~REQ_F_NEED_CLEANUP; -	if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) -		req_set_fail(req); -	__io_req_complete(req, issue_flags, ret, cflags); +	__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));  	return 0;  } @@ -5094,8 +5489,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)  	struct socket *sock;  	struct iovec iov;  	unsigned flags; -	int min_ret = 0; -	int ret, cflags = 0; +	int ret, min_ret = 0;  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;  	sock = sock_from_file(req->file); @@ -5127,16 +5521,18 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)  		min_ret = iov_iter_count(&msg.msg_iter);  	ret = sock_recvmsg(sock, &msg, flags); -	if (force_nonblock && ret == -EAGAIN) -		return -EAGAIN; -	if (ret == -ERESTARTSYS) -		ret = -EINTR; +	if (ret < min_ret) { +		if (ret == -EAGAIN && force_nonblock) +			return -EAGAIN; +		if (ret == -ERESTARTSYS) +			ret = -EINTR; +		req_set_fail(req); +	
} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {  out_free: -	if (req->flags & REQ_F_BUFFER_SELECTED) -		cflags = io_put_recv_kbuf(req); -	if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))  		req_set_fail(req); -	__io_req_complete(req, issue_flags, ret, cflags); +	} + +	__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));  	return 0;  } @@ -5155,8 +5551,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	accept->nofile = rlimit(RLIMIT_NOFILE);  	accept->file_slot = READ_ONCE(sqe->file_index); -	if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || -				  (accept->flags & SOCK_CLOEXEC))) +	if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))  		return -EINVAL;  	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))  		return -EINVAL; @@ -5296,59 +5691,132 @@ IO_NETOP_FN(send);  IO_NETOP_FN(recv);  #endif /* CONFIG_NET */ -struct io_poll_table { -	struct poll_table_struct pt; -	struct io_kiocb *req; -	int nr_entries; -	int error; +#ifdef CONFIG_NET_RX_BUSY_POLL + +#define NAPI_TIMEOUT			(60 * SEC_CONVERSION) + +struct napi_entry { +	struct list_head	list; +	unsigned int		napi_id; +	unsigned long		timeout;  }; -static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, -			   __poll_t mask, io_req_tw_func_t func) +/* + * Add busy poll NAPI ID from sk. + */ +static void io_add_napi(struct file *file, struct io_ring_ctx *ctx)  { -	/* for instances that support it check for an event match first: */ -	if (mask && !(mask & poll->events)) -		return 0; +	unsigned int napi_id; +	struct socket *sock; +	struct sock *sk; +	struct napi_entry *ne; -	trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); +	if (!net_busy_loop_on()) +		return; -	list_del_init(&poll->wait.entry); +	sock = sock_from_file(file); +	if (!sock) +		return; -	req->result = mask; -	req->io_task_work.func = func; +	sk = sock->sk; +	if (!sk) +		return; -	/* -	 * If this fails, then the task is exiting. When a task exits, the -	 * work gets canceled, so just cancel this request as well instead -	 * of executing it. We can't safely execute it anyway, as we may not -	 * have the needed state needed for it anyway. 
-	 */ -	io_req_task_work_add(req); -	return 1; +	napi_id = READ_ONCE(sk->sk_napi_id); + +	/* Non-NAPI IDs can be rejected */ +	if (napi_id < MIN_NAPI_ID) +		return; + +	spin_lock(&ctx->napi_lock); +	list_for_each_entry(ne, &ctx->napi_list, list) { +		if (ne->napi_id == napi_id) { +			ne->timeout = jiffies + NAPI_TIMEOUT; +			goto out; +		} +	} + +	ne = kmalloc(sizeof(*ne), GFP_NOWAIT); +	if (!ne) +		goto out; + +	ne->napi_id = napi_id; +	ne->timeout = jiffies + NAPI_TIMEOUT; +	list_add_tail(&ne->list, &ctx->napi_list); +out: +	spin_unlock(&ctx->napi_lock);  } -static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) -	__acquires(&req->ctx->completion_lock) +static inline void io_check_napi_entry_timeout(struct napi_entry *ne)  { -	struct io_ring_ctx *ctx = req->ctx; - -	/* req->task == current here, checking PF_EXITING is safe */ -	if (unlikely(req->task->flags & PF_EXITING)) -		WRITE_ONCE(poll->canceled, true); +	if (time_after(jiffies, ne->timeout)) { +		list_del(&ne->list); +		kfree(ne); +	} +} -	if (!req->result && !READ_ONCE(poll->canceled)) { -		struct poll_table_struct pt = { ._key = poll->events }; +/* + * Busy poll if globally on and supporting sockets found + */ +static bool io_napi_busy_loop(struct list_head *napi_list) +{ +	struct napi_entry *ne, *n; -		req->result = vfs_poll(req->file, &pt) & poll->events; +	list_for_each_entry_safe(ne, n, napi_list, list) { +		napi_busy_loop(ne->napi_id, NULL, NULL, true, +			       BUSY_POLL_BUDGET); +		io_check_napi_entry_timeout(ne);  	} +	return !list_empty(napi_list); +} -	spin_lock(&ctx->completion_lock); -	if (!req->result && !READ_ONCE(poll->canceled)) { -		add_wait_queue(poll->head, &poll->wait); -		return true; +static void io_free_napi_list(struct io_ring_ctx *ctx) +{ +	spin_lock(&ctx->napi_lock); +	while (!list_empty(&ctx->napi_list)) { +		struct napi_entry *ne = +			list_first_entry(&ctx->napi_list, struct napi_entry, +					 list); + +		list_del(&ne->list); +		kfree(ne);  	} +	spin_unlock(&ctx->napi_lock); +} +#else +static inline void io_add_napi(struct file *file, struct io_ring_ctx *ctx) +{ +} -	return false; +static inline void io_free_napi_list(struct io_ring_ctx *ctx) +{ +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +struct io_poll_table { +	struct poll_table_struct pt; +	struct io_kiocb *req; +	int nr_entries; +	int error; +}; + +#define IO_POLL_CANCEL_FLAG	BIT(31) +#define IO_POLL_REF_MASK	((1u << 20)-1) + +/* + * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can + * bump it and acquire ownership. It's disallowed to modify requests while not + * owning it, that prevents from races for enqueueing task_work's and b/w + * arming poll and wakeups. 
+ */ +static inline bool io_poll_get_ownership(struct io_kiocb *req) +{ +	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); +} + +static void io_poll_mark_cancelled(struct io_kiocb *req) +{ +	atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);  }  static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) @@ -5366,133 +5834,255 @@ static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)  	return &req->apoll->poll;  } -static void io_poll_remove_double(struct io_kiocb *req) -	__must_hold(&req->ctx->completion_lock) +static void io_poll_req_insert(struct io_kiocb *req)  { -	struct io_poll_iocb *poll = io_poll_get_double(req); +	struct io_ring_ctx *ctx = req->ctx; +	struct hlist_head *list; -	lockdep_assert_held(&req->ctx->completion_lock); +	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; +	hlist_add_head(&req->hash_node, list); +} -	if (poll && poll->head) { -		struct wait_queue_head *head = poll->head; +static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, +			      wait_queue_func_t wake_func) +{ +	poll->head = NULL; +#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) +	/* mask in events that we always want/need */ +	poll->events = events | IO_POLL_UNMASK; +	INIT_LIST_HEAD(&poll->wait.entry); +	init_waitqueue_func_entry(&poll->wait, wake_func); +} +static inline void io_poll_remove_entry(struct io_poll_iocb *poll) +{ +	struct wait_queue_head *head = smp_load_acquire(&poll->head); + +	if (head) {  		spin_lock_irq(&head->lock);  		list_del_init(&poll->wait.entry); -		if (poll->wait.private) -			req_ref_put(req);  		poll->head = NULL;  		spin_unlock_irq(&head->lock);  	}  } -static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) -	__must_hold(&req->ctx->completion_lock) +static void io_poll_remove_entries(struct io_kiocb *req) +{ +	/* +	 * Nothing to do if neither of those flags are set. Avoid dipping +	 * into the poll/apoll/double cachelines if we can. +	 */ +	if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) +		return; + +	/* +	 * While we hold the waitqueue lock and the waitqueue is nonempty, +	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue +	 * lock in the first place can race with the waitqueue being freed. +	 * +	 * We solve this as eventpoll does: by taking advantage of the fact that +	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If +	 * we enter rcu_read_lock() and see that the pointer to the queue is +	 * non-NULL, we can then lock it without the memory being freed out from +	 * under us. +	 * +	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in +	 * case the caller deletes the entry from the queue, leaving it empty. +	 * In that case, only RCU prevents the queue memory from being freed. +	 */ +	rcu_read_lock(); +	if (req->flags & REQ_F_SINGLE_POLL) +		io_poll_remove_entry(io_poll_get_single(req)); +	if (req->flags & REQ_F_DOUBLE_POLL) +		io_poll_remove_entry(io_poll_get_double(req)); +	rcu_read_unlock(); +} + +/* + * All poll tw should go through this. Checks for poll events, manages + * references, does rewait, etc. + * + * Returns a negative error on failure. >0 when no action require, which is + * either spurious wakeup or multishot CQE is served. 0 when it's done with + * the request, then the mask is stored in req->result. 
+ */ +static int io_poll_check_events(struct io_kiocb *req)  {  	struct io_ring_ctx *ctx = req->ctx; -	unsigned flags = IORING_CQE_F_MORE; -	int error; +	struct io_poll_iocb *poll = io_poll_get_single(req); +	int v; -	if (READ_ONCE(req->poll.canceled)) { -		error = -ECANCELED; -		req->poll.events |= EPOLLONESHOT; -	} else { -		error = mangle_poll(mask); -	} -	if (req->poll.events & EPOLLONESHOT) -		flags = 0; -	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { -		req->poll.events |= EPOLLONESHOT; -		flags = 0; -	} -	if (flags & IORING_CQE_F_MORE) -		ctx->cq_extra++; +	/* req->task == current here, checking PF_EXITING is safe */ +	if (unlikely(req->task->flags & PF_EXITING)) +		io_poll_mark_cancelled(req); + +	do { +		v = atomic_read(&req->poll_refs); + +		/* tw handler should be the owner, and so have some references */ +		if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) +			return 0; +		if (v & IO_POLL_CANCEL_FLAG) +			return -ECANCELED; -	return !(flags & IORING_CQE_F_MORE); +		if (!req->result) { +			struct poll_table_struct pt = { ._key = req->cflags }; + +			req->result = vfs_poll(req->file, &pt) & req->cflags; +		} + +		/* multishot, just fill an CQE and proceed */ +		if (req->result && !(req->cflags & EPOLLONESHOT)) { +			__poll_t mask = mangle_poll(req->result & poll->events); +			bool filled; + +			spin_lock(&ctx->completion_lock); +			filled = io_fill_cqe_aux(ctx, req->user_data, mask, +						 IORING_CQE_F_MORE); +			io_commit_cqring(ctx); +			spin_unlock(&ctx->completion_lock); +			if (unlikely(!filled)) +				return -ECANCELED; +			io_cqring_ev_posted(ctx); +			io_add_napi(req->file, ctx); +		} else if (req->result) { +			return 0; +		} + +		/* +		 * Release all references, retry if someone tried to restart +		 * task_work while we were executing it. 
+		 */ +	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); + +	return 1;  }  static void io_poll_task_func(struct io_kiocb *req, bool *locked)  {  	struct io_ring_ctx *ctx = req->ctx; -	struct io_kiocb *nxt; +	int ret; -	if (io_poll_rewait(req, &req->poll)) { -		spin_unlock(&ctx->completion_lock); +	ret = io_poll_check_events(req); +	if (ret > 0) +		return; + +	if (!ret) { +		req->result = mangle_poll(req->result & req->poll.events);  	} else { -		bool done; +		req->result = ret; +		req_set_fail(req); +	} -		if (req->poll.done) { -			spin_unlock(&ctx->completion_lock); -			return; -		} -		done = __io_poll_complete(req, req->result); -		if (done) { -			io_poll_remove_double(req); -			hash_del(&req->hash_node); -			req->poll.done = true; -		} else { -			req->result = 0; -			add_wait_queue(req->poll.head, &req->poll.wait); -		} -		io_commit_cqring(ctx); -		spin_unlock(&ctx->completion_lock); -		io_cqring_ev_posted(ctx); +	io_poll_remove_entries(req); +	spin_lock(&ctx->completion_lock); +	hash_del(&req->hash_node); +	__io_req_complete_post(req, req->result, 0); +	io_commit_cqring(ctx); +	spin_unlock(&ctx->completion_lock); +	io_cqring_ev_posted(ctx); +} -		if (done) { -			nxt = io_put_req_find_next(req); -			if (nxt) -				io_req_task_submit(nxt, locked); -		} -	} +static void io_apoll_task_func(struct io_kiocb *req, bool *locked) +{ +	struct io_ring_ctx *ctx = req->ctx; +	int ret; + +	ret = io_poll_check_events(req); +	if (ret > 0) +		return; + +	io_poll_remove_entries(req); +	spin_lock(&ctx->completion_lock); +	hash_del(&req->hash_node); +	spin_unlock(&ctx->completion_lock); + +	if (!ret) +		io_req_task_submit(req, locked); +	else +		io_req_complete_failed(req, ret);  } -static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, -			       int sync, void *key) +static void __io_poll_execute(struct io_kiocb *req, int mask, int events) +{ +	req->result = mask; +	/* +	 * This is useful for poll that is armed on behalf of another +	 * request, and where the wakeup path could be on a different +	 * CPU. We want to avoid pulling in req->apoll->events for that +	 * case. 
+	 */ +	req->cflags = events; +	if (req->opcode == IORING_OP_POLL_ADD) +		req->io_task_work.func = io_poll_task_func; +	else +		req->io_task_work.func = io_apoll_task_func; + +	trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask); +	io_req_task_work_add(req, false); +} + +static inline void io_poll_execute(struct io_kiocb *req, int res, int events) +{ +	if (io_poll_get_ownership(req)) +		__io_poll_execute(req, res, events); +} + +static void io_poll_cancel_req(struct io_kiocb *req) +{ +	io_poll_mark_cancelled(req); +	/* kick tw, which should complete the request */ +	io_poll_execute(req, 0, 0); +} + +static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, +			void *key)  {  	struct io_kiocb *req = wait->private; -	struct io_poll_iocb *poll = io_poll_get_single(req); +	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, +						 wait);  	__poll_t mask = key_to_poll(key); -	unsigned long flags; -	/* for instances that support it check for an event match first: */ -	if (mask && !(mask & poll->events)) -		return 0; -	if (!(poll->events & EPOLLONESHOT)) -		return poll->wait.func(&poll->wait, mode, sync, key); +	if (unlikely(mask & POLLFREE)) { +		io_poll_mark_cancelled(req); +		/* we have to kick tw in case it's not already */ +		io_poll_execute(req, 0, poll->events); -	list_del_init(&wait->entry); +		/* +		 * If the waitqueue is being freed early but someone is already +		 * holds ownership over it, we have to tear down the request as +		 * best we can. That means immediately removing the request from +		 * its waitqueue and preventing all further accesses to the +		 * waitqueue via the request. +		 */ +		list_del_init(&poll->wait.entry); + +		/* +		 * Careful: this *must* be the last step, since as soon +		 * as req->head is NULL'ed out, the request can be +		 * completed and freed, since aio_poll_complete_work() +		 * will no longer need to take the waitqueue lock. 
+		 */ +		smp_store_release(&poll->head, NULL); +		return 1; +	} -	if (poll->head) { -		bool done; +	/* for instances that support it check for an event match first */ +	if (mask && !(mask & poll->events)) +		return 0; -		spin_lock_irqsave(&poll->head->lock, flags); -		done = list_empty(&poll->wait.entry); -		if (!done) +	if (io_poll_get_ownership(req)) { +		/* optional, saves extra locking for removal in tw handler */ +		if (mask && poll->events & EPOLLONESHOT) {  			list_del_init(&poll->wait.entry); -		/* make sure double remove sees this as being gone */ -		wait->private = NULL; -		spin_unlock_irqrestore(&poll->head->lock, flags); -		if (!done) { -			/* use wait func handler, so it matches the rq type */ -			poll->wait.func(&poll->wait, mode, sync, key); +			poll->head = NULL; +			req->flags &= ~REQ_F_SINGLE_POLL;  		} +		__io_poll_execute(req, mask, poll->events);  	} -	req_ref_put(req);  	return 1;  } -static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, -			      wait_queue_func_t wake_func) -{ -	poll->head = NULL; -	poll->done = false; -	poll->canceled = false; -#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) -	/* mask in events that we always want/need */ -	poll->events = events | IO_POLL_UNMASK; -	INIT_LIST_HEAD(&poll->wait.entry); -	init_waitqueue_func_entry(&poll->wait, wake_func); -} -  static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,  			    struct wait_queue_head *head,  			    struct io_poll_iocb **poll_ptr) @@ -5505,10 +6095,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,  	 * if this happens.  	 */  	if (unlikely(pt->nr_entries)) { -		struct io_poll_iocb *poll_one = poll; +		struct io_poll_iocb *first = poll;  		/* double add on the same waitqueue head, ignore */ -		if (poll_one->head == head) +		if (first->head == head)  			return;  		/* already have a 2nd entry, fail a third attempt */  		if (*poll_ptr) { @@ -5517,28 +6107,23 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,  			pt->error = -EINVAL;  			return;  		} -		/* -		 * Can't handle multishot for double wait for now, turn it -		 * into one-shot mode. 
-		 */ -		if (!(poll_one->events & EPOLLONESHOT)) -			poll_one->events |= EPOLLONESHOT; +  		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);  		if (!poll) {  			pt->error = -ENOMEM;  			return;  		} -		io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); -		req_ref_get(req); -		poll->wait.private = req; - +		req->flags |= REQ_F_DOUBLE_POLL; +		io_init_poll_iocb(poll, first->events, first->wait.func);  		*poll_ptr = poll;  		if (req->opcode == IORING_OP_POLL_ADD)  			req->flags |= REQ_F_ASYNC_DATA;  	} +	req->flags |= REQ_F_SINGLE_POLL;  	pt->nr_entries++;  	poll->head = head; +	poll->wait.private = req;  	if (poll->events & EPOLLEXCLUSIVE)  		add_wait_queue_exclusive(head, &poll->wait); @@ -5546,70 +6131,24 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,  		add_wait_queue(head, &poll->wait);  } -static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, +static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,  			       struct poll_table_struct *p)  {  	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); -	struct async_poll *apoll = pt->req->apoll; - -	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); -} - -static void io_async_task_func(struct io_kiocb *req, bool *locked) -{ -	struct async_poll *apoll = req->apoll; -	struct io_ring_ctx *ctx = req->ctx; - -	trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); - -	if (io_poll_rewait(req, &apoll->poll)) { -		spin_unlock(&ctx->completion_lock); -		return; -	} - -	hash_del(&req->hash_node); -	io_poll_remove_double(req); -	apoll->poll.done = true; -	spin_unlock(&ctx->completion_lock); - -	if (!READ_ONCE(apoll->poll.canceled)) -		io_req_task_submit(req, locked); -	else -		io_req_complete_failed(req, -ECANCELED); -} - -static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, -			void *key) -{ -	struct io_kiocb *req = wait->private; -	struct io_poll_iocb *poll = &req->apoll->poll; - -	trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, -					key_to_poll(key)); -	return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); +	__io_queue_proc(&pt->req->poll, pt, head, +			(struct io_poll_iocb **) &pt->req->async_data);  } -static void io_poll_req_insert(struct io_kiocb *req) +static int __io_arm_poll_handler(struct io_kiocb *req, +				 struct io_poll_iocb *poll, +				 struct io_poll_table *ipt, __poll_t mask)  {  	struct io_ring_ctx *ctx = req->ctx; -	struct hlist_head *list; - -	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; -	hlist_add_head(&req->hash_node, list); -} - -static __poll_t __io_arm_poll_handler(struct io_kiocb *req, -				      struct io_poll_iocb *poll, -				      struct io_poll_table *ipt, __poll_t mask, -				      wait_queue_func_t wake_func) -	__acquires(&ctx->completion_lock) -{ -	struct io_ring_ctx *ctx = req->ctx; -	bool cancel = false; +	int v;  	INIT_HLIST_NODE(&req->hash_node); -	io_init_poll_iocb(poll, mask, wake_func); +	io_init_poll_iocb(poll, mask, io_poll_wake);  	poll->file = req->file;  	poll->wait.private = req; @@ -5618,31 +6157,55 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,  	ipt->error = 0;  	ipt->nr_entries = 0; +	/* +	 * Take the ownership to delay any tw execution up until we're done +	 * with poll arming. see io_poll_get_ownership(). 
+	 */ +	atomic_set(&req->poll_refs, 1);  	mask = vfs_poll(req->file, &ipt->pt) & poll->events; -	if (unlikely(!ipt->nr_entries) && !ipt->error) -		ipt->error = -EINVAL; + +	if (mask && (poll->events & EPOLLONESHOT)) { +		io_poll_remove_entries(req); +		/* no one else has access to the req, forget about the ref */ +		return mask; +	} +	if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { +		io_poll_remove_entries(req); +		if (!ipt->error) +			ipt->error = -EINVAL; +		return 0; +	}  	spin_lock(&ctx->completion_lock); -	if (ipt->error || (mask && (poll->events & EPOLLONESHOT))) -		io_poll_remove_double(req); -	if (likely(poll->head)) { -		spin_lock_irq(&poll->head->lock); -		if (unlikely(list_empty(&poll->wait.entry))) { -			if (ipt->error) -				cancel = true; -			ipt->error = 0; -			mask = 0; -		} -		if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error) -			list_del_init(&poll->wait.entry); -		else if (cancel) -			WRITE_ONCE(poll->canceled, true); -		else if (!poll->done) /* actually waiting for an event */ -			io_poll_req_insert(req); -		spin_unlock_irq(&poll->head->lock); +	io_poll_req_insert(req); +	spin_unlock(&ctx->completion_lock); + +	if (mask) { +		/* can't multishot if failed, just queue the event we've got */ +		if (unlikely(ipt->error || !ipt->nr_entries)) +			poll->events |= EPOLLONESHOT; +		__io_poll_execute(req, mask, poll->events); +		return 0;  	} +	io_add_napi(req->file, req->ctx); -	return mask; +	/* +	 * Release ownership. If someone tried to queue a tw while it was +	 * locked, kick it off for them. +	 */ +	v = atomic_dec_return(&req->poll_refs); +	if (unlikely(v & IO_POLL_REF_MASK)) +		__io_poll_execute(req, 0, poll->events); +	return 0; +} + +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, +			       struct poll_table_struct *p) +{ +	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); +	struct async_poll *apoll = pt->req->apoll; + +	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);  }  enum { @@ -5651,13 +6214,14 @@ enum {  	IO_APOLL_READY  }; -static int io_arm_poll_handler(struct io_kiocb *req) +static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)  {  	const struct io_op_def *def = &io_op_defs[req->opcode];  	struct io_ring_ctx *ctx = req->ctx;  	struct async_poll *apoll;  	struct io_poll_table ipt; -	__poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI; +	__poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI; +	int ret;  	if (!def->pollin && !def->pollout)  		return IO_APOLL_ABORTED; @@ -5675,63 +6239,30 @@ static int io_arm_poll_handler(struct io_kiocb *req)  		mask |= POLLOUT | POLLWRNORM;  	} -	apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); -	if (unlikely(!apoll)) -		return IO_APOLL_ABORTED; +	if (!(issue_flags & IO_URING_F_UNLOCKED) && +	    !list_empty(&ctx->apoll_cache)) { +		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, +						poll.wait.entry); +		list_del_init(&apoll->poll.wait.entry); +	} else { +		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); +		if (unlikely(!apoll)) +			return IO_APOLL_ABORTED; +	}  	apoll->double_poll = NULL;  	req->apoll = apoll;  	req->flags |= REQ_F_POLLED;  	ipt.pt._qproc = io_async_queue_proc; -	io_req_set_refcount(req); -	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, -					io_async_wake); -	spin_unlock(&ctx->completion_lock); +	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);  	if (ret || ipt.error)  		return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; -	trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, +	trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode,  				mask, apoll->poll.events);  	return IO_APOLL_OK;  } -static bool __io_poll_remove_one(struct io_kiocb *req, -				 struct io_poll_iocb *poll, bool do_cancel) -	__must_hold(&req->ctx->completion_lock) -{ -	bool do_complete = false; - -	if (!poll->head) -		return false; -	spin_lock_irq(&poll->head->lock); -	if (do_cancel) -		WRITE_ONCE(poll->canceled, true); -	if (!list_empty(&poll->wait.entry)) { -		list_del_init(&poll->wait.entry); -		do_complete = true; -	} -	spin_unlock_irq(&poll->head->lock); -	hash_del(&req->hash_node); -	return do_complete; -} - -static bool io_poll_remove_one(struct io_kiocb *req) -	__must_hold(&req->ctx->completion_lock) -{ -	bool do_complete; - -	io_poll_remove_double(req); -	do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); - -	if (do_complete) { -		io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); -		io_commit_cqring(req->ctx); -		req_set_fail(req); -		io_put_req_deferred(req); -	} -	return do_complete; -} -  /*   * Returns true if we found and killed one or more poll requests   */ @@ -5740,7 +6271,8 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,  {  	struct hlist_node *tmp;  	struct io_kiocb *req; -	int posted = 0, i; +	bool found = false; +	int i;  	spin_lock(&ctx->completion_lock);  	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { @@ -5748,16 +6280,14 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,  		list = &ctx->cancel_hash[i];  		hlist_for_each_entry_safe(req, tmp, list, hash_node) { -			if (io_match_task_safe(req, tsk, cancel_all)) -				posted += io_poll_remove_one(req); +			if (io_match_task_safe(req, tsk, cancel_all)) { +				io_poll_cancel_req(req); +				found = true; +			}  		}  	}  	spin_unlock(&ctx->completion_lock); - -	if (posted) -		io_cqring_ev_posted(ctx); - -	return posted != 0; +	return found;  }  static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, @@ -5778,19 +6308,26 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,  	return NULL;  } +static bool io_poll_disarm(struct io_kiocb *req) +	__must_hold(&ctx->completion_lock) +{ +	if (!io_poll_get_ownership(req)) +		return false; +	io_poll_remove_entries(req); +	hash_del(&req->hash_node); +	return true; +} +  static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,  			  bool poll_only)  	__must_hold(&ctx->completion_lock)  { -	struct io_kiocb *req; +	struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only); -	req = io_poll_find(ctx, sqe_addr, poll_only);  	if (!req)  		return -ENOENT; -	if (io_poll_remove_one(req)) -		return 0; - -	return -EALREADY; +	io_poll_cancel_req(req); +	return 0;  }  static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -5840,23 +6377,6 @@ static int io_poll_update_prep(struct io_kiocb *req,  	return 0;  } -static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, -			void *key) -{ -	struct io_kiocb *req = wait->private; -	struct io_poll_iocb *poll = &req->poll; - -	return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); -} - -static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, -			       struct poll_table_struct *p) -{ -	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - -	__io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) 
&pt->req->async_data); -} -  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  	struct io_poll_iocb *poll = &req->poll; @@ -5869,109 +6389,71 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe  	flags = READ_ONCE(sqe->len);  	if (flags & ~IORING_POLL_ADD_MULTI)  		return -EINVAL; +	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) +		return -EINVAL;  	io_req_set_refcount(req); -	poll->events = io_poll_parse_events(sqe, flags); +	req->cflags = poll->events = io_poll_parse_events(sqe, flags);  	return 0;  }  static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_poll_iocb *poll = &req->poll; -	struct io_ring_ctx *ctx = req->ctx;  	struct io_poll_table ipt; -	__poll_t mask; -	bool done; +	int ret;  	ipt.pt._qproc = io_poll_queue_proc; -	mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, -					io_poll_wake); - -	if (mask) { /* no async, we'd stolen it */ -		ipt.error = 0; -		done = __io_poll_complete(req, mask); -		io_commit_cqring(req->ctx); -	} -	spin_unlock(&ctx->completion_lock); - -	if (mask) { -		io_cqring_ev_posted(ctx); -		if (done) -			io_put_req(req); -	} -	return ipt.error; +	ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); +	ret = ret ?: ipt.error; +	if (ret) +		__io_req_complete(req, issue_flags, ret, 0); +	return 0;  }  static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)  {  	struct io_ring_ctx *ctx = req->ctx;  	struct io_kiocb *preq; -	bool completing; -	int ret; +	int ret2, ret = 0; +	bool locked;  	spin_lock(&ctx->completion_lock);  	preq = io_poll_find(ctx, req->poll_update.old_user_data, true); -	if (!preq) { -		ret = -ENOENT; -		goto err; -	} - -	if (!req->poll_update.update_events && !req->poll_update.update_user_data) { -		completing = true; -		ret = io_poll_remove_one(preq) ? 0 : -EALREADY; -		goto err; -	} - -	/* -	 * Don't allow racy completion with singleshot, as we cannot safely -	 * update those. For multishot, if we're racing with completion, just -	 * let completion re-add it. -	 */ -	completing = !__io_poll_remove_one(preq, &preq->poll, false); -	if (completing && (preq->poll.events & EPOLLONESHOT)) { -		ret = -EALREADY; -		goto err; -	} -	/* we now have a detached poll request. reissue. */ -	ret = 0; -err: -	if (ret < 0) { +	if (!preq || !io_poll_disarm(preq)) {  		spin_unlock(&ctx->completion_lock); -		req_set_fail(req); -		io_req_complete(req, ret); -		return 0; -	} -	/* only mask one event flags, keep behavior flags */ -	if (req->poll_update.update_events) { -		preq->poll.events &= ~0xffff; -		preq->poll.events |= req->poll_update.events & 0xffff; -		preq->poll.events |= IO_POLL_UNMASK; +		ret = preq ? 
-EALREADY : -ENOENT; +		goto out;  	} -	if (req->poll_update.update_user_data) -		preq->user_data = req->poll_update.new_user_data;  	spin_unlock(&ctx->completion_lock); -	/* complete update request, we're done with it */ -	io_req_complete(req, ret); - -	if (!completing) { -		ret = io_poll_add(preq, issue_flags); -		if (ret < 0) { -			req_set_fail(preq); -			io_req_complete(preq, ret); +	if (req->poll_update.update_events || req->poll_update.update_user_data) { +		/* only mask one event flags, keep behavior flags */ +		if (req->poll_update.update_events) { +			preq->poll.events &= ~0xffff; +			preq->poll.events |= req->poll_update.events & 0xffff; +			preq->poll.events |= IO_POLL_UNMASK;  		} -	} -	return 0; -} +		if (req->poll_update.update_user_data) +			preq->user_data = req->poll_update.new_user_data; -static void io_req_task_timeout(struct io_kiocb *req, bool *locked) -{ -	struct io_timeout_data *data = req->async_data; +		ret2 = io_poll_add(preq, issue_flags); +		/* successfully updated, don't complete poll request */ +		if (!ret2) +			goto out; +	} -	if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) +	req_set_fail(preq); +	preq->result = -ECANCELED; +	locked = !(issue_flags & IO_URING_F_UNLOCKED); +	io_req_task_complete(preq, &locked); +out: +	if (ret < 0)  		req_set_fail(req); -	io_req_complete_post(req, -ETIME, 0); +	/* complete update request, we're done with it */ +	__io_req_complete(req, issue_flags, ret, 0); +	return 0;  }  static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) @@ -5988,8 +6470,12 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)  		atomic_read(&req->ctx->cq_timeouts) + 1);  	spin_unlock_irqrestore(&ctx->timeout_lock, flags); -	req->io_task_work.func = io_req_task_timeout; -	io_req_task_work_add(req); +	if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) +		req_set_fail(req); + +	req->result = -ETIME; +	req->io_task_work.func = io_req_task_complete; +	io_req_task_work_add(req, false);  	return HRTIMER_NORESTART;  } @@ -6024,10 +6510,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)  	if (IS_ERR(req))  		return PTR_ERR(req); - -	req_set_fail(req); -	io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); -	io_put_req_deferred(req); +	io_req_task_queue_fail(req, -ECANCELED);  	return 0;  } @@ -6115,6 +6598,8 @@ static int io_timeout_remove_prep(struct io_kiocb *req,  			return -EINVAL;  		if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))  			return -EFAULT; +		if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) +			return -EINVAL;  	} else if (tr->flags) {  		/* timeout removal doesn't support flags */  		return -EINVAL; @@ -6316,16 +6801,21 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)  	WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);  	ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); -	if (ret != -ENOENT) -		return ret; +	/* +	 * Fall-through even for -EALREADY, as we may have poll armed +	 * that need unarming. 
+	 */ +	if (!ret) +		return 0;  	spin_lock(&ctx->completion_lock); +	ret = io_poll_cancel(ctx, sqe_addr, false); +	if (ret != -ENOENT) +		goto out; +  	spin_lock_irq(&ctx->timeout_lock);  	ret = io_timeout_cancel(ctx, sqe_addr);  	spin_unlock_irq(&ctx->timeout_lock); -	if (ret != -ENOENT) -		goto out; -	ret = io_poll_cancel(ctx, sqe_addr, false);  out:  	spin_unlock(&ctx->completion_lock);  	return ret; @@ -6493,6 +6983,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  		return io_symlinkat_prep(req, sqe);  	case IORING_OP_LINKAT:  		return io_linkat_prep(req, sqe); +	case IORING_OP_MSG_RING: +		return io_msg_ring_prep(req, sqe);  	}  	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6544,12 +7036,15 @@ static __cold void io_drain_req(struct io_kiocb *req)  	u32 seq = io_get_sequence(req);  	/* Still need defer if there is pending req in defer list. */ +	spin_lock(&ctx->completion_lock);  	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { +		spin_unlock(&ctx->completion_lock);  queue:  		ctx->drain_active = false;  		io_req_task_queue(req);  		return;  	} +	spin_unlock(&ctx->completion_lock);  	ret = io_req_prep_async(req);  	if (ret) { @@ -6571,7 +7066,7 @@ fail:  		goto queue;  	} -	trace_io_uring_defer(ctx, req, req->user_data); +	trace_io_uring_defer(ctx, req, req->user_data, req->opcode);  	de->req = req;  	de->seq = seq;  	list_add_tail(&de->list, &ctx->defer_list); @@ -6580,10 +7075,8 @@ fail:  static void io_clean_op(struct io_kiocb *req)  { -	if (req->flags & REQ_F_BUFFER_SELECTED) { -		kfree(req->kbuf); -		req->kbuf = NULL; -	} +	if (req->flags & REQ_F_BUFFER_SELECTED) +		io_put_kbuf_comp(req);  	if (req->flags & REQ_F_NEED_CLEANUP) {  		switch (req->opcode) { @@ -6633,6 +7126,10 @@ static void io_clean_op(struct io_kiocb *req)  			putname(req->hardlink.oldpath);  			putname(req->hardlink.newpath);  			break; +		case IORING_OP_STATX: +			if (req->statx.filename) +				putname(req->statx.filename); +			break;  		}  	}  	if ((req->flags & REQ_F_POLLED) && req->apoll) { @@ -6775,6 +7272,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)  	case IORING_OP_LINKAT:  		ret = io_linkat(req, issue_flags);  		break; +	case IORING_OP_MSG_RING: +		ret = io_msg_ring(req, issue_flags); +		break;  	default:  		ret = -EINVAL;  		break; @@ -6850,7 +7350,7 @@ static void io_wq_submit_work(struct io_wq_work *work)  			continue;  		} -		if (io_arm_poll_handler(req) == IO_APOLL_OK) +		if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)  			return;  		/* aborted or ready, in either case retry blocking */  		needs_poll = false; @@ -6907,7 +7407,7 @@ static struct file *io_file_get_normal(struct io_ring_ctx *ctx,  {  	struct file *file = fget(fd); -	trace_io_uring_file_get(ctx, fd); +	trace_io_uring_file_get(ctx, req, req->user_data, fd);  	/* we don't allow fixed io_uring files */  	if (file && unlikely(file->f_op == &io_uring_fops)) @@ -6965,7 +7465,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)  	spin_unlock_irqrestore(&ctx->timeout_lock, flags);  	req->io_task_work.func = io_req_task_link_timeout; -	io_req_task_work_add(req); +	io_req_task_work_add(req, false);  	return HRTIMER_NORESTART;  } @@ -6996,7 +7496,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)  {  	struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); -	switch (io_arm_poll_handler(req)) { +	switch (io_arm_poll_handler(req, 0)) {  	case IO_APOLL_READY:  		io_req_task_queue(req);  		break; 
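
/*
 * A standalone illustrative sketch, not kernel code: the ownership protocol
 * behind the new ->poll_refs field (io_poll_get_ownership() and
 * io_poll_check_events() in the hunks above). All names below are made up
 * for the example. Whoever bumps the reference count from zero owns the
 * request; concurrent wakeups only add references, and the owner keeps
 * re-running the handler until it manages to drop every reference at once.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define POLL_CANCEL_FLAG	(1u << 31)
#define POLL_REF_MASK		((1u << 20) - 1)

/* True if the caller took ownership, i.e. the ref part was zero before. */
static bool poll_get_ownership(atomic_uint *refs)
{
	return !(atomic_fetch_add(refs, 1) & POLL_REF_MASK);
}

static void poll_mark_cancelled(atomic_uint *refs)
{
	atomic_fetch_or(refs, POLL_CANCEL_FLAG);
}

/*
 * Owner-side loop: handle events, then try to release every reference seen
 * so far. If more wakeups arrived in the meantime the subtraction leaves a
 * non-zero count, ownership is retained, and another pass is needed.
 */
static void poll_owner_run(atomic_uint *refs, void (*handle)(void))
{
	unsigned int v;

	do {
		v = atomic_load(refs);
		if (v & POLL_CANCEL_FLAG)
			return;	/* cancelled; the real code tears down here */
		handle();
	} while (atomic_fetch_sub(refs, v & POLL_REF_MASK) - (v & POLL_REF_MASK));
}
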
@@ -7005,8 +7505,12 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)  		 * Queued up for async execution, worker will release  		 * submit reference when the iocb is actually submitted.  		 */ +		io_kbuf_recycle(req);  		io_queue_async_work(req, NULL);  		break; +	case IO_APOLL_OK: +		io_kbuf_recycle(req); +		break;  	}  	if (linked_timeout) @@ -7100,10 +7604,10 @@ static void io_init_req_drain(struct io_kiocb *req)  		 * If we need to drain a request in the middle of a link, drain  		 * the head request and the next request/link after the current  		 * link. Considering sequential execution of links, -		 * IOSQE_IO_DRAIN will be maintained for every request of our +		 * REQ_F_IO_DRAIN will be maintained for every request of our  		 * link.  		 */ -		head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; +		head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;  		ctx->drain_next = true;  	}  } @@ -7136,8 +7640,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,  		if ((sqe_flags & IOSQE_BUFFER_SELECT) &&  		    !io_op_defs[opcode].buffer_select)  			return -EOPNOTSUPP; -		if (sqe_flags & IOSQE_IO_DRAIN) +		if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) +			ctx->drain_disabled = true; +		if (sqe_flags & IOSQE_IO_DRAIN) { +			if (ctx->drain_disabled) +				return -EOPNOTSUPP;  			io_init_req_drain(req); +		}  	}  	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {  		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) @@ -7149,7 +7658,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,  		if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {  			ctx->drain_next = false;  			ctx->drain_active = true; -			req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; +			req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;  		}  	} @@ -7200,7 +7709,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,  	ret = io_init_req(ctx, req, sqe);  	if (unlikely(ret)) { -		trace_io_uring_req_failed(sqe, ret); +		trace_io_uring_req_failed(sqe, ctx, req, ret);  		/* fail even hard links since we don't submit */  		if (link->head) { @@ -7227,7 +7736,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,  	}  	/* don't need @sqe from now on */ -	trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, +	trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode,  				  req->flags, true,  				  ctx->flags & IORING_SETUP_SQPOLL); @@ -7370,8 +7879,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)  		}  		/* will complete beyond this point, count as submitted */  		submitted++; -		if (io_submit_sqe(ctx, req, sqe)) -			break; +		if (io_submit_sqe(ctx, req, sqe)) { +			/* +			 * Continue submitting even for sqe failure if the +			 * ring was setup with IORING_SETUP_SUBMIT_ALL +			 */ +			if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) +				break; +		}  	} while (submitted < nr);  	if (unlikely(submitted != nr)) { @@ -7438,7 +7953,13 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)  		    !(ctx->flags & IORING_SETUP_R_DISABLED))  			ret = io_submit_sqes(ctx, to_submit);  		mutex_unlock(&ctx->uring_lock); - +#ifdef CONFIG_NET_RX_BUSY_POLL +		spin_lock(&ctx->napi_lock); +		if (!list_empty(&ctx->napi_list) && +		    io_napi_busy_loop(&ctx->napi_list)) +			++ret; +		spin_unlock(&ctx->napi_lock); +#endif  		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))  			wake_up(&ctx->sqo_sq_wait);  		if (creds) @@ -7569,6 +8090,9 @@ struct io_wait_queue 
{  	struct io_ring_ctx *ctx;  	unsigned cq_tail;  	unsigned nr_timeouts; +#ifdef CONFIG_NET_RX_BUSY_POLL +	unsigned busy_poll_to; +#endif  };  static inline bool io_should_wake(struct io_wait_queue *iowq) @@ -7603,17 +8127,17 @@ static int io_run_task_work_sig(void)  {  	if (io_run_task_work())  		return 1; -	if (!signal_pending(current)) -		return 0;  	if (test_thread_flag(TIF_NOTIFY_SIGNAL))  		return -ERESTARTSYS; -	return -EINTR; +	if (task_sigpending(current)) +		return -EINTR; +	return 0;  }  /* when returns >0, the caller should retry */  static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,  					  struct io_wait_queue *iowq, -					  signed long *timeout) +					  ktime_t timeout)  {  	int ret; @@ -7625,10 +8149,92 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,  	if (test_bit(0, &ctx->check_cq_overflow))  		return 1; -	*timeout = schedule_timeout(*timeout); -	return !*timeout ? -ETIME : 1; +	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) +		return -ETIME; +	return 1;  } +#ifdef CONFIG_NET_RX_BUSY_POLL +static void io_adjust_busy_loop_timeout(struct timespec64 *ts, +					struct io_wait_queue *iowq) +{ +	unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll); +	struct timespec64 pollto = ns_to_timespec64(1000 * (s64)busy_poll_to); + +	if (timespec64_compare(ts, &pollto) > 0) { +		*ts = timespec64_sub(*ts, pollto); +		iowq->busy_poll_to = busy_poll_to; +	} else { +		u64 to = timespec64_to_ns(ts); + +		do_div(to, 1000); +		iowq->busy_poll_to = to; +		ts->tv_sec = 0; +		ts->tv_nsec = 0; +	} +} + +static inline bool io_busy_loop_timeout(unsigned long start_time, +					unsigned long bp_usec) +{ +	if (bp_usec) { +		unsigned long end_time = start_time + bp_usec; +		unsigned long now = busy_loop_current_time(); + +		return time_after(now, end_time); +	} +	return true; +} + +static bool io_busy_loop_end(void *p, unsigned long start_time) +{ +	struct io_wait_queue *iowq = p; + +	return signal_pending(current) || +	       io_should_wake(iowq) || +	       io_busy_loop_timeout(start_time, iowq->busy_poll_to); +} + +static void io_blocking_napi_busy_loop(struct list_head *napi_list, +				       struct io_wait_queue *iowq) +{ +	unsigned long start_time = +		list_is_singular(napi_list) ? 0 : +		busy_loop_current_time(); + +	do { +		if (list_is_singular(napi_list)) { +			struct napi_entry *ne = +				list_first_entry(napi_list, +						 struct napi_entry, list); + +			napi_busy_loop(ne->napi_id, io_busy_loop_end, iowq, +				       true, BUSY_POLL_BUDGET); +			io_check_napi_entry_timeout(ne); +			break; +		} +	} while (io_napi_busy_loop(napi_list) && +		 !io_busy_loop_end(iowq, start_time)); +} + +static void io_putback_napi_list(struct io_ring_ctx *ctx, +				 struct list_head *napi_list) +{ +	struct napi_entry *cne, *lne; + +	spin_lock(&ctx->napi_lock); +	list_for_each_entry(cne, &ctx->napi_list, list) +		list_for_each_entry(lne, napi_list, list) +			if (cne->napi_id == lne->napi_id) { +				list_del(&lne->list); +				kfree(lne); +				break; +			} +	list_splice(napi_list, &ctx->napi_list); +	spin_unlock(&ctx->napi_lock); +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ +  /*   * Wait until events become available, if we don't already have some. The   * application must reap them itself, as they reside on the shared cq ring. 
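
/*
 * A standalone sketch, not kernel code, of the arithmetic done by
 * io_adjust_busy_loop_timeout() above: the caller-supplied wait time is
 * split into a busy-poll budget (bounded by the net.core.busy_poll sysctl,
 * in microseconds) and the remainder that is handed to the sleeping wait.
 * Plain struct timespec is used here instead of the kernel's timespec64
 * helpers; names are illustrative only.
 */
#include <stdint.h>
#include <time.h>

struct wait_budget {
	unsigned long	busy_poll_usec;	/* how long to spin first */
	struct timespec	sleep_left;	/* what remains for the blocking wait */
};

static struct wait_budget split_wait_budget(struct timespec ts,
					    unsigned long busy_poll_usec)
{
	struct wait_budget wb = { 0, ts };
	int64_t total_ns = (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
	int64_t poll_ns = (int64_t)busy_poll_usec * 1000LL;

	if (total_ns > poll_ns) {
		/* long wait: spin for the full budget, sleep for the rest */
		wb.busy_poll_usec = busy_poll_usec;
		total_ns -= poll_ns;
		wb.sleep_left.tv_sec = total_ns / 1000000000LL;
		wb.sleep_left.tv_nsec = total_ns % 1000000000LL;
	} else {
		/* short wait: spend all of it busy polling, none sleeping */
		wb.busy_poll_usec = (unsigned long)(total_ns / 1000);
		wb.sleep_left.tv_sec = 0;
		wb.sleep_left.tv_nsec = 0;
	}
	return wb;
}
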
@@ -7639,8 +8245,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,  {  	struct io_wait_queue iowq;  	struct io_rings *rings = ctx->rings; -	signed long timeout = MAX_SCHEDULE_TIMEOUT; +	ktime_t timeout = KTIME_MAX;  	int ret; +#ifdef CONFIG_NET_RX_BUSY_POLL +	LIST_HEAD(local_napi_list); +#endif  	do {  		io_cqring_overflow_flush(ctx); @@ -7650,14 +8259,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,  			break;  	} while (1); -	if (uts) { -		struct timespec64 ts; - -		if (get_timespec64(&ts, uts)) -			return -EFAULT; -		timeout = timespec64_to_jiffies(&ts); -	} -  	if (sig) {  #ifdef CONFIG_COMPAT  		if (in_compat_syscall()) @@ -7671,6 +8272,30 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,  			return ret;  	} +#ifdef CONFIG_NET_RX_BUSY_POLL +	iowq.busy_poll_to = 0; +	if (!(ctx->flags & IORING_SETUP_SQPOLL)) { +		spin_lock(&ctx->napi_lock); +		list_splice_init(&ctx->napi_list, &local_napi_list); +		spin_unlock(&ctx->napi_lock); +	} +#endif +	if (uts) { +		struct timespec64 ts; + +		if (get_timespec64(&ts, uts)) +			return -EFAULT; +#ifdef CONFIG_NET_RX_BUSY_POLL +		if (!list_empty(&local_napi_list)) +			io_adjust_busy_loop_timeout(&ts, &iowq); +#endif +		timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); +	} +#ifdef CONFIG_NET_RX_BUSY_POLL +	else if (!list_empty(&local_napi_list)) +		iowq.busy_poll_to = READ_ONCE(sysctl_net_busy_poll); +#endif +  	init_waitqueue_func_entry(&iowq.wq, io_wake_function);  	iowq.wq.private = current;  	INIT_LIST_HEAD(&iowq.wq.entry); @@ -7679,6 +8304,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,  	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;  	trace_io_uring_cqring_wait(ctx, min_events); +#ifdef CONFIG_NET_RX_BUSY_POLL +	if (iowq.busy_poll_to) +		io_blocking_napi_busy_loop(&local_napi_list, &iowq); +	if (!list_empty(&local_napi_list)) +		io_putback_napi_list(ctx, &local_napi_list); +#endif  	do {  		/* if we can't even flush overflow, don't wait for more */  		if (!io_cqring_overflow_flush(ctx)) { @@ -7687,7 +8318,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,  		}  		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,  						TASK_INTERRUPTIBLE); -		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); +		ret = io_cqring_wait_schedule(ctx, &iowq, timeout);  		finish_wait(&ctx->cq_wait, &iowq.wq);  		cond_resched();  	} while (ret > 0); @@ -7741,10 +8372,15 @@ static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)  	struct io_ring_ctx *ctx = node->rsrc_data->ctx;  	unsigned long flags;  	bool first_add = false; +	unsigned long delay = HZ;  	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);  	node->done = true; +	/* if we are mid-quiesce then do not delay */ +	if (node->rsrc_data->quiesce) +		delay = 0; +  	while (!list_empty(&ctx->rsrc_ref_list)) {  		node = list_first_entry(&ctx->rsrc_ref_list,  					    struct io_rsrc_node, node); @@ -7757,10 +8393,10 @@ static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)  	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);  	if (first_add) -		mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); +		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);  } -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) +static struct io_rsrc_node *io_rsrc_node_alloc(void)  {  	struct io_rsrc_node *ref_node; @@ -7811,7 +8447,7 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)  {  	if (ctx->rsrc_backup_node)  		return 0; -	
ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); +	ctx->rsrc_backup_node = io_rsrc_node_alloc();  	return ctx->rsrc_backup_node ? 0 : -ENOMEM;  } @@ -7839,7 +8475,15 @@ static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,  		ret = wait_for_completion_interruptible(&data->done);  		if (!ret) {  			mutex_lock(&ctx->uring_lock); -			break; +			if (atomic_read(&data->refs) > 0) { +				/* +				 * it has been revived by another thread while +				 * we were unlocked +				 */ +				mutex_unlock(&ctx->uring_lock); +			} else { +				break; +			}  		}  		atomic_inc(&data->refs); @@ -8263,8 +8907,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)  			io_ring_submit_lock(ctx, lock_ring);  			spin_lock(&ctx->completion_lock); -			io_cqring_fill_event(ctx, prsrc->tag, 0, 0); -			ctx->cq_extra++; +			io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);  			io_commit_cqring(ctx);  			spin_unlock(&ctx->completion_lock);  			io_cqring_ev_posted(ctx); @@ -8655,8 +9298,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,  	if (unlikely(!tctx))  		return -ENOMEM; +	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX, +					 sizeof(struct file *), GFP_KERNEL); +	if (unlikely(!tctx->registered_rings)) { +		kfree(tctx); +		return -ENOMEM; +	} +  	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);  	if (unlikely(ret)) { +		kfree(tctx->registered_rings);  		kfree(tctx);  		return ret;  	} @@ -8665,6 +9316,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,  	if (IS_ERR(tctx->io_wq)) {  		ret = PTR_ERR(tctx->io_wq);  		percpu_counter_destroy(&tctx->inflight); +		kfree(tctx->registered_rings);  		kfree(tctx);  		return ret;  	} @@ -8676,6 +9328,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,  	task->io_uring = tctx;  	spin_lock_init(&tctx->task_lock);  	INIT_WQ_LIST(&tctx->task_list); +	INIT_WQ_LIST(&tctx->prior_task_list);  	init_task_work(&tctx->task_work, tctx_task_work);  	return 0;  } @@ -8688,6 +9341,7 @@ void __io_uring_free(struct task_struct *tsk)  	WARN_ON_ONCE(tctx->io_wq);  	WARN_ON_ONCE(tctx->cached_refs); +	kfree(tctx->registered_rings);  	percpu_counter_destroy(&tctx->inflight);  	kfree(tctx);  	tsk->io_uring = NULL; @@ -8847,10 +9501,9 @@ static void io_mem_free(void *ptr)  static void *io_mem_alloc(size_t size)  { -	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | -				__GFP_NORETRY | __GFP_ACCOUNT; +	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; -	return (void *) __get_free_pages(gfp_flags, get_order(size)); +	return (void *) __get_free_pages(gfp, get_order(size));  }  static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, @@ -9265,33 +9918,55 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,  	return done ? 
done : err;  } -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) +static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, +			       unsigned int eventfd_async)  { +	struct io_ev_fd *ev_fd;  	__s32 __user *fds = arg;  	int fd; -	if (ctx->cq_ev_fd) +	ev_fd = rcu_dereference_protected(ctx->io_ev_fd, +					lockdep_is_held(&ctx->uring_lock)); +	if (ev_fd)  		return -EBUSY;  	if (copy_from_user(&fd, fds, sizeof(*fds)))  		return -EFAULT; -	ctx->cq_ev_fd = eventfd_ctx_fdget(fd); -	if (IS_ERR(ctx->cq_ev_fd)) { -		int ret = PTR_ERR(ctx->cq_ev_fd); +	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); +	if (!ev_fd) +		return -ENOMEM; -		ctx->cq_ev_fd = NULL; +	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); +	if (IS_ERR(ev_fd->cq_ev_fd)) { +		int ret = PTR_ERR(ev_fd->cq_ev_fd); +		kfree(ev_fd);  		return ret;  	} - +	ev_fd->eventfd_async = eventfd_async; +	ctx->has_evfd = true; +	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);  	return 0;  } +static void io_eventfd_put(struct rcu_head *rcu) +{ +	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + +	eventfd_ctx_put(ev_fd->cq_ev_fd); +	kfree(ev_fd); +} +  static int io_eventfd_unregister(struct io_ring_ctx *ctx)  { -	if (ctx->cq_ev_fd) { -		eventfd_ctx_put(ctx->cq_ev_fd); -		ctx->cq_ev_fd = NULL; +	struct io_ev_fd *ev_fd; + +	ev_fd = rcu_dereference_protected(ctx->io_ev_fd, +					lockdep_is_held(&ctx->uring_lock)); +	if (ev_fd) { +		ctx->has_evfd = false; +		rcu_assign_pointer(ctx->io_ev_fd, NULL); +		call_rcu(&ev_fd->rcu, io_eventfd_put);  		return 0;  	} @@ -9300,11 +9975,28 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)  static void io_destroy_buffers(struct io_ring_ctx *ctx)  { -	struct io_buffer *buf; -	unsigned long index; +	int i; + +	for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) { +		struct list_head *list = &ctx->io_buffers[i]; + +		while (!list_empty(list)) { +			struct io_buffer_list *bl; + +			bl = list_first_entry(list, struct io_buffer_list, list); +			__io_remove_buffers(ctx, bl, -1U); +			list_del(&bl->list); +			kfree(bl); +		} +	} + +	while (!list_empty(&ctx->io_buffers_pages)) { +		struct page *page; -	xa_for_each(&ctx->io_buffers, index, buf) -		__io_remove_buffers(ctx, buf, index, -1U); +		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); +		list_del_init(&page->lru); +		__free_page(page); +	}  }  static void io_req_caches_free(struct io_ring_ctx *ctx) @@ -9335,6 +10027,18 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)  		wait_for_completion(&data->done);  } +static void io_flush_apoll_cache(struct io_ring_ctx *ctx) +{ +	struct async_poll *apoll; + +	while (!list_empty(&ctx->apoll_cache)) { +		apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, +						poll.wait.entry); +		list_del(&apoll->poll.wait.entry); +		kfree(apoll); +	} +} +  static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)  {  	io_sq_thread_finish(ctx); @@ -9356,8 +10060,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)  		__io_sqe_files_unregister(ctx);  	if (ctx->rings)  		__io_cqring_overflow_flush(ctx, true); -	mutex_unlock(&ctx->uring_lock);  	io_eventfd_unregister(ctx); +	io_flush_apoll_cache(ctx); +	mutex_unlock(&ctx->uring_lock);  	io_destroy_buffers(ctx);  	if (ctx->sq_creds)  		put_cred(ctx->sq_creds); @@ -9389,8 +10094,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)  	io_req_caches_free(ctx);  	if (ctx->hash_map)  		io_wq_put_hash(ctx->hash_map); +	io_free_napi_list(ctx);  	kfree(ctx->cancel_hash);  	
kfree(ctx->dummy_ubuf); +	kfree(ctx->io_buffers);  	kfree(ctx);  } @@ -9814,18 +10521,6 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)  	return percpu_counter_sum(&tctx->inflight);  } -static __cold void io_uring_drop_tctx_refs(struct task_struct *task) -{ -	struct io_uring_task *tctx = task->io_uring; -	unsigned int refs = tctx->cached_refs; - -	if (refs) { -		tctx->cached_refs = 0; -		percpu_counter_sub(&tctx->inflight, refs); -		put_task_struct_many(task, refs); -	} -} -  /*   * Find any io_uring ctx that this task has registered or done IO on, and cancel   * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. @@ -9883,10 +10578,14 @@ static __cold void io_uring_cancel_generic(bool cancel_all,  			schedule();  		finish_wait(&tctx->wait, &wait);  	} while (1); -	atomic_dec(&tctx->in_idle);  	io_uring_clean_tctx(tctx);  	if (cancel_all) { +		/* +		 * We shouldn't run task_works after cancel, so just leave +		 * ->in_idle set for normal exit. +		 */ +		atomic_dec(&tctx->in_idle);  		/* for exec all current's requests should be gone, kill tctx */  		__io_uring_free(current);  	} @@ -9897,6 +10596,139 @@ void __io_uring_cancel(bool cancel_all)  	io_uring_cancel_generic(cancel_all, NULL);  } +void io_uring_unreg_ringfd(void) +{ +	struct io_uring_task *tctx = current->io_uring; +	int i; + +	for (i = 0; i < IO_RINGFD_REG_MAX; i++) { +		if (tctx->registered_rings[i]) { +			fput(tctx->registered_rings[i]); +			tctx->registered_rings[i] = NULL; +		} +	} +} + +static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, +				     int start, int end) +{ +	struct file *file; +	int offset; + +	for (offset = start; offset < end; offset++) { +		offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); +		if (tctx->registered_rings[offset]) +			continue; + +		file = fget(fd); +		if (!file) { +			return -EBADF; +		} else if (file->f_op != &io_uring_fops) { +			fput(file); +			return -EOPNOTSUPP; +		} +		tctx->registered_rings[offset] = file; +		return offset; +	} + +	return -EBUSY; +} + +/* + * Register a ring fd to avoid fdget/fdput for each io_uring_enter() + * invocation. User passes in an array of struct io_uring_rsrc_update + * with ->data set to the ring_fd, and ->offset given for the desired + * index. If no index is desired, application may set ->offset == -1U + * and we'll find an available index. Returns number of entries + * successfully processed, or < 0 on error if none were processed. 
+ */ +static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, +			      unsigned nr_args) +{ +	struct io_uring_rsrc_update __user *arg = __arg; +	struct io_uring_rsrc_update reg; +	struct io_uring_task *tctx; +	int ret, i; + +	if (!nr_args || nr_args > IO_RINGFD_REG_MAX) +		return -EINVAL; + +	mutex_unlock(&ctx->uring_lock); +	ret = io_uring_add_tctx_node(ctx); +	mutex_lock(&ctx->uring_lock); +	if (ret) +		return ret; + +	tctx = current->io_uring; +	for (i = 0; i < nr_args; i++) { +		int start, end; + +		if (copy_from_user(®, &arg[i], sizeof(reg))) { +			ret = -EFAULT; +			break; +		} + +		if (reg.offset == -1U) { +			start = 0; +			end = IO_RINGFD_REG_MAX; +		} else { +			if (reg.offset >= IO_RINGFD_REG_MAX) { +				ret = -EINVAL; +				break; +			} +			start = reg.offset; +			end = start + 1; +		} + +		ret = io_ring_add_registered_fd(tctx, reg.data, start, end); +		if (ret < 0) +			break; + +		reg.offset = ret; +		if (copy_to_user(&arg[i], ®, sizeof(reg))) { +			fput(tctx->registered_rings[reg.offset]); +			tctx->registered_rings[reg.offset] = NULL; +			ret = -EFAULT; +			break; +		} +	} + +	return i ? i : ret; +} + +static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, +				unsigned nr_args) +{ +	struct io_uring_rsrc_update __user *arg = __arg; +	struct io_uring_task *tctx = current->io_uring; +	struct io_uring_rsrc_update reg; +	int ret = 0, i; + +	if (!nr_args || nr_args > IO_RINGFD_REG_MAX) +		return -EINVAL; +	if (!tctx) +		return 0; + +	for (i = 0; i < nr_args; i++) { +		if (copy_from_user(®, &arg[i], sizeof(reg))) { +			ret = -EFAULT; +			break; +		} +		if (reg.offset >= IO_RINGFD_REG_MAX) { +			ret = -EINVAL; +			break; +		} + +		reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); +		if (tctx->registered_rings[reg.offset]) { +			fput(tctx->registered_rings[reg.offset]); +			tctx->registered_rings[reg.offset] = NULL; +		} +	} + +	return i ? i : ret; +} +  static void *io_uring_validate_mmap_request(struct file *file,  					    loff_t pgoff, size_t sz)  { @@ -10027,12 +10859,28 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,  	io_run_task_work();  	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | -			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) +			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | +			       IORING_ENTER_REGISTERED_RING)))  		return -EINVAL; -	f = fdget(fd); -	if (unlikely(!f.file)) -		return -EBADF; +	/* +	 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we +	 * need only dereference our task private array to find it. +	 */ +	if (flags & IORING_ENTER_REGISTERED_RING) { +		struct io_uring_task *tctx = current->io_uring; + +		if (!tctx || fd >= IO_RINGFD_REG_MAX) +			return -EINVAL; +		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); +		f.file = tctx->registered_rings[fd]; +		if (unlikely(!f.file)) +			return -EBADF; +	} else { +		f = fdget(fd); +		if (unlikely(!f.file)) +			return -EBADF; +	}  	ret = -EOPNOTSUPP;  	if (unlikely(f.file->f_op != &io_uring_fops)) @@ -10106,7 +10954,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,  out:  	percpu_ref_put(&ctx->refs);  out_fput: -	fdput(f); +	if (!(flags & IORING_ENTER_REGISTERED_RING)) +		fdput(f);  	return submitted ? submitted : ret;  } @@ -10164,7 +11013,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,  	 * and sq_tail and cq_head are changed by userspace. But it's ok since  	 * we usually use these info when it is stuck.  	 
*/ -	seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask); +	seq_printf(m, "SqMask:\t0x%x\n", sq_mask);  	seq_printf(m, "SqHead:\t%u\n", sq_head);  	seq_printf(m, "SqTail:\t%u\n", sq_tail);  	seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); @@ -10473,7 +11322,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,  			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |  			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |  			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | -			IORING_FEAT_RSRC_TAGS; +			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP;  	if (copy_to_user(params, p, sizeof(*p))) {  		ret = -EFAULT; @@ -10524,7 +11373,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)  	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |  			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |  			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | -			IORING_SETUP_R_DISABLED)) +			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))  		return -EINVAL;  	return  io_uring_create(entries, &p, params); @@ -10874,61 +11723,6 @@ err:  	return ret;  } -static bool io_register_op_must_quiesce(int op) -{ -	switch (op) { -	case IORING_REGISTER_BUFFERS: -	case IORING_UNREGISTER_BUFFERS: -	case IORING_REGISTER_FILES: -	case IORING_UNREGISTER_FILES: -	case IORING_REGISTER_FILES_UPDATE: -	case IORING_REGISTER_PROBE: -	case IORING_REGISTER_PERSONALITY: -	case IORING_UNREGISTER_PERSONALITY: -	case IORING_REGISTER_FILES2: -	case IORING_REGISTER_FILES_UPDATE2: -	case IORING_REGISTER_BUFFERS2: -	case IORING_REGISTER_BUFFERS_UPDATE: -	case IORING_REGISTER_IOWQ_AFF: -	case IORING_UNREGISTER_IOWQ_AFF: -	case IORING_REGISTER_IOWQ_MAX_WORKERS: -		return false; -	default: -		return true; -	} -} - -static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx) -{ -	long ret; - -	percpu_ref_kill(&ctx->refs); - -	/* -	 * Drop uring mutex before waiting for references to exit. If another -	 * thread is currently inside io_uring_enter() it might need to grab the -	 * uring_lock to make progress. If we hold it here across the drain -	 * wait, then we can deadlock. It's safe to drop the mutex here, since -	 * no new references will come in after we've killed the percpu ref. 
-	 */ -	mutex_unlock(&ctx->uring_lock); -	do { -		ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ); -		if (ret) { -			ret = min(0L, ret); -			break; -		} - -		ret = io_run_task_work_sig(); -		io_req_caches_free(ctx); -	} while (ret >= 0); -	mutex_lock(&ctx->uring_lock); - -	if (ret) -		io_refs_resurrect(&ctx->refs, &ctx->ref_comp); -	return ret; -} -  static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,  			       void __user *arg, unsigned nr_args)  	__releases(ctx->uring_lock) @@ -10952,12 +11746,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,  			return -EACCES;  	} -	if (io_register_op_must_quiesce(opcode)) { -		ret = io_ctx_quiesce(ctx); -		if (ret) -			return ret; -	} -  	switch (opcode) {  	case IORING_REGISTER_BUFFERS:  		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); @@ -10981,17 +11769,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,  		ret = io_register_files_update(ctx, arg, nr_args);  		break;  	case IORING_REGISTER_EVENTFD: -	case IORING_REGISTER_EVENTFD_ASYNC:  		ret = -EINVAL;  		if (nr_args != 1)  			break; -		ret = io_eventfd_register(ctx, arg); -		if (ret) +		ret = io_eventfd_register(ctx, arg, 0); +		break; +	case IORING_REGISTER_EVENTFD_ASYNC: +		ret = -EINVAL; +		if (nr_args != 1)  			break; -		if (opcode == IORING_REGISTER_EVENTFD_ASYNC) -			ctx->eventfd_async = 1; -		else -			ctx->eventfd_async = 0; +		ret = io_eventfd_register(ctx, arg, 1);  		break;  	case IORING_UNREGISTER_EVENTFD:  		ret = -EINVAL; @@ -11058,16 +11845,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,  			break;  		ret = io_register_iowq_max_workers(ctx, arg);  		break; +	case IORING_REGISTER_RING_FDS: +		ret = io_ringfd_register(ctx, arg, nr_args); +		break; +	case IORING_UNREGISTER_RING_FDS: +		ret = io_ringfd_unregister(ctx, arg, nr_args); +		break;  	default:  		ret = -EINVAL;  		break;  	} -	if (io_register_op_must_quiesce(opcode)) { -		/* bring the ctx back to life */ -		percpu_ref_reinit(&ctx->refs); -		reinit_completion(&ctx->ref_comp); -	}  	return ret;  } @@ -11093,8 +11881,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,  	mutex_lock(&ctx->uring_lock);  	ret = __io_uring_register(ctx, opcode, arg, nr_args);  	mutex_unlock(&ctx->uring_lock); -	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, -							ctx->cq_ev_fd != NULL, ret); +	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);  out_fput:  	fdput(f);  	return ret;  |
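
/*
 * A userspace sketch, not a definitive interface description, of how an
 * application would use the IORING_REGISTER_RING_FDS /
 * IORING_ENTER_REGISTERED_RING additions above. It assumes uapi headers
 * that already carry this patch and uses raw syscalls; the helper names
 * here are made up for the example.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Register an existing ring fd; returns the registered index or -1. */
static int register_ring_fd(int ring_fd)
{
	struct io_uring_rsrc_update upd;

	memset(&upd, 0, sizeof(upd));
	upd.data = ring_fd;	/* ->data carries the fd to register */
	upd.offset = -1U;	/* -1U: let the kernel pick a free slot */

	/* return value is the number of entries processed (1 on success);
	 * the kernel writes the chosen slot back into ->offset */
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_RING_FDS,
		    &upd, 1) != 1)
		return -1;
	return (int)upd.offset;
}

/* Enter via the registered index instead of the raw ring fd, skipping the
 * per-call fdget()/fdput(). */
static int enter_registered(int index, unsigned int to_submit)
{
	return (int)syscall(__NR_io_uring_enter, index, to_submit, 0,
			    IORING_ENTER_REGISTERED_RING, NULL, 0);
}
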