Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r--	io_uring/io_uring.c	497
1 file changed, 261 insertions, 236 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3bca7a79efda..1b53a2ab0a27 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -95,6 +95,7 @@
 #include "timeout.h"
 #include "poll.h"
+#include "rw.h"
 #include "alloc_cache.h"
 
 #define IORING_MAX_ENTRIES	32768
@@ -145,8 +146,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 struct task_struct *task,
 					 bool cancel_all);
 
-static void io_dismantle_req(struct io_kiocb *req);
-static void io_clean_op(struct io_kiocb *req);
 static void io_queue_sqe(struct io_kiocb *req);
 static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
 static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
@@ -367,6 +366,39 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
 	return false;
 }
 
+static void io_clean_op(struct io_kiocb *req)
+{
+	if (req->flags & REQ_F_BUFFER_SELECTED) {
+		spin_lock(&req->ctx->completion_lock);
+		io_put_kbuf_comp(req);
+		spin_unlock(&req->ctx->completion_lock);
+	}
+
+	if (req->flags & REQ_F_NEED_CLEANUP) {
+		const struct io_cold_def *def = &io_cold_defs[req->opcode];
+
+		if (def->cleanup)
+			def->cleanup(req);
+	}
+	if ((req->flags & REQ_F_POLLED) && req->apoll) {
+		kfree(req->apoll->double_poll);
+		kfree(req->apoll);
+		req->apoll = NULL;
+	}
+	if (req->flags & REQ_F_INFLIGHT) {
+		struct io_uring_task *tctx = req->task->io_uring;
+
+		atomic_dec(&tctx->inflight_tracked);
+	}
+	if (req->flags & REQ_F_CREDS)
+		put_cred(req->creds);
+	if (req->flags & REQ_F_ASYNC_DATA) {
+		kfree(req->async_data);
+		req->async_data = NULL;
+	}
+	req->flags &= ~IO_REQ_CLEAN_FLAGS;
+}
+
 static inline void io_req_track_inflight(struct io_kiocb *req)
 {
 	if (!(req->flags & REQ_F_INFLIGHT)) {
@@ -423,8 +455,8 @@ static void io_prep_async_work(struct io_kiocb *req)
 	if (req->flags & REQ_F_FORCE_ASYNC)
 		req->work.flags |= IO_WQ_WORK_CONCURRENT;
 
-	if (req->file && !io_req_ffs_set(req))
-		req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT;
+	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
+		req->flags |= io_file_get_flags(req->file);
 
 	if (req->file && (req->flags & REQ_F_ISREG)) {
 		bool should_hash = def->hash_reg_file;
@@ -594,42 +626,18 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 }
 
 static inline void __io_cq_lock(struct io_ring_ctx *ctx)
-	__acquires(ctx->completion_lock)
 {
 	if (!ctx->task_complete)
 		spin_lock(&ctx->completion_lock);
 }
 
-static inline void __io_cq_unlock(struct io_ring_ctx *ctx)
-{
-	if (!ctx->task_complete)
-		spin_unlock(&ctx->completion_lock);
-}
-
 static inline void io_cq_lock(struct io_ring_ctx *ctx)
 	__acquires(ctx->completion_lock)
 {
 	spin_lock(&ctx->completion_lock);
 }
 
-static inline void io_cq_unlock(struct io_ring_ctx *ctx)
-	__releases(ctx->completion_lock)
-{
-	spin_unlock(&ctx->completion_lock);
-}
-
-/* keep it inlined for io_submit_flush_completions() */
 static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
-	__releases(ctx->completion_lock)
-{
-	io_commit_cqring(ctx);
-	__io_cq_unlock(ctx);
-	io_commit_cqring_flush(ctx);
-	io_cqring_wake(ctx);
-}
-
-static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx)
-	__releases(ctx->completion_lock)
 {
 	io_commit_cqring(ctx);
@@ -641,13 +649,13 @@ static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx)
 		 */
 		io_commit_cqring_flush(ctx);
 	} else {
-		__io_cq_unlock(ctx);
+		spin_unlock(&ctx->completion_lock);
 		io_commit_cqring_flush(ctx);
 		io_cqring_wake(ctx);
 	}
 }
 
-void io_cq_unlock_post(struct io_ring_ctx *ctx)
+static void io_cq_unlock_post(struct io_ring_ctx *ctx)
 	__releases(ctx->completion_lock)
 {
 	io_commit_cqring(ctx);
@@ -662,10 +670,10 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
 	struct io_overflow_cqe *ocqe;
 	LIST_HEAD(list);
 
-	io_cq_lock(ctx);
+	spin_lock(&ctx->completion_lock);
 	list_splice_init(&ctx->cq_overflow_list, &list);
 	clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
-	io_cq_unlock(ctx);
+	spin_unlock(&ctx->completion_lock);
 
 	while (!list_empty(&list)) {
 		ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
@@ -722,29 +730,29 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 }
 
 /* can be called by any task */
-static void io_put_task_remote(struct task_struct *task, int nr)
+static void io_put_task_remote(struct task_struct *task)
 {
 	struct io_uring_task *tctx = task->io_uring;
 
-	percpu_counter_sub(&tctx->inflight, nr);
+	percpu_counter_sub(&tctx->inflight, 1);
 	if (unlikely(atomic_read(&tctx->in_cancel)))
 		wake_up(&tctx->wait);
-	put_task_struct_many(task, nr);
+	put_task_struct(task);
 }
 
 /* used by a task to put its own references */
-static void io_put_task_local(struct task_struct *task, int nr)
+static void io_put_task_local(struct task_struct *task)
 {
-	task->io_uring->cached_refs += nr;
+	task->io_uring->cached_refs++;
 }
 
 /* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task, int nr)
+static inline void io_put_task(struct task_struct *task)
 {
 	if (likely(task == current))
-		io_put_task_local(task, nr);
+		io_put_task_local(task);
 	else
-		io_put_task_remote(task, nr);
+		io_put_task_remote(task);
 }
 
 void io_task_refs_refill(struct io_uring_task *tctx)
@@ -934,20 +942,19 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
 	return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
 }
 
-bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags,
+bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags,
 		bool allow_overflow)
 {
+	struct io_ring_ctx *ctx = req->ctx;
+	u64 user_data = req->cqe.user_data;
 	struct io_uring_cqe *cqe;
-	unsigned int length;
 
 	if (!defer)
 		return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow);
 
-	length = ARRAY_SIZE(ctx->submit_state.cqes);
-
 	lockdep_assert_held(&ctx->uring_lock);
 
-	if (ctx->submit_state.cqes_count == length) {
+	if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->submit_state.cqes)) {
 		__io_cq_lock(ctx);
 		__io_flush_post_cqes(ctx);
 		/* no need to flush - flush is deferred */
@@ -991,14 +998,18 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
 			}
 		}
 		io_put_kbuf_comp(req);
-		io_dismantle_req(req);
+		if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
+			io_clean_op(req);
+		if (!(req->flags & REQ_F_FIXED_FILE))
+			io_put_file(req->file);
+
 		rsrc_node = req->rsrc_node;
 		/*
 		 * Selected buffer deallocation in io_clean_op() assumes that
 		 * we don't hold ->completion_lock. Clean them here to avoid
 		 * deadlocks.
 		 */
-		io_put_task_remote(req->task, 1);
+		io_put_task_remote(req->task);
 		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
 		ctx->locked_free_nr++;
 	}
@@ -1111,36 +1122,13 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 	return true;
 }
 
-static inline void io_dismantle_req(struct io_kiocb *req)
-{
-	unsigned int flags = req->flags;
-
-	if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
-		io_clean_op(req);
-	if (!(flags & REQ_F_FIXED_FILE))
-		io_put_file(req->file);
-}
-
-static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts)
-{
-	struct io_ring_ctx *ctx = req->ctx;
-
-	if (req->rsrc_node) {
-		io_tw_lock(ctx, ts);
-		io_put_rsrc_node(ctx, req->rsrc_node);
-	}
-	io_dismantle_req(req);
-	io_put_task_remote(req->task, 1);
-
-	spin_lock(&ctx->completion_lock);
-	wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
-	ctx->locked_free_nr++;
-	spin_unlock(&ctx->completion_lock);
-}
-
 __cold void io_free_req(struct io_kiocb *req)
 {
-	req->io_task_work.func = io_free_req_tw;
+	/* refs were already put, restore them for io_req_task_complete() */
+	req->flags &= ~REQ_F_REFCOUNT;
+	/* we only want to free it, don't post CQEs */
+	req->flags |= REQ_F_CQE_SKIP;
+	req->io_task_work.func = io_req_task_complete;
 	io_req_task_work_add(req);
 }
 
@@ -1205,7 +1193,9 @@ static unsigned int handle_tw_list(struct llist_node *node,
 			ts->locked = mutex_trylock(&(*ctx)->uring_lock);
 			percpu_ref_get(&(*ctx)->refs);
 		}
-		req->io_task_work.func(req, ts);
+		INDIRECT_CALL_2(req->io_task_work.func,
+				io_poll_task_func, io_req_rw_complete,
+				req, ts);
 		node = next;
 		count++;
 		if (unlikely(need_resched())) {
@@ -1303,7 +1293,7 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx)
 	}
 }
 
-static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
+static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned nr_wait, nr_tw, nr_tw_prev;
@@ -1354,19 +1344,11 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
 	wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
 }
 
-void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
+static void io_req_normal_work_add(struct io_kiocb *req)
 {
 	struct io_uring_task *tctx = req->task->io_uring;
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (!(flags & IOU_F_TWQ_FORCE_NORMAL) &&
-	    (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
-		rcu_read_lock();
-		io_req_local_work_add(req, flags);
-		rcu_read_unlock();
-		return;
-	}
-
 	/* task_work already pending, we're done */
 	if (!llist_add(&req->io_task_work.node, &tctx->task_list))
 		return;
@@ -1380,6 +1362,17 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
 	io_fallback_tw(tctx);
 }
 
+void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
+{
+	if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
+		rcu_read_lock();
+		io_req_local_work_add(req, flags);
+		rcu_read_unlock();
+	} else {
+		io_req_normal_work_add(req);
+	}
+}
+
 static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 {
 	struct llist_node *node;
@@ -1390,7 +1383,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 						    io_task_work.node);
 
 		node = node->next;
-		__io_req_task_work_add(req, IOU_F_TWQ_FORCE_NORMAL);
+		io_req_normal_work_add(req);
 	}
 }
 
@@ -1405,13 +1398,19 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
 		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
 again:
-	node = io_llist_xchg(&ctx->work_llist, NULL);
+	/*
+	 * llists are in reverse order, flip it back the right way before
+	 * running the pending items.
+	 */
+	node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL));
 	while (node) {
 		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
 		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-		req->io_task_work.func(req, ts);
+		INDIRECT_CALL_2(req->io_task_work.func,
+				io_poll_task_func, io_req_rw_complete,
+				req, ts);
 		ret++;
 		node = next;
 	}
@@ -1498,9 +1497,6 @@ void io_queue_next(struct io_kiocb *req)
 void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
 	__must_hold(&ctx->uring_lock)
 {
-	struct task_struct *task = NULL;
-	int task_refs = 0;
-
 	do {
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    comp_list);
@@ -1530,19 +1526,10 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
 
 		io_req_put_rsrc_locked(req, ctx);
 
-		if (req->task != task) {
-			if (task)
-				io_put_task(task, task_refs);
-			task = req->task;
-			task_refs = 0;
-		}
-		task_refs++;
+		io_put_task(req->task);
 		node = req->comp_list.next;
 		io_req_add_to_cache(req, ctx);
 	} while (node);
-
-	if (task)
-		io_put_task(task, task_refs);
 }
 
 static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
@@ -1570,7 +1557,7 @@
 			}
 		}
 	}
-	__io_cq_unlock_post_flush(ctx);
+	__io_cq_unlock_post(ctx);
 
 	if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
 		io_free_batch_list(ctx, state->compl_reqs.first);
@@ -1578,22 +1565,6 @@
 	}
 }
 
-/*
- * Drop reference to request, return next in chain (if there is one) if this
- * was the last reference to this request.
- */
-static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
-{
-	struct io_kiocb *nxt = NULL;
-
-	if (req_ref_put_and_test(req)) {
-		if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
-			nxt = io_req_find_next(req);
-		io_free_req(req);
-	}
-	return nxt;
-}
-
 static unsigned io_cqring_events(struct io_ring_ctx *ctx)
 {
 	/* See comment at the top of this file */
@@ -1758,54 +1729,14 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
 	}
 }
 
-static bool io_bdev_nowait(struct block_device *bdev)
-{
-	return !bdev || bdev_nowait(bdev);
-}
-
-/*
- * If we tracked the file through the SCM inflight mechanism, we could support
- * any file. For now, just ensure that anything potentially problematic is done
- * inline.
- */
-static bool __io_file_supports_nowait(struct file *file, umode_t mode)
-{
-	if (S_ISBLK(mode)) {
-		if (IS_ENABLED(CONFIG_BLOCK) &&
-		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
-			return true;
-		return false;
-	}
-	if (S_ISSOCK(mode))
-		return true;
-	if (S_ISREG(mode)) {
-		if (IS_ENABLED(CONFIG_BLOCK) &&
-		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
-		    !io_is_uring_fops(file))
-			return true;
-		return false;
-	}
-
-	/* any ->read/write should understand O_NONBLOCK */
-	if (file->f_flags & O_NONBLOCK)
-		return true;
-	return file->f_mode & FMODE_NOWAIT;
-}
-
-/*
- * If we tracked the file through the SCM inflight mechanism, we could support
- * any file. For now, just ensure that anything potentially problematic is done
- * inline.
- */
 unsigned int io_file_get_flags(struct file *file)
 {
-	umode_t mode = file_inode(file)->i_mode;
 	unsigned int res = 0;
 
-	if (S_ISREG(mode))
-		res |= FFS_ISREG;
-	if (__io_file_supports_nowait(file, mode))
-		res |= FFS_NOWAIT;
+	if (S_ISREG(file_inode(file)->i_mode))
+		res |= REQ_F_ISREG;
+	if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT))
+		res |= REQ_F_SUPPORT_NOWAIT;
 	return res;
 }
 
@@ -1891,39 +1822,6 @@ queue:
 	spin_unlock(&ctx->completion_lock);
 }
 
-static void io_clean_op(struct io_kiocb *req)
-{
-	if (req->flags & REQ_F_BUFFER_SELECTED) {
-		spin_lock(&req->ctx->completion_lock);
-		io_put_kbuf_comp(req);
-		spin_unlock(&req->ctx->completion_lock);
-	}
-
-	if (req->flags & REQ_F_NEED_CLEANUP) {
-		const struct io_cold_def *def = &io_cold_defs[req->opcode];
-
-		if (def->cleanup)
-			def->cleanup(req);
-	}
-	if ((req->flags & REQ_F_POLLED) && req->apoll) {
-		kfree(req->apoll->double_poll);
-		kfree(req->apoll);
-		req->apoll = NULL;
-	}
-	if (req->flags & REQ_F_INFLIGHT) {
-		struct io_uring_task *tctx = req->task->io_uring;
-
-		atomic_dec(&tctx->inflight_tracked);
-	}
-	if (req->flags & REQ_F_CREDS)
-		put_cred(req->creds);
-	if (req->flags & REQ_F_ASYNC_DATA) {
-		kfree(req->async_data);
-		req->async_data = NULL;
-	}
-	req->flags &= ~IO_REQ_CLEAN_FLAGS;
-}
-
 static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
			   unsigned int issue_flags)
 {
@@ -1986,9 +1884,14 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
 struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_kiocb *nxt = NULL;
 
-	req = io_put_req_find_next(req);
-	return req ? &req->work : NULL;
+	if (req_ref_put_and_test(req)) {
+		if (req->flags & IO_REQ_LINK_FLAGS)
+			nxt = io_req_find_next(req);
+		io_free_req(req);
+	}
+	return nxt ? &nxt->work : NULL;
 }
 
 void io_wq_submit_work(struct io_wq_work *work)
@@ -2060,19 +1963,17 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 				      unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_fixed_file *slot;
 	struct file *file = NULL;
-	unsigned long file_ptr;
 
 	io_ring_submit_lock(ctx, issue_flags);
 
 	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
 		goto out;
 	fd = array_index_nospec(fd, ctx->nr_user_files);
-	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
-	file = (struct file *) (file_ptr & FFS_MASK);
-	file_ptr &= ~FFS_MASK;
-	/* mask in overlapping REQ_F and FFS bits */
-	req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
+	slot = io_fixed_file_slot(&ctx->file_table, fd);
+	file = io_slot_file(slot);
+	req->flags |= io_slot_flags(slot);
 	io_req_set_rsrc_node(req, ctx, 0);
 out:
 	io_ring_submit_unlock(ctx, issue_flags);
@@ -2709,11 +2610,96 @@ static void io_mem_free(void *ptr)
 		free_compound_page(page);
 }
 
+static void io_pages_free(struct page ***pages, int npages)
+{
+	struct page **page_array;
+	int i;
+
+	if (!pages)
+		return;
+	page_array = *pages;
+	for (i = 0; i < npages; i++)
+		unpin_user_page(page_array[i]);
+	kvfree(page_array);
+	*pages = NULL;
+}
+
+static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
+			    unsigned long uaddr, size_t size)
+{
+	struct page **page_array;
+	unsigned int nr_pages;
+	int ret;
+
+	*npages = 0;
+
+	if (uaddr & (PAGE_SIZE - 1) || !size)
+		return ERR_PTR(-EINVAL);
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (nr_pages > USHRT_MAX)
+		return ERR_PTR(-EINVAL);
+	page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!page_array)
+		return ERR_PTR(-ENOMEM);
+
+	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+					page_array);
+	if (ret != nr_pages) {
+err:
+		io_pages_free(&page_array, ret > 0 ? ret : 0);
+		return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
+	}
+	/*
+	 * Should be a single page. If the ring is small enough that we can
+	 * use a normal page, that is fine. If we need multiple pages, then
+	 * userspace should use a huge page. That's the only way to guarantee
+	 * that we get contigious memory, outside of just being lucky or
+	 * (currently) having low memory fragmentation.
+	 */
+	if (page_array[0] != page_array[ret - 1])
+		goto err;
+	*pages = page_array;
+	*npages = nr_pages;
+	return page_to_virt(page_array[0]);
+}
+
+static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
+			  size_t size)
+{
+	return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
+				size);
+}
+
+static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
+			 size_t size)
+{
+	return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
+				size);
+}
+
+static void io_rings_free(struct io_ring_ctx *ctx)
+{
+	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
+		io_mem_free(ctx->rings);
+		io_mem_free(ctx->sq_sqes);
+		ctx->rings = NULL;
+		ctx->sq_sqes = NULL;
+	} else {
+		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
+		io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
+	}
+}
+
 static void *io_mem_alloc(size_t size)
 {
 	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+	void *ret;
 
-	return (void *) __get_free_pages(gfp, get_order(size));
+	ret = (void *) __get_free_pages(gfp, get_order(size));
+	if (ret)
+		return ret;
+	return ERR_PTR(-ENOMEM);
 }
 
 static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
@@ -2869,8 +2855,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		mmdrop(ctx->mm_account);
 		ctx->mm_account = NULL;
 	}
-	io_mem_free(ctx->rings);
-	io_mem_free(ctx->sq_sqes);
+	io_rings_free(ctx);
 
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
@@ -3050,7 +3035,18 @@ static __cold void io_ring_exit_work(struct work_struct *work)
 			/* there is little hope left, don't run it too often */
 			interval = HZ * 60;
 		}
-	} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
+		/*
+		 * This is really an uninterruptible wait, as it has to be
+		 * complete. But it's also run from a kworker, which doesn't
+		 * take signals, so it's fine to make it interruptible. This
+		 * avoids scenarios where we knowingly can wait much longer
+		 * on completions, for example if someone does a SIGSTOP on
+		 * a task that needs to finish task_work to make this loop
+		 * complete. That's a synthetic situation that should not
+		 * cause a stuck task backtrace, and hence a potential panic
+		 * on stuck tasks if that is enabled.
+		 */
+	} while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
 
 	init_completion(&exit.completion);
 	init_task_work(&exit.task_work, io_tctx_exit_cb);
@@ -3074,7 +3070,12 @@ static __cold void io_ring_exit_work(struct work_struct *work)
 			continue;
 
 		mutex_unlock(&ctx->uring_lock);
-		wait_for_completion(&exit.completion);
+		/*
+		 * See comment above for
+		 * wait_for_completion_interruptible_timeout() on why this
+		 * wait is marked as interruptible.
+		 */
+		wait_for_completion_interruptible(&exit.completion);
 		mutex_lock(&ctx->uring_lock);
 	}
 	mutex_unlock(&ctx->uring_lock);
@@ -3348,6 +3349,10 @@ static void *io_uring_validate_mmap_request(struct file *file,
 	struct page *page;
 	void *ptr;
 
+	/* Don't allow mmap if the ring was setup without it */
+	if (ctx->flags & IORING_SETUP_NO_MMAP)
+		return ERR_PTR(-EINVAL);
+
 	switch (offset & IORING_OFF_MMAP_MASK) {
 	case IORING_OFF_SQ_RING:
 	case IORING_OFF_CQ_RING:
@@ -3673,6 +3678,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 {
 	struct io_rings *rings;
 	size_t size, sq_array_offset;
+	void *ptr;
 
 	/* make sure these are sane, as we already accounted them */
 	ctx->sq_entries = p->sq_entries;
@@ -3682,9 +3688,13 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	if (size == SIZE_MAX)
 		return -EOVERFLOW;
 
-	rings = io_mem_alloc(size);
-	if (!rings)
-		return -ENOMEM;
+	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+		rings = io_mem_alloc(size);
+	else
+		rings = io_rings_map(ctx, p->cq_off.user_addr, size);
+
+	if (IS_ERR(rings))
+		return PTR_ERR(rings);
 
 	ctx->rings = rings;
 	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
@@ -3698,34 +3708,31 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	else
 		size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
 	if (size == SIZE_MAX) {
-		io_mem_free(ctx->rings);
-		ctx->rings = NULL;
+		io_rings_free(ctx);
 		return -EOVERFLOW;
 	}
 
-	ctx->sq_sqes = io_mem_alloc(size);
-	if (!ctx->sq_sqes) {
-		io_mem_free(ctx->rings);
-		ctx->rings = NULL;
-		return -ENOMEM;
+	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+		ptr = io_mem_alloc(size);
+	else
+		ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
+
+	if (IS_ERR(ptr)) {
+		io_rings_free(ctx);
+		return PTR_ERR(ptr);
 	}
 
+	ctx->sq_sqes = ptr;
 	return 0;
 }
 
-static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
+static int io_uring_install_fd(struct file *file)
 {
-	int ret, fd;
+	int fd;
 
 	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
 	if (fd < 0)
 		return fd;
-
-	ret = __io_uring_add_tctx_node(ctx);
-	if (ret) {
-		put_unused_fd(fd);
-		return ret;
-	}
 	fd_install(fd, file);
 	return fd;
 }
@@ -3765,6 +3772,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 				  struct io_uring_params __user *params)
 {
 	struct io_ring_ctx *ctx;
+	struct io_uring_task *tctx;
 	struct file *file;
 	int ret;
 
@@ -3776,6 +3784,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 		entries = IORING_MAX_ENTRIES;
 	}
 
+	if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
+	    && !(p->flags & IORING_SETUP_NO_MMAP))
+		return -EINVAL;
+
 	/*
 	 * Use twice as many entries for the CQ ring. It's possible for the
 	 * application to drive a higher depth than the size of the SQ ring,
@@ -3887,7 +3899,6 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	if (ret)
 		goto err;
 
-	memset(&p->sq_off, 0, sizeof(p->sq_off));
 	p->sq_off.head = offsetof(struct io_rings, sq.head);
 	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
 	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
@@ -3895,8 +3906,10 @@
 	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
 	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
 	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
+	p->sq_off.resv1 = 0;
+	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+		p->sq_off.user_addr = 0;
 
-	memset(&p->cq_off, 0, sizeof(p->cq_off));
 	p->cq_off.head = offsetof(struct io_rings, cq.head);
 	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
 	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
@@ -3904,6 +3917,9 @@
 	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
 	p->cq_off.cqes = offsetof(struct io_rings, cqes);
 	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
+	p->cq_off.resv1 = 0;
+	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+		p->cq_off.user_addr = 0;
 
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
@@ -3928,22 +3944,30 @@
 		goto err;
 	}
 
+	ret = __io_uring_add_tctx_node(ctx);
+	if (ret)
+		goto err_fput;
+	tctx = current->io_uring;
+
 	/*
 	 * Install ring fd as the very last thing, so we don't risk someone
 	 * having closed it before we finish setup
 	 */
-	ret = io_uring_install_fd(ctx, file);
-	if (ret < 0) {
-		/* fput will clean it up */
-		fput(file);
-		return ret;
-	}
+	if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
+		ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX);
+	else
+		ret = io_uring_install_fd(file);
+	if (ret < 0)
+		goto err_fput;
 
 	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
 	return ret;
 err:
 	io_ring_ctx_wait_and_kill(ctx);
 	return ret;
+err_fput:
+	fput(file);
+	return ret;
 }
 
 /*
@@ -3969,7 +3993,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
 			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
-			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN))
+			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
+			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
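
The hunks above add IORING_SETUP_NO_MMAP (application-supplied ring and SQE memory passed in via cq_off.user_addr and sq_off.user_addr) and IORING_SETUP_REGISTERED_FD_ONLY. What follows is a minimal userspace sketch, not part of the patch, of how the NO_MMAP path is driven. It assumes uapi headers new enough to carry the flag and the user_addr fields, keeps the ring tiny so each region fits in one page (per the __io_uaddr_map() comment, larger rings would need huge-page backing to stay physically contiguous), and omits most error handling.

/* user_ring_mem.c: hypothetical example, build against 6.5+ uapi headers */
#include <linux/io_uring.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct io_uring_params p;
	void *rings, *sqes;
	int ring_fd;

	/*
	 * One page for the SQ/CQ rings and one for the SQE array. With only
	 * 4 SQ entries both regions fit in a single page, which satisfies
	 * the single-compound-page check in __io_uaddr_map(); bigger rings
	 * would need MAP_HUGETLB (or similar) instead.
	 */
	rings = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		     MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	sqes = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (rings == MAP_FAILED || sqes == MAP_FAILED)
		return 1;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_NO_MMAP;
	/* with NO_MMAP these fields are inputs, consumed by io_rings_map()/io_sqes_map() */
	p.cq_off.user_addr = (uintptr_t) rings;
	p.sq_off.user_addr = (uintptr_t) sqes;

	ring_fd = syscall(__NR_io_uring_setup, 4, &p);
	if (ring_fd < 0) {
		perror("io_uring_setup");
		return 1;
	}

	/*
	 * No mmap(IORING_OFF_SQ_RING/...) here: io_uring_validate_mmap_request()
	 * now rejects mmap on a NO_MMAP ring, so the application keeps using
	 * the memory it allocated above (it must stay mapped for the ring's
	 * lifetime).
	 */
	printf("ring fd %d: %u sq entries, %u cq entries\n",
	       ring_fd, p.sq_entries, p.cq_entries);
	return 0;
}

Adding IORING_SETUP_REGISTERED_FD_ONLY on top (which, per the io_uring_create() change above, requires NO_MMAP) would skip the fd install entirely and return a registered-ring index from io_ring_add_registered_file() instead of a regular file descriptor.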