Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--  fs/io_uring.c | 361
1 file changed, 250 insertions(+), 111 deletions(-)
| diff --git a/fs/io_uring.c b/fs/io_uring.c index 16fb7436043c..bc18af5e0a93 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -403,7 +403,6 @@ struct io_ring_ctx {  		struct wait_queue_head	cq_wait;  		unsigned		cq_extra;  		atomic_t		cq_timeouts; -		struct fasync_struct	*cq_fasync;  		unsigned		cq_last_tm_flush;  	} ____cacheline_aligned_in_smp; @@ -457,6 +456,8 @@ struct io_ring_ctx {  		struct work_struct		exit_work;  		struct list_head		tctx_list;  		struct completion		ref_comp; +		u32				iowq_limits[2]; +		bool				iowq_limits_set;  	};  }; @@ -502,6 +503,7 @@ struct io_poll_update {  struct io_close {  	struct file			*file;  	int				fd; +	u32				file_slot;  };  struct io_timeout_data { @@ -712,6 +714,7 @@ struct io_async_rw {  	struct iovec			fast_iov[UIO_FASTIOV];  	const struct iovec		*free_iovec;  	struct iov_iter			iter; +	struct iov_iter_state		iter_state;  	size_t				bytes_done;  	struct wait_page_queue		wpq;  }; @@ -735,7 +738,6 @@ enum {  	REQ_F_BUFFER_SELECTED_BIT,  	REQ_F_COMPLETE_INLINE_BIT,  	REQ_F_REISSUE_BIT, -	REQ_F_DONT_REISSUE_BIT,  	REQ_F_CREDS_BIT,  	REQ_F_REFCOUNT_BIT,  	REQ_F_ARM_LTIMEOUT_BIT, @@ -782,8 +784,6 @@ enum {  	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),  	/* caller should reissue async */  	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT), -	/* don't attempt request reissue, see io_rw_reissue() */ -	REQ_F_DONT_REISSUE	= BIT(REQ_F_DONT_REISSUE_BIT),  	/* supports async reads */  	REQ_F_NOWAIT_READ	= BIT(REQ_F_NOWAIT_READ_BIT),  	/* supports async writes */ @@ -1100,6 +1100,8 @@ static int io_req_prep_async(struct io_kiocb *req);  static int io_install_fixed_file(struct io_kiocb *req, struct file *file,  				 unsigned int issue_flags, u32 slot_index); +static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); +  static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);  static struct kmem_cache *req_cachep; @@ -1368,11 +1370,6 @@ static void io_req_track_inflight(struct io_kiocb *req)  	}  } -static inline void io_unprep_linked_timeout(struct io_kiocb *req) -{ -	req->flags &= ~REQ_F_LINK_TIMEOUT; -} -  static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)  {  	if (WARN_ON_ONCE(!req->link)) @@ -1613,10 +1610,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)  		wake_up(&ctx->sq_data->wait);  	if (io_should_trigger_evfd(ctx))  		eventfd_signal(ctx->cq_ev_fd, 1); -	if (waitqueue_active(&ctx->poll_wait)) { +	if (waitqueue_active(&ctx->poll_wait))  		wake_up_interruptible(&ctx->poll_wait); -		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); -	}  }  static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) @@ -1630,10 +1625,8 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)  	}  	if (io_should_trigger_evfd(ctx))  		eventfd_signal(ctx->cq_ev_fd, 1); -	if (waitqueue_active(&ctx->poll_wait)) { +	if (waitqueue_active(&ctx->poll_wait))  		wake_up_interruptible(&ctx->poll_wait); -		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); -	}  }  /* Returns true if there are no backlogged entries after the flush */ @@ -2444,13 +2437,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,  		req = list_first_entry(done, struct io_kiocb, inflight_entry);  		list_del(&req->inflight_entry); -		if (READ_ONCE(req->result) == -EAGAIN && -		    !(req->flags & REQ_F_DONT_REISSUE)) { -			req->iopoll_completed = 0; -			io_req_task_queue_reissue(req); -			continue; -		} -  		__io_cqring_fill_event(ctx, req->user_data, req->result,  					io_put_rw_kbuf(req));  		(*nr_events)++; 
@@ -2613,8 +2599,7 @@ static bool io_resubmit_prep(struct io_kiocb *req)  	if (!rw)  		return !io_req_prep_async(req); -	/* may have left rw->iter inconsistent on -EIOCBQUEUED */ -	iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter)); +	iov_iter_restore(&rw->iter, &rw->iter_state);  	return true;  } @@ -2714,10 +2699,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)  	if (kiocb->ki_flags & IOCB_WRITE)  		kiocb_end_write(req);  	if (unlikely(res != req->result)) { -		if (!(res == -EAGAIN && io_rw_should_reissue(req) && -		    io_resubmit_prep(req))) { -			req_set_fail(req); -			req->flags |= REQ_F_DONT_REISSUE; +		if (res == -EAGAIN && io_rw_should_reissue(req)) { +			req->flags |= REQ_F_REISSUE; +			return;  		}  	} @@ -2843,7 +2827,8 @@ static bool io_file_supports_nowait(struct io_kiocb *req, int rw)  	return __io_file_supports_nowait(req->file, rw);  } -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, +		      int rw)  {  	struct io_ring_ctx *ctx = req->ctx;  	struct kiocb *kiocb = &req->rw.kiocb; @@ -2865,8 +2850,13 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	if (unlikely(ret))  		return ret; -	/* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */ -	if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK)) +	/* +	 * If the file is marked O_NONBLOCK, still allow retry for it if it +	 * supports async. Otherwise it's impossible to use O_NONBLOCK files +	 * reliably. If not, or it IOCB_NOWAIT is set, don't retry. +	 */ +	if ((kiocb->ki_flags & IOCB_NOWAIT) || +	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))  		req->flags |= REQ_F_NOWAIT;  	ioprio = READ_ONCE(sqe->ioprio); @@ -2931,7 +2921,6 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,  {  	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);  	struct io_async_rw *io = req->async_data; -	bool check_reissue = kiocb->ki_complete == io_complete_rw;  	/* add previously done IO, if any */  	if (io && io->bytes_done > 0) { @@ -2943,19 +2932,27 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,  	if (req->flags & REQ_F_CUR_POS)  		req->file->f_pos = kiocb->ki_pos; -	if (ret >= 0 && check_reissue) +	if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))  		__io_complete_rw(req, ret, 0, issue_flags);  	else  		io_rw_done(kiocb, ret); -	if (check_reissue && (req->flags & REQ_F_REISSUE)) { +	if (req->flags & REQ_F_REISSUE) {  		req->flags &= ~REQ_F_REISSUE;  		if (io_resubmit_prep(req)) {  			io_req_task_queue_reissue(req);  		} else { +			unsigned int cflags = io_put_rw_kbuf(req); +			struct io_ring_ctx *ctx = req->ctx; +  			req_set_fail(req); -			__io_req_complete(req, issue_flags, ret, -					  io_put_rw_kbuf(req)); +			if (!(issue_flags & IO_URING_F_NONBLOCK)) { +				mutex_lock(&ctx->uring_lock); +				__io_req_complete(req, issue_flags, ret, cflags); +				mutex_unlock(&ctx->uring_lock); +			} else { +				__io_req_complete(req, issue_flags, ret, cflags); +			}  		}  	}  } @@ -3263,12 +3260,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)  				ret = nr;  			break;  		} +		if (!iov_iter_is_bvec(iter)) { +			iov_iter_advance(iter, nr); +		} else { +			req->rw.len -= nr; +			req->rw.addr += nr; +		}  		ret += nr;  		if (nr != iovec.iov_len)  			break; -		req->rw.len -= nr; -		req->rw.addr += nr; -		iov_iter_advance(iter, nr);  	}  	return ret; @@ 
-3315,12 +3315,17 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,  	if (!force && !io_op_defs[req->opcode].needs_async_setup)  		return 0;  	if (!req->async_data) { +		struct io_async_rw *iorw; +  		if (io_alloc_async_data(req)) {  			kfree(iovec);  			return -ENOMEM;  		}  		io_req_map_rw(req, iovec, fast_iov, iter); +		iorw = req->async_data; +		/* we've copied and mapped the iter, ensure state is saved */ +		iov_iter_save_state(&iorw->iter, &iorw->iter_state);  	}  	return 0;  } @@ -3339,6 +3344,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)  	iorw->free_iovec = iov;  	if (iov)  		req->flags |= REQ_F_NEED_CLEANUP; +	iov_iter_save_state(&iorw->iter, &iorw->iter_state);  	return 0;  } @@ -3346,7 +3352,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  	if (unlikely(!(req->file->f_mode & FMODE_READ)))  		return -EBADF; -	return io_prep_rw(req, sqe); +	return io_prep_rw(req, sqe, READ);  }  /* @@ -3442,19 +3448,28 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)  	struct kiocb *kiocb = &req->rw.kiocb;  	struct iov_iter __iter, *iter = &__iter;  	struct io_async_rw *rw = req->async_data; -	ssize_t io_size, ret, ret2;  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; +	struct iov_iter_state __state, *state; +	ssize_t ret, ret2;  	if (rw) {  		iter = &rw->iter; +		state = &rw->iter_state; +		/* +		 * We come here from an earlier attempt, restore our state to +		 * match in case it doesn't. It's cheap enough that we don't +		 * need to make this conditional. +		 */ +		iov_iter_restore(iter, state);  		iovec = NULL;  	} else {  		ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);  		if (ret < 0)  			return ret; +		state = &__state; +		iov_iter_save_state(iter, state);  	} -	io_size = iov_iter_count(iter); -	req->result = io_size; +	req->result = iov_iter_count(iter);  	/* Ensure we clear previously set non-block flag */  	if (!force_nonblock) @@ -3468,7 +3483,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)  		return ret ?: -EAGAIN;  	} -	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size); +	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);  	if (unlikely(ret)) {  		kfree(iovec);  		return ret; @@ -3484,30 +3499,49 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)  		/* no retry on NONBLOCK nor RWF_NOWAIT */  		if (req->flags & REQ_F_NOWAIT)  			goto done; -		/* some cases will consume bytes even on error returns */ -		iov_iter_reexpand(iter, iter->count + iter->truncated); -		iov_iter_revert(iter, io_size - iov_iter_count(iter));  		ret = 0;  	} else if (ret == -EIOCBQUEUED) {  		goto out_free; -	} else if (ret <= 0 || ret == io_size || !force_nonblock || +	} else if (ret <= 0 || ret == req->result || !force_nonblock ||  		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {  		/* read all, failed, already did sync or don't want to retry */  		goto done;  	} +	/* +	 * Don't depend on the iter state matching what was consumed, or being +	 * untouched in case of error. Restore it and we'll advance it +	 * manually if we need to. +	 */ +	iov_iter_restore(iter, state); +  	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);  	if (ret2)  		return ret2;  	iovec = NULL;  	rw = req->async_data; -	/* now use our persistent iterator, if we aren't already */ -	iter = &rw->iter; +	/* +	 * Now use our persistent iterator and state, if we aren't already. 
+	 * We've restored and mapped the iter to match. +	 */ +	if (iter != &rw->iter) { +		iter = &rw->iter; +		state = &rw->iter_state; +	}  	do { -		io_size -= ret; +		/* +		 * We end up here because of a partial read, either from +		 * above or inside this loop. Advance the iter by the bytes +		 * that were consumed. +		 */ +		iov_iter_advance(iter, ret); +		if (!iov_iter_count(iter)) +			break;  		rw->bytes_done += ret; +		iov_iter_save_state(iter, state); +  		/* if we can retry, do so with the callbacks armed */  		if (!io_rw_should_retry(req)) {  			kiocb->ki_flags &= ~IOCB_WAITQ; @@ -3525,7 +3559,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)  			return 0;  		/* we got some bytes, but not all. retry. */  		kiocb->ki_flags &= ~IOCB_WAITQ; -	} while (ret > 0 && ret < io_size); +		iov_iter_restore(iter, state); +	} while (ret > 0);  done:  	kiocb_done(kiocb, ret, issue_flags);  out_free: @@ -3539,7 +3574,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  {  	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))  		return -EBADF; -	return io_prep_rw(req, sqe); +	return io_prep_rw(req, sqe, WRITE);  }  static int io_write(struct io_kiocb *req, unsigned int issue_flags) @@ -3548,19 +3583,23 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)  	struct kiocb *kiocb = &req->rw.kiocb;  	struct iov_iter __iter, *iter = &__iter;  	struct io_async_rw *rw = req->async_data; -	ssize_t ret, ret2, io_size;  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; +	struct iov_iter_state __state, *state; +	ssize_t ret, ret2;  	if (rw) {  		iter = &rw->iter; +		state = &rw->iter_state; +		iov_iter_restore(iter, state);  		iovec = NULL;  	} else {  		ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);  		if (ret < 0)  			return ret; +		state = &__state; +		iov_iter_save_state(iter, state);  	} -	io_size = iov_iter_count(iter); -	req->result = io_size; +	req->result = iov_iter_count(iter);  	/* Ensure we clear previously set non-block flag */  	if (!force_nonblock) @@ -3577,7 +3616,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)  	    (req->flags & REQ_F_ISREG))  		goto copy_iov; -	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size); +	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);  	if (unlikely(ret))  		goto out_free; @@ -3624,9 +3663,7 @@ done:  		kiocb_done(kiocb, ret2, issue_flags);  	} else {  copy_iov: -		/* some cases will consume bytes even on error returns */ -		iov_iter_reexpand(iter, iter->count + iter->truncated); -		iov_iter_revert(iter, io_size - iov_iter_count(iter)); +		iov_iter_restore(iter, state);  		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);  		return ret ?: -EAGAIN;  	} @@ -4342,7 +4379,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)  	int i, bid = pbuf->bid;  	for (i = 0; i < pbuf->nbufs; i++) { -		buf = kmalloc(sizeof(*buf), GFP_KERNEL); +		buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);  		if (!buf)  			break; @@ -4549,12 +4586,16 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)  	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))  		return -EINVAL;  	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || -	    sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) +	    sqe->rw_flags || sqe->buf_index)  		return -EINVAL;  	if (req->flags & REQ_F_FIXED_FILE)  		return -EBADF;  	req->close.fd = READ_ONCE(sqe->fd); +	
req->close.file_slot = READ_ONCE(sqe->file_index); +	if (req->close.file_slot && req->close.fd) +		return -EINVAL; +  	return 0;  } @@ -4566,6 +4607,11 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)  	struct file *file = NULL;  	int ret = -EBADF; +	if (req->close.file_slot) { +		ret = io_close_fixed(req, issue_flags); +		goto err; +	} +  	spin_lock(&files->file_lock);  	fdt = files_fdtable(files);  	if (close->fd >= fdt->max_fds) { @@ -5293,7 +5339,7 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)  	if (req->poll.events & EPOLLONESHOT)  		flags = 0;  	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { -		req->poll.done = true; +		req->poll.events |= EPOLLONESHOT;  		flags = 0;  	}  	if (flags & IORING_CQE_F_MORE) @@ -5322,10 +5368,15 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)  	} else {  		bool done; +		if (req->poll.done) { +			spin_unlock(&ctx->completion_lock); +			return; +		}  		done = __io_poll_complete(req, req->result);  		if (done) {  			io_poll_remove_double(req);  			hash_del(&req->hash_node); +			req->poll.done = true;  		} else {  			req->result = 0;  			add_wait_queue(req->poll.head, &req->poll.wait); @@ -5463,6 +5514,7 @@ static void io_async_task_func(struct io_kiocb *req, bool *locked)  	hash_del(&req->hash_node);  	io_poll_remove_double(req); +	apoll->poll.done = true;  	spin_unlock(&ctx->completion_lock);  	if (!READ_ONCE(apoll->poll.canceled)) @@ -5783,6 +5835,7 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)  	struct io_ring_ctx *ctx = req->ctx;  	struct io_poll_table ipt;  	__poll_t mask; +	bool done;  	ipt.pt._qproc = io_poll_queue_proc; @@ -5791,13 +5844,13 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)  	if (mask) { /* no async, we'd stolen it */  		ipt.error = 0; -		io_poll_complete(req, mask); +		done = io_poll_complete(req, mask);  	}  	spin_unlock(&ctx->completion_lock);  	if (mask) {  		io_cqring_ev_posted(ctx); -		if (poll->events & EPOLLONESHOT) +		if (done)  			io_put_req(req);  	}  	return ipt.error; @@ -6288,19 +6341,16 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)  	struct io_uring_rsrc_update2 up;  	int ret; -	if (issue_flags & IO_URING_F_NONBLOCK) -		return -EAGAIN; -  	up.offset = req->rsrc_update.offset;  	up.data = req->rsrc_update.arg;  	up.nr = 0;  	up.tags = 0;  	up.resv = 0; -	mutex_lock(&ctx->uring_lock); +	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));  	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,  					&up, req->rsrc_update.nr_args); -	mutex_unlock(&ctx->uring_lock); +	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));  	if (ret < 0)  		req_set_fail(req); @@ -6930,7 +6980,7 @@ issue_sqe:  		switch (io_arm_poll_handler(req)) {  		case IO_APOLL_READY:  			if (linked_timeout) -				io_unprep_linked_timeout(req); +				io_queue_linked_timeout(linked_timeout);  			goto issue_sqe;  		case IO_APOLL_ABORTED:  			/* @@ -7515,6 +7565,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,  			break;  	} while (1); +	if (uts) { +		struct timespec64 ts; + +		if (get_timespec64(&ts, uts)) +			return -EFAULT; +		timeout = timespec64_to_jiffies(&ts); +	} +  	if (sig) {  #ifdef CONFIG_COMPAT  		if (in_compat_syscall()) @@ -7528,14 +7586,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,  			return ret;  	} -	if (uts) { -		struct timespec64 ts; - -		if (get_timespec64(&ts, uts)) -			return -EFAULT; -		timeout = 
timespec64_to_jiffies(&ts); -	} -  	init_waitqueue_func_entry(&iowq.wq, io_wake_function);  	iowq.wq.private = current;  	INIT_LIST_HEAD(&iowq.wq.entry); @@ -8284,11 +8334,27 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,  #endif  } +static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, +				 struct io_rsrc_node *node, void *rsrc) +{ +	struct io_rsrc_put *prsrc; + +	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); +	if (!prsrc) +		return -ENOMEM; + +	prsrc->tag = *io_get_tag_slot(data, idx); +	prsrc->rsrc = rsrc; +	list_add(&prsrc->list, &node->rsrc_list); +	return 0; +} +  static int io_install_fixed_file(struct io_kiocb *req, struct file *file,  				 unsigned int issue_flags, u32 slot_index)  {  	struct io_ring_ctx *ctx = req->ctx;  	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; +	bool needs_switch = false;  	struct io_fixed_file *file_slot;  	int ret = -EBADF; @@ -8304,9 +8370,22 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,  	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);  	file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); -	ret = -EBADF; -	if (file_slot->file_ptr) -		goto err; + +	if (file_slot->file_ptr) { +		struct file *old_file; + +		ret = io_rsrc_node_switch_start(ctx); +		if (ret) +			goto err; + +		old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); +		ret = io_queue_rsrc_removal(ctx->file_data, slot_index, +					    ctx->rsrc_node, old_file); +		if (ret) +			goto err; +		file_slot->file_ptr = 0; +		needs_switch = true; +	}  	*io_get_tag_slot(ctx->file_data, slot_index) = 0;  	io_fixed_file_set(file_slot, file); @@ -8318,25 +8397,50 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,  	ret = 0;  err: +	if (needs_switch) +		io_rsrc_node_switch(ctx, ctx->file_data);  	io_ring_submit_unlock(ctx, !force_nonblock);  	if (ret)  		fput(file);  	return ret;  } -static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, -				 struct io_rsrc_node *node, void *rsrc) +static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)  { -	struct io_rsrc_put *prsrc; +	unsigned int offset = req->close.file_slot - 1; +	struct io_ring_ctx *ctx = req->ctx; +	struct io_fixed_file *file_slot; +	struct file *file; +	int ret, i; -	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); -	if (!prsrc) -		return -ENOMEM; +	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); +	ret = -ENXIO; +	if (unlikely(!ctx->file_data)) +		goto out; +	ret = -EINVAL; +	if (offset >= ctx->nr_user_files) +		goto out; +	ret = io_rsrc_node_switch_start(ctx); +	if (ret) +		goto out; -	prsrc->tag = *io_get_tag_slot(data, idx); -	prsrc->rsrc = rsrc; -	list_add(&prsrc->list, &node->rsrc_list); -	return 0; +	i = array_index_nospec(offset, ctx->nr_user_files); +	file_slot = io_fixed_file_slot(&ctx->file_table, i); +	ret = -EBADF; +	if (!file_slot->file_ptr) +		goto out; + +	file = (struct file *)(file_slot->file_ptr & FFS_MASK); +	ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); +	if (ret) +		goto out; + +	file_slot->file_ptr = 0; +	io_rsrc_node_switch(ctx, ctx->file_data); +	ret = 0; +out: +	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); +	return ret;  }  static int __io_sqe_files_update(struct io_ring_ctx *ctx, @@ -9105,8 +9209,10 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)  	struct io_buffer *buf;  	unsigned long index; -	xa_for_each(&ctx->io_buffers, index, buf) +	
xa_for_each(&ctx->io_buffers, index, buf) {  		__io_remove_buffers(ctx, buf, index, -1U); +		cond_resched(); +	}  }  static void io_req_cache_free(struct list_head *list) @@ -9231,13 +9337,6 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)  	return mask;  } -static int io_uring_fasync(int fd, struct file *file, int on) -{ -	struct io_ring_ctx *ctx = file->private_data; - -	return fasync_helper(fd, file, on, &ctx->cq_fasync); -} -  static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)  {  	const struct cred *creds; @@ -9536,7 +9635,16 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)  		ret = io_uring_alloc_task_context(current, ctx);  		if (unlikely(ret))  			return ret; +  		tctx = current->io_uring; +		if (ctx->iowq_limits_set) { +			unsigned int limits[2] = { ctx->iowq_limits[0], +						   ctx->iowq_limits[1], }; + +			ret = io_wq_max_workers(tctx->io_wq, limits); +			if (ret) +				return ret; +		}  	}  	if (!xa_load(&tctx->xa, (unsigned long)ctx)) {  		node = kmalloc(sizeof(*node), GFP_KERNEL); @@ -9604,8 +9712,10 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)  	struct io_tctx_node *node;  	unsigned long index; -	xa_for_each(&tctx->xa, index, node) +	xa_for_each(&tctx->xa, index, node) {  		io_uring_del_tctx_node(index); +		cond_resched(); +	}  	if (wq) {  		/*  		 * Must be after io_uring_del_task_file() (removes nodes under @@ -10029,7 +10139,6 @@ static const struct file_operations io_uring_fops = {  	.mmap_capabilities = io_uring_nommu_mmap_capabilities,  #endif  	.poll		= io_uring_poll, -	.fasync		= io_uring_fasync,  #ifdef CONFIG_PROC_FS  	.show_fdinfo	= io_uring_show_fdinfo,  #endif @@ -10540,7 +10649,9 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)  static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,  					void __user *arg) +	__must_hold(&ctx->uring_lock)  { +	struct io_tctx_node *node;  	struct io_uring_task *tctx = NULL;  	struct io_sq_data *sqd = NULL;  	__u32 new_count[2]; @@ -10560,33 +10671,61 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,  			 * ordering. Fine to drop uring_lock here, we hold  			 * a ref to the ctx.  			 
*/ +			refcount_inc(&sqd->refs);  			mutex_unlock(&ctx->uring_lock);  			mutex_lock(&sqd->lock);  			mutex_lock(&ctx->uring_lock); -			tctx = sqd->thread->io_uring; +			if (sqd->thread) +				tctx = sqd->thread->io_uring;  		}  	} else {  		tctx = current->io_uring;  	} -	ret = -EINVAL; -	if (!tctx || !tctx->io_wq) -		goto err; +	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); -	ret = io_wq_max_workers(tctx->io_wq, new_count); -	if (ret) -		goto err; +	memcpy(ctx->iowq_limits, new_count, sizeof(new_count)); +	ctx->iowq_limits_set = true; -	if (sqd) +	ret = -EINVAL; +	if (tctx && tctx->io_wq) { +		ret = io_wq_max_workers(tctx->io_wq, new_count); +		if (ret) +			goto err; +	} else { +		memset(new_count, 0, sizeof(new_count)); +	} + +	if (sqd) {  		mutex_unlock(&sqd->lock); +		io_put_sq_data(sqd); +	}  	if (copy_to_user(arg, new_count, sizeof(new_count)))  		return -EFAULT; +	/* that's it for SQPOLL, only the SQPOLL task creates requests */ +	if (sqd) +		return 0; + +	/* now propagate the restriction to all registered users */ +	list_for_each_entry(node, &ctx->tctx_list, ctx_node) { +		struct io_uring_task *tctx = node->task->io_uring; + +		if (WARN_ON_ONCE(!tctx->io_wq)) +			continue; + +		for (i = 0; i < ARRAY_SIZE(new_count); i++) +			new_count[i] = ctx->iowq_limits[i]; +		/* ignore errors, it always returns zero anyway */ +		(void)io_wq_max_workers(tctx->io_wq, new_count); +	}  	return 0;  err: -	if (sqd) +	if (sqd) {  		mutex_unlock(&sqd->lock); +		io_put_sq_data(sqd); +	}  	return ret;  } |
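
Note on the read/write retry changes in this patch: instead of reconstructing the iterator position with iov_iter_reexpand()/iov_iter_revert() arithmetic, the request now snapshots the iov_iter up front (iov_iter_save_state()) and rolls back to that snapshot (iov_iter_restore()) before any reissue or async punt. The sketch below is illustrative only, assuming those two helpers from the same series; read_with_rollback() is a made-up wrapper, not the io_uring code itself.

	/* Illustrative sketch; read_with_rollback() is hypothetical. */
	#include <linux/fs.h>
	#include <linux/uio.h>

	static ssize_t read_with_rollback(struct kiocb *kiocb, struct iov_iter *iter)
	{
		struct iov_iter_state state;
		ssize_t ret;

		/* Snapshot the iterator before handing it to ->read_iter(). */
		iov_iter_save_state(iter, &state);

		ret = call_read_iter(kiocb->ki_filp, kiocb, iter);
		if (ret == -EAGAIN) {
			/*
			 * The callee may have consumed part of the iterator even
			 * on failure; restore the snapshot instead of guessing how
			 * much to revert, then the caller can safely retry or
			 * punt the request to async context.
			 */
			iov_iter_restore(iter, &state);
		}
		return ret;
	}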
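The new close.file_slot / io_close_fixed() path above lets IORING_OP_CLOSE release a registered (fixed) file slot instead of a normal fd; sqe->fd and sqe->file_index are mutually exclusive and the slot index is 1-based on the wire. A hedged userspace sketch follows, assuming a liburing recent enough to provide io_uring_prep_close_direct() (with an older liburing one would fill sqe->file_index by hand); error handling is trimmed.

	#include <liburing.h>

	/* Sketch: close registered file-table slot 'slot' (0-based for liburing). */
	static int close_fixed_slot(struct io_uring *ring, unsigned int slot)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
		struct io_uring_cqe *cqe;
		int ret;

		if (!sqe)
			return -EBUSY;
		/* Equivalent to IORING_OP_CLOSE with fd = 0 and file_index = slot + 1. */
		io_uring_prep_close_direct(sqe, slot);
		io_uring_submit(ring);

		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0)
			return ret;
		ret = cqe->res;		/* 0 on success, -EBADF if the slot was empty */
		io_uring_cqe_seen(ring, cqe);
		return ret;
	}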
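The IORING_REGISTER_IOWQ_MAX_WORKERS changes at the end store the limits in ctx->iowq_limits[] and propagate them to every task that has registered, or later registers, an io-wq for the ring, instead of capping only the calling task. A userspace sketch, assuming liburing's io_uring_register_iowq_max_workers() wrapper; a value of 0 leaves that limit unchanged, and the previous limits are written back into the array.

	#include <liburing.h>
	#include <stdio.h>

	int main(void)
	{
		struct io_uring ring;
		/* [0] caps bounded (regular file) workers, [1] caps unbounded workers. */
		unsigned int vals[2] = { 4, 16 };
		int ret;

		ret = io_uring_queue_init(8, &ring, 0);
		if (ret < 0)
			return 1;

		/* With this patch the cap also applies to tasks that attach to the ring later. */
		ret = io_uring_register_iowq_max_workers(&ring, vals);
		if (ret < 0)
			fprintf(stderr, "register_iowq_max_workers: %d\n", ret);
		else
			printf("previous limits: bounded=%u unbounded=%u\n", vals[0], vals[1]);

		io_uring_queue_exit(&ring);
		return 0;
	}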