Diffstat (limited to 'fs/io_uring.c')
| Mode | Path | Lines changed |
| --- | --- | --- |
| -rw-r--r-- | fs/io_uring.c | 218 |
1 file changed, 121 insertions(+), 97 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5a826017ebb8..c06082bb039a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -183,19 +183,15 @@ struct fixed_file_table {
 	struct file		**files;
 };
 
-enum {
-	FFD_F_ATOMIC,
-};
-
 struct fixed_file_data {
 	struct fixed_file_table		*table;
 	struct io_ring_ctx		*ctx;
 
 	struct percpu_ref		refs;
 	struct llist_head		put_llist;
-	unsigned long			state;
 	struct work_struct		ref_work;
 	struct completion		done;
+	struct rcu_head			rcu;
 };
 
 struct io_ring_ctx {
@@ -1004,6 +1000,7 @@ static void io_kill_timeout(struct io_kiocb *req)
 	if (ret != -1) {
 		atomic_inc(&req->ctx->cq_timeouts);
 		list_del_init(&req->list);
+		req->flags |= REQ_F_COMP_LOCKED;
 		io_cqring_fill_event(req, 0);
 		io_put_req(req);
 	}
@@ -1260,6 +1257,9 @@ static void __io_req_aux_free(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
+	if (req->flags & REQ_F_NEED_CLEANUP)
+		io_cleanup_req(req);
+
 	kfree(req->io);
 	if (req->file) {
 		if (req->flags & REQ_F_FIXED_FILE)
@@ -1275,9 +1275,6 @@ static void __io_free_req(struct io_kiocb *req)
 {
 	__io_req_aux_free(req);
 
-	if (req->flags & REQ_F_NEED_CLEANUP)
-		io_cleanup_req(req);
-
 	if (req->flags & REQ_F_INFLIGHT) {
 		struct io_ring_ctx *ctx = req->ctx;
 		unsigned long flags;
@@ -1483,10 +1480,10 @@ static void io_free_req(struct io_kiocb *req)
 __attribute__((nonnull))
 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 {
-	io_req_find_next(req, nxtptr);
-
-	if (refcount_dec_and_test(&req->refs))
+	if (refcount_dec_and_test(&req->refs)) {
+		io_req_find_next(req, nxtptr);
 		__io_free_req(req);
+	}
 }
 
 static void io_put_req(struct io_kiocb *req)
@@ -1672,11 +1669,17 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 	mutex_unlock(&ctx->uring_lock);
 }
 
-static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
-			    long min)
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+			   long min)
 {
 	int iters = 0, ret = 0;
 
+	/*
+	 * We disallow the app entering submit/complete with polling, but we
+	 * still need to lock the ring to prevent racing with polled issue
+	 * that got punted to a workqueue.
+	 */
+	mutex_lock(&ctx->uring_lock);
 	do {
 		int tmin = 0;
 
@@ -1712,21 +1715,6 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 		ret = 0;
 	} while (min && !*nr_events && !need_resched());
 
-	return ret;
-}
-
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
-			   long min)
-{
-	int ret;
-
-	/*
-	 * We disallow the app entering submit/complete with polling, but we
-	 * still need to lock the ring to prevent racing with polled issue
-	 * that got punted to a workqueue.
-	 */
-	mutex_lock(&ctx->uring_lock);
-	ret = __io_iopoll_check(ctx, nr_events, min);
 	mutex_unlock(&ctx->uring_lock);
 	return ret;
 }
@@ -1830,6 +1818,10 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 		list_add(&req->list, &ctx->poll_list);
 	else
 		list_add_tail(&req->list, &ctx->poll_list);
+
+	if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+	    wq_has_sleeper(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
 }
 
 static void io_file_put(struct io_submit_state *state)
@@ -2080,7 +2072,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 		ssize_t ret;
 		ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
 		*iovec = NULL;
-		return ret;
+		return ret < 0 ? ret : sqe_len;
 	}
 
 	if (req->io) {
@@ -2517,6 +2509,9 @@ static void io_fallocate_finish(struct io_wq_work **workptr)
 	struct io_kiocb *nxt = NULL;
 	int ret;
 
+	if (io_req_cancelled(req))
+		return;
+
 	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
 				req->sync.len);
 	if (ret < 0)
@@ -2904,6 +2899,7 @@ static void io_close_finish(struct io_wq_work **workptr)
 	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
 	struct io_kiocb *nxt = NULL;
 
+	/* not cancellable, don't do io_req_cancelled() */
 	__io_close_finish(req, &nxt);
 	if (nxt)
 		io_wq_assign_next(workptr, nxt);
@@ -3007,6 +3003,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
 	if (!io || req->opcode == IORING_OP_SEND)
 		return 0;
 	/* iovec is already imported */
@@ -3071,7 +3072,7 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
 			if (req->io)
 				return -EAGAIN;
 			if (io_alloc_async_ctx(req)) {
-				if (kmsg && kmsg->iov != kmsg->fast_iov)
+				if (kmsg->iov != kmsg->fast_iov)
 					kfree(kmsg->iov);
 				return -ENOMEM;
 			}
@@ -3159,6 +3160,11 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
 	if (!io || req->opcode == IORING_OP_RECV)
 		return 0;
 	/* iovec is already imported */
@@ -3225,7 +3231,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
 			if (req->io)
 				return -EAGAIN;
 			if (io_alloc_async_ctx(req)) {
-				if (kmsg && kmsg->iov != kmsg->fast_iov)
+				if (kmsg->iov != kmsg->fast_iov)
 					kfree(kmsg->iov);
 				return -ENOMEM;
 			}
@@ -4710,11 +4716,21 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_kiocb *linked_timeout;
 	struct io_kiocb *nxt = NULL;
+	const struct cred *old_creds = NULL;
 	int ret;
 
 again:
 	linked_timeout = io_prep_linked_timeout(req);
 
+	if (req->work.creds && req->work.creds != current_cred()) {
+		if (old_creds)
+			revert_creds(old_creds);
+		if (old_creds == req->work.creds)
+			old_creds = NULL; /* restored original creds */
+		else
+			old_creds = override_creds(req->work.creds);
+	}
+
 	ret = io_issue_sqe(req, sqe, &nxt, true);
 
 	/*
@@ -4740,7 +4756,7 @@ punt:
 
 err:
 	/* drop submission reference */
-	io_put_req(req);
+	io_put_req_find_next(req, &nxt);
 
 	if (linked_timeout) {
 		if (!ret)
@@ -4764,6 +4780,8 @@ done_req:
 			goto punt;
 		goto again;
 	}
+	if (old_creds)
+		revert_creds(old_creds);
 }
 
 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -4808,7 +4826,6 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			  struct io_submit_state *state, struct io_kiocb **link)
 {
-	const struct cred *old_creds = NULL;
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned int sqe_flags;
 	int ret, id;
@@ -4823,14 +4840,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	id = READ_ONCE(sqe->personality);
 	if (id) {
-		const struct cred *personality_creds;
-
-		personality_creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!personality_creds)) {
+		req->work.creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->work.creds)) {
 			ret = -EINVAL;
 			goto err_req;
 		}
-		old_creds = override_creds(personality_creds);
+		get_cred(req->work.creds);
 	}
 
 	/* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -4842,8 +4857,6 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 err_req:
 		io_cqring_add_event(req, ret);
 		io_double_put_req(req);
-		if (old_creds)
-			revert_creds(old_creds);
 		return false;
 	}
 
@@ -4904,8 +4917,6 @@ err_req:
 		}
 	}
 
-	if (old_creds)
-		revert_creds(old_creds);
 	return true;
 }
 
@@ -5086,9 +5097,8 @@ static int io_sq_thread(void *data)
 	const struct cred *old_cred;
 	mm_segment_t old_fs;
 	DEFINE_WAIT(wait);
-	unsigned inflight;
 	unsigned long timeout;
-	int ret;
+	int ret = 0;
 
 	complete(&ctx->completions[1]);
 
@@ -5096,39 +5106,19 @@ static int io_sq_thread(void *data)
 	set_fs(USER_DS);
 	old_cred = override_creds(ctx->creds);
 
-	ret = timeout = inflight = 0;
+	timeout = jiffies + ctx->sq_thread_idle;
 	while (!kthread_should_park()) {
 		unsigned int to_submit;
 
-		if (inflight) {
+		if (!list_empty(&ctx->poll_list)) {
 			unsigned nr_events = 0;
 
-			if (ctx->flags & IORING_SETUP_IOPOLL) {
-				/*
-				 * inflight is the count of the maximum possible
-				 * entries we submitted, but it can be smaller
-				 * if we dropped some of them. If we don't have
-				 * poll entries available, then we know that we
-				 * have nothing left to poll for. Reset the
-				 * inflight count to zero in that case.
-				 */
-				mutex_lock(&ctx->uring_lock);
-				if (!list_empty(&ctx->poll_list))
-					__io_iopoll_check(ctx, &nr_events, 0);
-				else
-					inflight = 0;
-				mutex_unlock(&ctx->uring_lock);
-			} else {
-				/*
-				 * Normal IO, just pretend everything completed.
-				 * We don't have to poll completions for that.
-				 */
-				nr_events = inflight;
-			}
-
-			inflight -= nr_events;
-			if (!inflight)
+			mutex_lock(&ctx->uring_lock);
+			if (!list_empty(&ctx->poll_list))
+				io_iopoll_getevents(ctx, &nr_events, 0);
+			else
 				timeout = jiffies + ctx->sq_thread_idle;
+			mutex_unlock(&ctx->uring_lock);
 		}
 
 		to_submit = io_sqring_entries(ctx);
@@ -5139,34 +5129,47 @@ static int io_sq_thread(void *data)
 		 */
 		if (!to_submit || ret == -EBUSY) {
 			/*
+			 * Drop cur_mm before scheduling, we can't hold it for
+			 * long periods (or over schedule()). Do this before
+			 * adding ourselves to the waitqueue, as the unuse/drop
+			 * may sleep.
+			 */
+			if (cur_mm) {
+				unuse_mm(cur_mm);
+				mmput(cur_mm);
+				cur_mm = NULL;
+			}
+
+			/*
 			 * We're polling. If we're within the defined idle
 			 * period, then let us spin without work before going
 			 * to sleep. The exception is if we got EBUSY doing
 			 * more IO, we should wait for the application to
 			 * reap events and wake us up.
 			 */
-			if (inflight ||
+			if (!list_empty(&ctx->poll_list) ||
 			    (!time_after(jiffies, timeout) && ret != -EBUSY &&
 			    !percpu_ref_is_dying(&ctx->refs))) {
 				cond_resched();
 				continue;
 			}
 
+			prepare_to_wait(&ctx->sqo_wait, &wait,
+						TASK_INTERRUPTIBLE);
+
 			/*
-			 * Drop cur_mm before scheduling, we can't hold it for
-			 * long periods (or over schedule()). Do this before
-			 * adding ourselves to the waitqueue, as the unuse/drop
-			 * may sleep.
+			 * While doing polled IO, before going to sleep, we need
+			 * to check if there are new reqs added to poll_list, it
+			 * is because reqs may have been punted to io worker and
+			 * will be added to poll_list later, hence check the
+			 * poll_list again.
 			 */
-			if (cur_mm) {
-				unuse_mm(cur_mm);
-				mmput(cur_mm);
-				cur_mm = NULL;
+			if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+			    !list_empty_careful(&ctx->poll_list)) {
+				finish_wait(&ctx->sqo_wait, &wait);
+				continue;
 			}
 
-			prepare_to_wait(&ctx->sqo_wait, &wait,
-						TASK_INTERRUPTIBLE);
-
 			/* Tell userspace we may need a wakeup call */
 			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
 			/* make sure to read SQ tail after writing flags */
@@ -5194,8 +5197,7 @@ static int io_sq_thread(void *data)
 		mutex_lock(&ctx->uring_lock);
 		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
 		mutex_unlock(&ctx->uring_lock);
-		if (ret > 0)
-			inflight += ret;
+		timeout = jiffies + ctx->sq_thread_idle;
 	}
 
 	set_fs(old_fs);
@@ -5329,6 +5331,26 @@ static void io_file_ref_kill(struct percpu_ref *ref)
 	complete(&data->done);
 }
 
+static void __io_file_ref_exit_and_free(struct rcu_head *rcu)
+{
+	struct fixed_file_data *data = container_of(rcu, struct fixed_file_data,
+							rcu);
+	percpu_ref_exit(&data->refs);
+	kfree(data);
+}
+
+static void io_file_ref_exit_and_free(struct rcu_head *rcu)
+{
+	/*
+	 * We need to order our exit+free call against the potentially
+	 * existing call_rcu() for switching to atomic. One way to do that
+	 * is to have this rcu callback queue the final put and free, as we
+	 * could otherwise have a pre-existing atomic switch complete _after_
+	 * the free callback we queued.
+	 */
+	call_rcu(rcu, __io_file_ref_exit_and_free);
+}
+
 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
 	struct fixed_file_data *data = ctx->file_data;
@@ -5341,14 +5363,13 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	flush_work(&data->ref_work);
 	wait_for_completion(&data->done);
 	io_ring_file_ref_flush(data);
-	percpu_ref_exit(&data->refs);
 
 	__io_sqe_files_unregister(ctx);
 	nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
 	for (i = 0; i < nr_tables; i++)
 		kfree(data->table[i].files);
 	kfree(data->table);
-	kfree(data);
+	call_rcu(&data->rcu, io_file_ref_exit_and_free);
 	ctx->file_data = NULL;
 	ctx->nr_user_files = 0;
 	return 0;
@@ -5600,7 +5621,6 @@ static void io_ring_file_ref_switch(struct work_struct *work)
 
 	data = container_of(work, struct fixed_file_data, ref_work);
 	io_ring_file_ref_flush(data);
-	percpu_ref_get(&data->refs);
 	percpu_ref_switch_to_percpu(&data->refs);
 }
 
@@ -5776,8 +5796,13 @@ static void io_atomic_switch(struct percpu_ref *ref)
 {
 	struct fixed_file_data *data;
 
+	/*
+	 * Juggle reference to ensure we hit zero, if needed, so we can
+	 * switch back to percpu mode
+	 */
 	data = container_of(ref, struct fixed_file_data, refs);
-	clear_bit(FFD_F_ATOMIC, &data->state);
+	percpu_ref_put(&data->refs);
+	percpu_ref_get(&data->refs);
 }
 
 static bool io_queue_file_removal(struct fixed_file_data *data,
@@ -5800,11 +5825,7 @@ static bool io_queue_file_removal(struct fixed_file_data *data,
 	llist_add(&pfile->llist, &data->put_llist);
 
 	if (pfile == &pfile_stack) {
-		if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
-			percpu_ref_put(&data->refs);
-			percpu_ref_switch_to_atomic(&data->refs,
-							io_atomic_switch);
-		}
+		percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
 		wait_for_completion(&done);
 		flush_work(&data->ref_work);
 		return false;
@@ -5878,10 +5899,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		up->offset++;
 	}
 
-	if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
-		percpu_ref_put(&data->refs);
+	if (ref_switch)
 		percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
-	}
 
 	return done ? done : err;
 }
@@ -6339,6 +6358,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_sqe_buffer_unregister(ctx);
 	io_sqe_files_unregister(ctx);
 	io_eventfd_unregister(ctx);
+	idr_destroy(&ctx->personality_idr);
 
 #if defined(CONFIG_UNIX)
 	if (ctx->ring_sock) {
@@ -6652,6 +6672,7 @@ out_fput:
 	return submitted ? submitted : ret;
 }
 
+#ifdef CONFIG_PROC_FS
 static int io_uring_show_cred(int id, void *p, void *data)
 {
 	const struct cred *cred = p;
@@ -6725,6 +6746,7 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 		percpu_ref_put(&ctx->refs);
 	}
 }
+#endif
 
 static const struct file_operations io_uring_fops = {
 	.release	= io_uring_release,
@@ -6736,7 +6758,9 @@ static const struct file_operations io_uring_fops = {
 #endif
 	.poll		= io_uring_poll,
 	.fasync		= io_uring_fasync,
+#ifdef CONFIG_PROC_FS
 	.show_fdinfo	= io_uring_show_fdinfo,
+#endif
 };
 
 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
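
A note on the personality handling spread across the __io_queue_sqe() and io_submit_sqe() hunks above: submission now only looks up and pins the creds (idr_find() plus get_cred() into req->work.creds), and the actual override happens at issue time. The sketch below pulls that juggling out of the surrounding submission code so it can be read in one place. It is illustrative only and meant to be read in the context of fs/io_uring.c: issue_link_chain(), issue_one() and next_linked() are placeholder names for the elided io_issue_sqe()/link-advance steps; only the cred switching mirrors the patch (override_creds() returns the previously active creds, revert_creds() restores them).

/*
 * Illustrative sketch of the per-request credential switching added
 * above; not part of the patch. issue_one()/next_linked() are
 * placeholders for io_issue_sqe() and the nxt/link handling.
 */
static void issue_one(struct io_kiocb *req);
static struct io_kiocb *next_linked(struct io_kiocb *req);

static void issue_link_chain(struct io_kiocb *req)
{
	const struct cred *old_creds = NULL;

	do {
		/* switch to this request's personality creds, if any */
		if (req->work.creds && req->work.creds != current_cred()) {
			if (old_creds)
				revert_creds(old_creds);
			if (old_creds == req->work.creds)
				old_creds = NULL; /* restored original creds */
			else
				old_creds = override_creds(req->work.creds);
		}

		issue_one(req);		/* stands in for io_issue_sqe() */
		req = next_linked(req);	/* stands in for the nxt handling */
	} while (req);

	/* put the task back on the creds it entered with */
	if (old_creds)
		revert_creds(old_creds);
}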
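The fixed_file_data hunks combine two things that are easy to miss when read as a diff: io_atomic_switch() now does a percpu_ref_put()/percpu_ref_get() pair instead of the FFD_F_ATOMIC bit, and the final free is deferred through a call_rcu() that re-queues itself once. A minimal sketch of that second, double-call_rcu idiom follows; struct my_obj and the function names are hypothetical, and the ordering claim in the comment simply restates the rationale given in io_file_ref_exit_and_free() above.

#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {				/* hypothetical stand-in for fixed_file_data */
	struct percpu_ref	refs;
	struct rcu_head		rcu;
};

static void my_obj_release(struct rcu_head *rcu)
{
	struct my_obj *obj = container_of(rcu, struct my_obj, rcu);

	percpu_ref_exit(&obj->refs);
	kfree(obj);
}

static void my_obj_free_rcu(struct rcu_head *rcu)
{
	/*
	 * Re-queue rather than free directly, so a call_rcu() that the
	 * percpu_ref atomic switch may already have queued gets to run
	 * before the release callback does (same argument as above).
	 */
	call_rcu(rcu, my_obj_release);
}

/* teardown path would then use: call_rcu(&obj->rcu, my_obj_free_rcu); */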