Diffstat (limited to 'fs/io_uring.c')
-rw-r--r-- | fs/io_uring.c | 157
1 file changed, 93 insertions, 64 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index de650df9ac53..c06082bb039a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -183,19 +183,15 @@ struct fixed_file_table {
 	struct file		**files;
 };
 
-enum {
-	FFD_F_ATOMIC,
-};
-
 struct fixed_file_data {
 	struct fixed_file_table		*table;
 	struct io_ring_ctx		*ctx;
 
 	struct percpu_ref		refs;
 	struct llist_head		put_llist;
-	unsigned long			state;
 	struct work_struct		ref_work;
 	struct completion		done;
+	struct rcu_head			rcu;
 };
 
 struct io_ring_ctx {
@@ -1004,6 +1000,7 @@ static void io_kill_timeout(struct io_kiocb *req)
 	if (ret != -1) {
 		atomic_inc(&req->ctx->cq_timeouts);
 		list_del_init(&req->list);
+		req->flags |= REQ_F_COMP_LOCKED;
 		io_cqring_fill_event(req, 0);
 		io_put_req(req);
 	}
@@ -1483,10 +1480,10 @@ static void io_free_req(struct io_kiocb *req)
 __attribute__((nonnull))
 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 {
-	io_req_find_next(req, nxtptr);
-
-	if (refcount_dec_and_test(&req->refs))
+	if (refcount_dec_and_test(&req->refs)) {
+		io_req_find_next(req, nxtptr);
 		__io_free_req(req);
+	}
 }
 
 static void io_put_req(struct io_kiocb *req)
@@ -1821,6 +1818,10 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 		list_add(&req->list, &ctx->poll_list);
 	else
 		list_add_tail(&req->list, &ctx->poll_list);
+
+	if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+	    wq_has_sleeper(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
 }
 
 static void io_file_put(struct io_submit_state *state)
@@ -2071,7 +2072,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 		ssize_t ret;
 		ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
 		*iovec = NULL;
-		return ret;
+		return ret < 0 ? ret : sqe_len;
 	}
 
 	if (req->io) {
@@ -3002,6 +3003,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
 	if (!io || req->opcode == IORING_OP_SEND)
 		return 0;
 	/* iovec is already imported */
@@ -3154,6 +3160,11 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
 	if (!io || req->opcode == IORING_OP_RECV)
 		return 0;
 	/* iovec is already imported */
@@ -4705,11 +4716,21 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_kiocb *linked_timeout;
 	struct io_kiocb *nxt = NULL;
+	const struct cred *old_creds = NULL;
 	int ret;
 
 again:
 	linked_timeout = io_prep_linked_timeout(req);
 
+	if (req->work.creds && req->work.creds != current_cred()) {
+		if (old_creds)
+			revert_creds(old_creds);
+		if (old_creds == req->work.creds)
+			old_creds = NULL; /* restored original creds */
+		else
+			old_creds = override_creds(req->work.creds);
+	}
+
 	ret = io_issue_sqe(req, sqe, &nxt, true);
 
 	/*
@@ -4735,7 +4756,7 @@ punt:
 
 err:
 	/* drop submission reference */
-	io_put_req(req);
+	io_put_req_find_next(req, &nxt);
 
 	if (linked_timeout) {
 		if (!ret)
@@ -4759,6 +4780,8 @@ done_req:
 			goto punt;
 		goto again;
 	}
+	if (old_creds)
+		revert_creds(old_creds);
 }
 
 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -4803,7 +4826,6 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			  struct io_submit_state *state, struct io_kiocb **link)
 {
-	const struct cred *old_creds = NULL;
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned int sqe_flags;
 	int ret, id;
@@ -4818,14 +4840,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	id = READ_ONCE(sqe->personality);
 	if (id) {
-		const struct cred *personality_creds;
-
-		personality_creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!personality_creds)) {
+		req->work.creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->work.creds)) {
 			ret = -EINVAL;
 			goto err_req;
 		}
-		old_creds = override_creds(personality_creds);
+		get_cred(req->work.creds);
 	}
 
 	/* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -4837,8 +4857,6 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 err_req:
 		io_cqring_add_event(req, ret);
 		io_double_put_req(req);
-		if (old_creds)
-			revert_creds(old_creds);
 		return false;
 	}
 
@@ -4899,8 +4917,6 @@ err_req:
 		}
 	}
 
-	if (old_creds)
-		revert_creds(old_creds);
 	return true;
 }
 
@@ -5081,9 +5097,8 @@ static int io_sq_thread(void *data)
 	const struct cred *old_cred;
 	mm_segment_t old_fs;
 	DEFINE_WAIT(wait);
-	unsigned inflight;
 	unsigned long timeout;
-	int ret;
+	int ret = 0;
 
 	complete(&ctx->completions[1]);
 
@@ -5091,39 +5106,19 @@ static int io_sq_thread(void *data)
 	set_fs(USER_DS);
 	old_cred = override_creds(ctx->creds);
 
-	ret = timeout = inflight = 0;
+	timeout = jiffies + ctx->sq_thread_idle;
 	while (!kthread_should_park()) {
 		unsigned int to_submit;
 
-		if (inflight) {
+		if (!list_empty(&ctx->poll_list)) {
 			unsigned nr_events = 0;
 
-			if (ctx->flags & IORING_SETUP_IOPOLL) {
-				/*
-				 * inflight is the count of the maximum possible
-				 * entries we submitted, but it can be smaller
-				 * if we dropped some of them. If we don't have
-				 * poll entries available, then we know that we
-				 * have nothing left to poll for. Reset the
-				 * inflight count to zero in that case.
-				 */
-				mutex_lock(&ctx->uring_lock);
-				if (!list_empty(&ctx->poll_list))
-					io_iopoll_getevents(ctx, &nr_events, 0);
-				else
-					inflight = 0;
-				mutex_unlock(&ctx->uring_lock);
-			} else {
-				/*
-				 * Normal IO, just pretend everything completed.
-				 * We don't have to poll completions for that.
-				 */
-				nr_events = inflight;
-			}
-
-			inflight -= nr_events;
-			if (!inflight)
+			mutex_lock(&ctx->uring_lock);
+			if (!list_empty(&ctx->poll_list))
+				io_iopoll_getevents(ctx, &nr_events, 0);
+			else
 				timeout = jiffies + ctx->sq_thread_idle;
+			mutex_unlock(&ctx->uring_lock);
 		}
 
 		to_submit = io_sqring_entries(ctx);
@@ -5152,7 +5147,7 @@ static int io_sq_thread(void *data)
 			 * more IO, we should wait for the application to
 			 * reap events and wake us up.
 			 */
-			if (inflight ||
+			if (!list_empty(&ctx->poll_list) ||
 			    (!time_after(jiffies, timeout) && ret != -EBUSY &&
 			    !percpu_ref_is_dying(&ctx->refs))) {
 				cond_resched();
@@ -5162,6 +5157,19 @@ static int io_sq_thread(void *data)
 			prepare_to_wait(&ctx->sqo_wait, &wait,
 						TASK_INTERRUPTIBLE);
 
+			/*
+			 * While doing polled IO, before going to sleep, we need
+			 * to check if there are new reqs added to poll_list, it
+			 * is because reqs may have been punted to io worker and
+			 * will be added to poll_list later, hence check the
+			 * poll_list again.
+			 */
+			if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+			    !list_empty_careful(&ctx->poll_list)) {
+				finish_wait(&ctx->sqo_wait, &wait);
+				continue;
+			}
+
 			/* Tell userspace we may need a wakeup call */
 			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
 			/* make sure to read SQ tail after writing flags */
@@ -5189,8 +5197,7 @@ static int io_sq_thread(void *data)
 		mutex_lock(&ctx->uring_lock);
 		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
 		mutex_unlock(&ctx->uring_lock);
-		if (ret > 0)
-			inflight += ret;
+		timeout = jiffies + ctx->sq_thread_idle;
 	}
 
 	set_fs(old_fs);
@@ -5324,6 +5331,26 @@ static void io_file_ref_kill(struct percpu_ref *ref)
 	complete(&data->done);
 }
 
+static void __io_file_ref_exit_and_free(struct rcu_head *rcu)
+{
+	struct fixed_file_data *data = container_of(rcu, struct fixed_file_data,
+							rcu);
+	percpu_ref_exit(&data->refs);
+	kfree(data);
+}
+
+static void io_file_ref_exit_and_free(struct rcu_head *rcu)
+{
+	/*
+	 * We need to order our exit+free call against the potentially
+	 * existing call_rcu() for switching to atomic. One way to do that
+	 * is to have this rcu callback queue the final put and free, as we
+	 * could otherwise have a pre-existing atomic switch complete _after_
+	 * the free callback we queued.
+	 */
+	call_rcu(rcu, __io_file_ref_exit_and_free);
+}
+
 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
 	struct fixed_file_data *data = ctx->file_data;
@@ -5336,14 +5363,13 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	flush_work(&data->ref_work);
 	wait_for_completion(&data->done);
 	io_ring_file_ref_flush(data);
-	percpu_ref_exit(&data->refs);
 
 	__io_sqe_files_unregister(ctx);
 	nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
 	for (i = 0; i < nr_tables; i++)
 		kfree(data->table[i].files);
 	kfree(data->table);
-	kfree(data);
+	call_rcu(&data->rcu, io_file_ref_exit_and_free);
 	ctx->file_data = NULL;
 	ctx->nr_user_files = 0;
 	return 0;
@@ -5595,7 +5621,6 @@ static void io_ring_file_ref_switch(struct work_struct *work)
 
 	data = container_of(work, struct fixed_file_data, ref_work);
 	io_ring_file_ref_flush(data);
-	percpu_ref_get(&data->refs);
 	percpu_ref_switch_to_percpu(&data->refs);
 }
 
@@ -5771,8 +5796,13 @@ static void io_atomic_switch(struct percpu_ref *ref)
 {
 	struct fixed_file_data *data;
 
+	/*
+	 * Juggle reference to ensure we hit zero, if needed, so we can
+	 * switch back to percpu mode
+	 */
 	data = container_of(ref, struct fixed_file_data, refs);
-	clear_bit(FFD_F_ATOMIC, &data->state);
+	percpu_ref_put(&data->refs);
+	percpu_ref_get(&data->refs);
 }
 
 static bool io_queue_file_removal(struct fixed_file_data *data,
@@ -5795,11 +5825,7 @@ static bool io_queue_file_removal(struct fixed_file_data *data,
 	llist_add(&pfile->llist, &data->put_llist);
 
 	if (pfile == &pfile_stack) {
-		if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
-			percpu_ref_put(&data->refs);
-			percpu_ref_switch_to_atomic(&data->refs,
-							io_atomic_switch);
-		}
+		percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
 		wait_for_completion(&done);
 		flush_work(&data->ref_work);
 		return false;
@@ -5873,10 +5899,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		up->offset++;
 	}
 
-	if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
-		percpu_ref_put(&data->refs);
+	if (ref_switch)
 		percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
-	}
 
 	return done ? done : err;
 }
@@ -6334,6 +6358,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_sqe_buffer_unregister(ctx);
 	io_sqe_files_unregister(ctx);
 	io_eventfd_unregister(ctx);
+	idr_destroy(&ctx->personality_idr);
 
 #if defined(CONFIG_UNIX)
 	if (ctx->ring_sock) {
@@ -6647,6 +6672,7 @@ out_fput:
 	return submitted ? submitted : ret;
 }
 
+#ifdef CONFIG_PROC_FS
 static int io_uring_show_cred(int id, void *p, void *data)
 {
 	const struct cred *cred = p;
@@ -6720,6 +6746,7 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 		percpu_ref_put(&ctx->refs);
 	}
 }
+#endif
 
 static const struct file_operations io_uring_fops = {
 	.release	= io_uring_release,
@@ -6731,7 +6758,9 @@ static const struct file_operations io_uring_fops = {
 #endif
 	.poll		= io_uring_poll,
 	.fasync		= io_uring_fasync,
+#ifdef CONFIG_PROC_FS
 	.show_fdinfo	= io_uring_show_fdinfo,
+#endif
 };
 
 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
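The io_file_ref_exit_and_free() hunk above orders the final percpu_ref_exit() and kfree() behind any RCU callback that was already pending when unregister started, by queuing the real teardown from inside a first-stage RCU callback. A minimal sketch of that two-stage call_rcu() pattern; struct my_data and the my_data_* names are invented for illustration, while call_rcu(), container_of(), and kfree() are the real kernel interfaces:

#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical object freed via a two-stage RCU callback. */
struct my_data {
	struct rcu_head rcu;
	/* ... payload ... */
};

/* Second stage: the actual teardown and free. */
static void my_data_free(struct rcu_head *rcu)
{
	struct my_data *d = container_of(rcu, struct my_data, rcu);

	kfree(d);
}

/*
 * First stage: requeue the real free from RCU callback context so it is
 * ordered behind callbacks that were already pending at teardown time.
 */
static void my_data_free_deferred(struct rcu_head *rcu)
{
	call_rcu(rcu, my_data_free);
}

/* Teardown path: instead of kfree(d), do: */
/*	call_rcu(&d->rcu, my_data_free_deferred); */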
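The credential changes above move the personality lookup result into req->work.creds, pin it with get_cred() at submit time, and only override/revert around the actual issue in __io_queue_sqe(). A rough sketch of that override_creds()/revert_creds() discipline in a generic deferred-work context; struct my_work, my_work_fn(), and do_one_item() are hypothetical, while override_creds(), revert_creds(), get_cred(), put_cred(), and current_cred() are the real cred API:

#include <linux/cred.h>

/* Hypothetical deferred-work item carrying pinned credentials. */
struct my_work {
	const struct cred *creds;	/* pinned with get_cred() at queue time */
	/* ... payload ... */
};

static void do_one_item(struct my_work *work)
{
	/* ... the actual work, running with work->creds in effect ... */
}

/* Run one item with the submitter's credentials, then restore our own. */
static void my_work_fn(struct my_work *work)
{
	const struct cred *old_creds = NULL;

	if (work->creds && work->creds != current_cred())
		old_creds = override_creds(work->creds);

	do_one_item(work);

	if (old_creds)
		revert_creds(old_creds);
	if (work->creds)
		put_cred(work->creds);
}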