From d3e9f732c415cf22faa33d6f195e291ad82dc92e Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 4 Aug 2021 08:37:25 -0600
Subject: io-wq: remove GFP_ATOMIC allocation off schedule out path

Daniel reports that the v5.14-rc4-rt4 kernel throws a BUG when running
stress-ng:

| [ 90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
| [ 90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
| [ 90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G W 5.14.0-rc4-rt4+ #89
| [ 90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
| [ 90.202561] Call Trace:
| [ 90.202577] dump_stack_lvl+0x34/0x44
| [ 90.202584] ___might_sleep.cold+0x87/0x94
| [ 90.202588] rt_spin_lock+0x19/0x70
| [ 90.202593] ___slab_alloc+0xcb/0x7d0
| [ 90.202598] ? newidle_balance.constprop.0+0xf5/0x3b0
| [ 90.202603] ? dequeue_entity+0xc3/0x290
| [ 90.202605] ? io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202610] ? pick_next_task_fair+0xb9/0x330
| [ 90.202612] ? __schedule+0x670/0x1410
| [ 90.202615] ? io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202618] kmem_cache_alloc_trace+0x79/0x1f0
| [ 90.202621] io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202625] io_wq_worker_sleeping+0x37/0x50
| [ 90.202628] schedule+0x30/0xd0
| [ 90.202630] schedule_timeout+0x8f/0x1a0
| [ 90.202634] ? __bpf_trace_tick_stop+0x10/0x10
| [ 90.202637] io_wqe_worker+0xfd/0x320
| [ 90.202641] ? finish_task_switch.isra.0+0xd3/0x290
| [ 90.202644] ? io_worker_handle_work+0x670/0x670
| [ 90.202646] ? io_worker_handle_work+0x670/0x670
| [ 90.202649] ret_from_fork+0x22/0x30

which is due to the RT kernel not liking a GFP_ATOMIC allocation inside
a raw spinlock. Besides that not working on RT, doing any kind of
allocation from inside schedule() is kind of nasty and should be
avoided if at all possible.

This particular path happens when an io-wq worker goes to sleep, and we
need a new worker to handle pending work. We currently allocate a small
data item to hold the information we need to create a new worker, but
we can instead include this data in the io_worker struct itself and
just protect it with a single bit lock. We only really need one per
worker anyway, as we will have run pending work between two sleep
cycles.
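[Editor's note] A minimal user-space sketch of the single-bit ownership
lock the patch switches to, using C11 atomics in place of the kernel's
test_and_set_bit_lock()/clear_bit_unlock(); struct and function names
here are illustrative, not the io-wq ones. The cheap load keeps the
fast path free of atomic read-modify-write ops, and the acquire/release
pair hands ownership of create_work/create_index to whoever queued the
task_work:

  #include <stdatomic.h>
  #include <stdbool.h>

  struct worker {
      atomic_ulong create_state;      /* bit 0 guards the fields below */
      int create_index;               /* only written while bit 0 is held */
  };

  /* roughly test_bit() followed by test_and_set_bit_lock(0, &create_state) */
  static bool claim_create(struct worker *w)
  {
      if (atomic_load_explicit(&w->create_state, memory_order_relaxed) & 1UL)
          return false;               /* already queued, cheap early out */
      return !(atomic_fetch_or_explicit(&w->create_state, 1UL,
                                        memory_order_acquire) & 1UL);
  }

  /* roughly clear_bit_unlock(0, &create_state) */
  static void release_create(struct worker *w)
  {
      atomic_fetch_and_explicit(&w->create_state, ~1UL, memory_order_release);
  }

Losing the claim race simply means a worker-create request is already
pending, which is all the schedule-out path needs, so no allocation is
required there at all.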
https://lore.kernel.org/lkml/20210804082418.fbibprcwtzyt5qax@beryllium.lan/ Reported-by: Daniel Wagner Tested-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Signed-off-by: Jens Axboe --- fs/io-wq.c | 72 ++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 7d2ed8c7dd31..4ce83bb48021 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -51,6 +51,10 @@ struct io_worker { struct completion ref_done; + unsigned long create_state; + struct callback_head create_work; + int create_index; + struct rcu_head rcu; }; @@ -272,24 +276,18 @@ static void io_wqe_inc_running(struct io_worker *worker) atomic_inc(&acct->nr_running); } -struct create_worker_data { - struct callback_head work; - struct io_wqe *wqe; - int index; -}; - static void create_worker_cb(struct callback_head *cb) { - struct create_worker_data *cwd; + struct io_worker *worker; struct io_wq *wq; struct io_wqe *wqe; struct io_wqe_acct *acct; bool do_create = false, first = false; - cwd = container_of(cb, struct create_worker_data, work); - wqe = cwd->wqe; + worker = container_of(cb, struct io_worker, create_work); + wqe = worker->wqe; wq = wqe->wq; - acct = &wqe->acct[cwd->index]; + acct = &wqe->acct[worker->create_index]; raw_spin_lock_irq(&wqe->lock); if (acct->nr_workers < acct->max_workers) { if (!acct->nr_workers) @@ -299,33 +297,42 @@ static void create_worker_cb(struct callback_head *cb) } raw_spin_unlock_irq(&wqe->lock); if (do_create) { - create_io_worker(wq, wqe, cwd->index, first); + create_io_worker(wq, wqe, worker->create_index, first); } else { atomic_dec(&acct->nr_running); io_worker_ref_put(wq); } - kfree(cwd); + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); } -static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct) +static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker, + struct io_wqe_acct *acct) { - struct create_worker_data *cwd; struct io_wq *wq = wqe->wq; /* raced with exit, just ignore create call */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) goto fail; + if (!io_worker_get(worker)) + goto fail; + /* + * create_state manages ownership of create_work/index. We should + * only need one entry per worker, as the worker going to sleep + * will trigger the condition, and waking will clear it once it + * runs the task_work. 
+ */ + if (test_bit(0, &worker->create_state) || + test_and_set_bit_lock(0, &worker->create_state)) + goto fail_release; - cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC); - if (cwd) { - init_task_work(&cwd->work, create_worker_cb); - cwd->wqe = wqe; - cwd->index = acct->index; - if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL)) - return; - - kfree(cwd); - } + init_task_work(&worker->create_work, create_worker_cb); + worker->create_index = acct->index; + if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) + return; + clear_bit_unlock(0, &worker->create_state); +fail_release: + io_worker_release(worker); fail: atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -343,7 +350,7 @@ static void io_wqe_dec_running(struct io_worker *worker) if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) { atomic_inc(&acct->nr_running); atomic_inc(&wqe->wq->worker_refs); - io_queue_worker_create(wqe, acct); + io_queue_worker_create(wqe, worker, acct); } } @@ -1004,12 +1011,12 @@ err_wq: static bool io_task_work_match(struct callback_head *cb, void *data) { - struct create_worker_data *cwd; + struct io_worker *worker; if (cb->func != create_worker_cb) return false; - cwd = container_of(cb, struct create_worker_data, work); - return cwd->wqe->wq == data; + worker = container_of(cb, struct io_worker, create_work); + return worker->wqe->wq == data; } void io_wq_exit_start(struct io_wq *wq) @@ -1026,12 +1033,13 @@ static void io_wq_exit_workers(struct io_wq *wq) return; while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { - struct create_worker_data *cwd; + struct io_worker *worker; - cwd = container_of(cb, struct create_worker_data, work); - atomic_dec(&cwd->wqe->acct[cwd->index].nr_running); + worker = container_of(cb, struct io_worker, create_work); + atomic_dec(&worker->wqe->acct[worker->create_index].nr_running); io_worker_ref_put(wq); - kfree(cwd); + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); } rcu_read_lock(); -- cgit From 5fd4617840596884334332f36cabfe0deabe85c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Aug 2021 14:04:31 -0600 Subject: io_uring: be smarter about waking multiple CQ ring waiters Currently we only wake the first waiter, even if we have enough entries posted to satisfy multiple waiters. Improve that situation so that every waiter knows how much the CQ tail has to advance before they can be safely woken up. With this change, if we have N waiters each asking for 1 event and we get 4 completions, then we wake up 4 waiters. If we have N waiters asking for 2 completions and we get 4 completions, then we wake up the first two. Previously, only the first waiter would've been woken up. Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index a2e20a6fbfed..626c2784723d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1436,11 +1436,13 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { - /* see waitqueue_active() comment */ - smp_mb(); - - if (waitqueue_active(&ctx->cq_wait)) - wake_up(&ctx->cq_wait); + /* + * wake_up_all() may seem excessive, but io_wake_function() and + * io_should_wake() handle the termination of the loop and only + * wake as many waiters as we need to. 
+ */ + if (wq_has_sleeper(&ctx->cq_wait)) + wake_up_all(&ctx->cq_wait); if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) @@ -1453,12 +1455,9 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - /* see waitqueue_active() comment */ - smp_mb(); - if (ctx->flags & IORING_SETUP_SQPOLL) { - if (waitqueue_active(&ctx->cq_wait)) - wake_up(&ctx->cq_wait); + if (wq_has_sleeper(&ctx->cq_wait)) + wake_up_all(&ctx->cq_wait); } if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); @@ -6976,21 +6975,21 @@ static int io_sq_thread(void *data) struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; - unsigned to_wait; + unsigned cq_tail; unsigned nr_timeouts; }; static inline bool io_should_wake(struct io_wait_queue *iowq) { struct io_ring_ctx *ctx = iowq->ctx; + int dist = ctx->cached_cq_tail - (int) iowq->cq_tail; /* * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ - return io_cqring_events(ctx) >= iowq->to_wait || - atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; + return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, @@ -7053,7 +7052,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, .entry = LIST_HEAD_INIT(iowq.wq.entry), }, .ctx = ctx, - .to_wait = min_events, }; struct io_rings *rings = ctx->rings; signed long timeout = MAX_SCHEDULE_TIMEOUT; @@ -7089,6 +7087,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, } iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; trace_io_uring_cqring_wait(ctx, min_events); do { /* if we can't even flush overflow, don't wait for more */ -- cgit From 042b0d85eabb79909ef29063fb45d363cbc0a85d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:01 +0100 Subject: io_uring: use kvmalloc for fixed files Instead of hand-coded two-level tables for registered files, allocate them with kvmalloc(). In many cases small enough tables are enough, and so can be kmalloc()'ed removing an extra memory load and a bunch of bit logic instructions from the hot path. If the table is larger, we trade off all the pros with a TLB-assisted memory lookup. 
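[Editor's note] A sketch of what the lookup change buys; names and sizes
are illustrative, not the io_uring ones. The old two-level layout costs
an extra dependent load plus shift/mask arithmetic per access, while the
flat kvmalloc'ed table is a single indexed load:

  #define TABLE_SHIFT 9                 /* 512 entries per second-level page */
  #define TABLE_MASK  ((1U << TABLE_SHIFT) - 1)

  struct fixed_file { unsigned long file_ptr; };

  /* old shape: dereference the first level, then shift/mask into the second */
  static struct fixed_file *slot_two_level(struct fixed_file **l1, unsigned i)
  {
      return &l1[i >> TABLE_SHIFT][i & TABLE_MASK];
  }

  /* new shape: one contiguous array, kmalloc or vmalloc backed */
  static struct fixed_file *slot_flat(struct fixed_file *table, unsigned i)
  {
      return &table[i];
  }

kvcalloc() keeps the small-registration case a plain kmalloc() and only
falls back to vmalloc-backed memory for large tables, which is the
trade-off the message describes.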
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/280421d3b48775dabab773006bb5588c7b2dabc0.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 626c2784723d..f8e6eb9de776 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -92,13 +92,8 @@ #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 -/* - * Shift of 9 is 512 entries, or exactly one page on 64-bit archs - */ -#define IORING_FILE_TABLE_SHIFT 9 -#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT) -#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1) -#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE) +/* 512 entries per page on 64-bit archs, 64 pages max */ +#define IORING_MAX_FIXED_FILES (1U << 15) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -235,8 +230,7 @@ struct io_rsrc_put { }; struct io_file_table { - /* two level table */ - struct io_fixed_file **files; + struct io_fixed_file *files; }; struct io_rsrc_node { @@ -6333,12 +6327,9 @@ static void io_wq_submit_work(struct io_wq_work *work) #define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, - unsigned i) + unsigned i) { - struct io_fixed_file *table_l2; - - table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT]; - return &table_l2[i & IORING_FILE_TABLE_MASK]; + return &table->files[i]; } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -7276,17 +7267,13 @@ fail: static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { - size_t size = nr_files * sizeof(struct io_fixed_file); - - table->files = (struct io_fixed_file **)io_alloc_page_table(size); + table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL); return !!table->files; } -static void io_free_file_tables(struct io_file_table *table, unsigned nr_files) +static void io_free_file_tables(struct io_file_table *table) { - size_t size = nr_files * sizeof(struct io_fixed_file); - - io_free_page_table((void **)table->files, size); + kvfree(table->files); table->files = NULL; } @@ -7311,7 +7298,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) fput(file); } #endif - io_free_file_tables(&ctx->file_table, ctx->nr_user_files); + io_free_file_tables(&ctx->file_table); io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; ctx->nr_user_files = 0; @@ -7779,7 +7766,7 @@ out_fput: if (file) fput(file); } - io_free_file_tables(&ctx->file_table, nr_args); + io_free_file_tables(&ctx->file_table); ctx->nr_user_files = 0; out_free: io_rsrc_data_free(ctx->file_data); -- cgit From ac177053bb2cb1f3c4c8bf89bce34c3f2c4823a7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:02 +0100 Subject: io_uring: inline fixed part of io_file_get() Optimise io_file_get() with registered files, which is in a hot path, by inlining parts of the function. Saves a function call, and inefficiencies of passing arguments, e.g. evaluating (sqe_flags & IOSQE_FIXED_FILE). It couldn't have been done before as compilers were refusing to inline it because of the function size. 
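[Editor's note] The shape of the change, as a hedged sketch with
stand-in names and a dummy table: keep the fixed-file branch small
enough that marking it inline actually sticks, push the rarely-taken
path out of line, and let the caller feed the raw flag test in directly:

  #include <stddef.h>

  struct file;

  static struct file *fixed_table[64];

  static struct file *get_file_normal(int fd)        /* out-of-line slow path */
  {
      (void)fd;
      return NULL;                                   /* stub for the sketch */
  }

  static inline struct file *get_file_fixed(unsigned fd)  /* small, inlinable */
  {
      return fd < 64 ? fixed_table[fd] : NULL;
  }

  static inline struct file *get_file(int fd, int fixed)
  {
      return fixed ? get_file_fixed((unsigned)fd) : get_file_normal(fd);
  }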
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/52115cd6ce28f33bd0923149c0e6cb611084a0b1.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 65 +++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index f8e6eb9de776..69cf1665307f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1059,7 +1059,8 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); -static struct file *io_file_get(struct io_submit_state *state, +static struct file *io_file_get(struct io_ring_ctx *ctx, + struct io_submit_state *state, struct io_kiocb *req, int fd, bool fixed); static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); @@ -3621,7 +3622,8 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; - sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), + sp->file_in = io_file_get(req->ctx, NULL, req, + READ_ONCE(sqe->splice_fd_in), (sp->flags & SPLICE_F_FD_IN_FIXED)); if (!sp->file_in) return -EBADF; @@ -6353,36 +6355,48 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file file_slot->file_ptr = file_ptr; } -static struct file *io_file_get(struct io_submit_state *state, - struct io_kiocb *req, int fd, bool fixed) +static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, + struct io_kiocb *req, int fd) { - struct io_ring_ctx *ctx = req->ctx; struct file *file; + unsigned long file_ptr; - if (fixed) { - unsigned long file_ptr; + if (unlikely((unsigned int)fd >= ctx->nr_user_files)) + return NULL; + fd = array_index_nospec(fd, ctx->nr_user_files); + file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; + file = (struct file *) (file_ptr & FFS_MASK); + file_ptr &= ~FFS_MASK; + /* mask in overlapping REQ_F and FFS bits */ + req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT); + io_req_set_rsrc_node(req); + return file; +} - if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - return NULL; - fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); - file_ptr &= ~FFS_MASK; - /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT); - io_req_set_rsrc_node(req); - } else { - trace_io_uring_file_get(ctx, fd); - file = __io_file_get(state, fd); +static struct file *io_file_get_normal(struct io_ring_ctx *ctx, + struct io_submit_state *state, + struct io_kiocb *req, int fd) +{ + struct file *file = __io_file_get(state, fd); - /* we don't allow fixed io_uring files */ - if (file && unlikely(file->f_op == &io_uring_fops)) - io_req_track_inflight(req); - } + trace_io_uring_file_get(ctx, fd); + /* we don't allow fixed io_uring files */ + if (file && unlikely(file->f_op == &io_uring_fops)) + io_req_track_inflight(req); return file; } +static inline struct file *io_file_get(struct io_ring_ctx *ctx, + struct io_submit_state *state, + struct io_kiocb *req, int fd, bool fixed) +{ + if (fixed) + return io_file_get_fixed(ctx, req, fd); + else + return io_file_get_normal(ctx, state, req, fd); +} + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -6589,9 +6603,8 @@ static int 
io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } if (io_op_defs[req->opcode].needs_file) { - bool fixed = req->flags & REQ_F_FIXED_FILE; - - req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); + req->file = io_file_get(ctx, state, req, READ_ONCE(sqe->fd), + (sqe_flags & IOSQE_FIXED_FILE)); if (unlikely(!req->file)) ret = -EBADF; } -- cgit From b191e2dfe5955b392bc8c0ae546dfa5a13649c38 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:03 +0100 Subject: io_uring: rename io_file_supports_async() io_file_supports_async() checks whether a file supports nowait operations, so "async" in the name is misleading. Rename it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/33d55b5ce43aa1884c637c1957f1e30d30dc3bec.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 69cf1665307f..7027c7e1f562 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -719,8 +719,8 @@ enum { REQ_F_DONT_REISSUE_BIT, REQ_F_CREDS_BIT, /* keep async read/write and isreg together and in order */ - REQ_F_ASYNC_READ_BIT, - REQ_F_ASYNC_WRITE_BIT, + REQ_F_NOWAIT_READ_BIT, + REQ_F_NOWAIT_WRITE_BIT, REQ_F_ISREG_BIT, /* not a real bit, just to check we're not overflowing the space */ @@ -766,9 +766,9 @@ enum { /* don't attempt request reissue, see io_rw_reissue() */ REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT), /* supports async reads */ - REQ_F_ASYNC_READ = BIT(REQ_F_ASYNC_READ_BIT), + REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT), /* supports async writes */ - REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT), + REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* has creds assigned */ @@ -2631,7 +2631,7 @@ static bool io_bdev_nowait(struct block_device *bdev) * any file. For now, just ensure that anything potentially problematic is done * inline. 
*/ -static bool __io_file_supports_async(struct file *file, int rw) +static bool __io_file_supports_nowait(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; @@ -2664,14 +2664,14 @@ static bool __io_file_supports_async(struct file *file, int rw) return file->f_op->write_iter != NULL; } -static bool io_file_supports_async(struct io_kiocb *req, int rw) +static bool io_file_supports_nowait(struct io_kiocb *req, int rw) { - if (rw == READ && (req->flags & REQ_F_ASYNC_READ)) + if (rw == READ && (req->flags & REQ_F_NOWAIT_READ)) return true; - else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE)) + else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE)) return true; - return __io_file_supports_async(req->file, rw); + return __io_file_supports_nowait(req->file, rw); } static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -3295,7 +3295,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req, READ)) { + if (force_nonblock && !io_file_supports_nowait(req, READ)) { ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); return ret ?: -EAGAIN; } @@ -3400,7 +3400,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req, WRITE)) + if (force_nonblock && !io_file_supports_nowait(req, WRITE)) goto copy_iov; /* file path doesn't support NOWAIT for non-direct_IO */ @@ -5208,7 +5208,7 @@ static int io_arm_poll_handler(struct io_kiocb *req) } /* if we can't nonblock try, then no point in arming a poll handler */ - if (!io_file_supports_async(req, rw)) + if (!io_file_supports_nowait(req, rw)) return IO_APOLL_ABORTED; apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); @@ -6346,9 +6346,9 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file { unsigned long file_ptr = (unsigned long) file; - if (__io_file_supports_async(file, READ)) + if (__io_file_supports_nowait(file, READ)) file_ptr |= FFS_ASYNC_READ; - if (__io_file_supports_async(file, WRITE)) + if (__io_file_supports_nowait(file, WRITE)) file_ptr |= FFS_ASYNC_WRITE; if (S_ISREG(file_inode(file)->i_mode)) file_ptr |= FFS_ISREG; @@ -6368,7 +6368,7 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, file = (struct file *) (file_ptr & FFS_MASK); file_ptr &= ~FFS_MASK; /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT); + req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT); io_req_set_rsrc_node(req); return file; } -- cgit From c97d8a0f68b30960e9c8089bc37cc3b96a96f84d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:04 +0100 Subject: io_uring: avoid touching inode in rw prep If we use fixed files, we can be sure (almost) that REQ_F_ISREG is set. However, for non-reg files io_prep_rw() still will look into inode to double check, and that's expensive and can be avoided. The only caveat is that it only currently works with 64+ bit architectures, see FFS_ISREG, so we should consider that. 
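[Editor's note] A user-space sketch of the trick this relies on; the
flag names mirror the FFS_* idea but everything here is illustrative.
Because struct file pointers are at least 8-byte aligned on 64-bit, a
few per-file properties (async read/write support, "is a regular file")
can be cached in the low pointer bits at registration time and
recovered later without touching the inode on every request:

  #include <assert.h>
  #include <stdint.h>

  #define F_ASYNC_READ  0x1UL
  #define F_ASYNC_WRITE 0x2UL
  #define F_ISREG       0x4UL    /* needs 8-byte alignment, hence 64-bit only */
  #define F_BITS        (F_ASYNC_READ | F_ASYNC_WRITE | F_ISREG)

  struct file { long placeholder; };   /* stand-in for the aligned object */

  static uintptr_t pack_file(struct file *f, unsigned long flags)
  {
      assert(((uintptr_t)f & F_BITS) == 0);
      return (uintptr_t)f | flags;     /* done once, when the file is registered */
  }

  static struct file *unpack_file(uintptr_t v, unsigned long *flags)
  {
      *flags = v & F_BITS;             /* no inode access needed per request */
      return (struct file *)(v & ~F_BITS);
  }

That alignment assumption is exactly the 64-bit caveat the message
mentions, which is why io_req_ffs_set() is gated on CONFIG_64BIT.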
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0a62780c491ca2522cd52db4ae3f16e03aafed0f.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7027c7e1f562..69e5a2102f94 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1231,6 +1231,20 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } +#define FFS_ASYNC_READ 0x1UL +#define FFS_ASYNC_WRITE 0x2UL +#ifdef CONFIG_64BIT +#define FFS_ISREG 0x4UL +#else +#define FFS_ISREG 0x0UL +#endif +#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) + +static inline bool io_req_ffs_set(struct io_kiocb *req) +{ + return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE); +} + static void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { @@ -2682,7 +2696,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned ioprio; int ret; - if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode)) + if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode)) req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); @@ -6319,15 +6333,6 @@ static void io_wq_submit_work(struct io_wq_work *work) } } -#define FFS_ASYNC_READ 0x1UL -#define FFS_ASYNC_WRITE 0x2UL -#ifdef CONFIG_64BIT -#define FFS_ISREG 0x4UL -#else -#define FFS_ISREG 0x0UL -#endif -#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) - static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, unsigned i) { -- cgit From ebc11b6c6b87da5c83b4d934893a893f49160bc3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:05 +0100 Subject: io_uring: clean io-wq callbacks Move io-wq callbacks closer to each other, so it's easier to work with them, and rename io_free_work() into io_wq_free_work() for consistency. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/851bbc7f0f86f206d8c1333efee8bcb9c26e419f.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 69e5a2102f94..1a7c91e9891e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6298,6 +6298,14 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + req = io_put_req_find_next(req); + return req ? &req->work : NULL; +} + static void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -7930,14 +7938,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return done ? done : err; } -static struct io_wq_work *io_free_work(struct io_wq_work *work) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - req = io_put_req_find_next(req); - return req ? 
&req->work : NULL; -} - static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, struct task_struct *task) { @@ -7961,7 +7961,7 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, data.hash = hash; data.task = task; - data.free_work = io_free_work; + data.free_work = io_wq_free_work; data.do_work = io_wq_submit_work; /* Do QD, or 4 * CPUS, whatever is smallest */ -- cgit From 2215bed9246dbb95df50fcef788b0765c7c2aac0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:06 +0100 Subject: io_uring: remove unnecessary PF_EXITING check We prefer nornal task_works even if it would fail requests inside. Kill a PF_EXITING check in io_req_task_work_add(), task_work_add() handles well dying tasks, i.e. return error when can't enqueue due to late stages of do_exit(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/fc14297e8441cd8f5d1743a2488cf0df09bf48ac.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1a7c91e9891e..620ab4c90dff 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2006,8 +2006,6 @@ static void io_req_task_work_add(struct io_kiocb *req) if (test_bit(0, &tctx->task_state) || test_and_set_bit(0, &tctx->task_state)) return; - if (unlikely(tsk->flags & PF_EXITING)) - goto fail; /* * SQPOLL kernel thread doesn't need notification, just a wakeup. For @@ -2020,7 +2018,7 @@ static void io_req_task_work_add(struct io_kiocb *req) wake_up_process(tsk); return; } -fail: + clear_bit(0, &tctx->task_state); spin_lock_irqsave(&tctx->task_lock, flags); node = tctx->task_list.first; -- cgit From 8724dd8c833832c398c3578340374f45d6d9dd0d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:07 +0100 Subject: io-wq: improve wq_list_add_tail() Prepare nodes that we're going to add before actually linking them, it's always safer and costs us nothing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f7e53f0c84c02ed6748c488ed0789b98f8cc6185.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io-wq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io-wq.h b/fs/io-wq.h index 3999ee58ff26..308af3928424 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -44,6 +44,7 @@ static inline void wq_list_add_after(struct io_wq_work_node *node, static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { + node->next = NULL; if (!list->first) { list->last = node; WRITE_ONCE(list->first, node); @@ -51,7 +52,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, list->last->next = node; list->last = node; } - node->next = NULL; } static inline void wq_list_cut(struct io_wq_work_list *list, -- cgit From 864ea921b0300fe5a4db9136b7e307e94b369530 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:08 +0100 Subject: io_uring: refactor io_alloc_req Replace the main if of io_flush_cached_reqs() with inverted condition + goto, so all the cases are handled in the same way. And also extract io_preinit_req() to make it cleaner and easier to refer to. 
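[Editor's note] A compact sketch of the two ideas, with made-up names:
invert the "have cached requests?" check so every path funnels through
the same got_req label, and hoist the fields that stay valid across
recycling into a preinit helper that runs when objects enter the cache:

  #include <stdlib.h>

  struct req { void *ctx; int result; };

  struct req_cache {
      void *ctx;
      int free;
      struct req *reqs[32];
  };

  static void preinit_req(struct req *r, void *ctx)
  {
      r->ctx = ctx;           /* stays valid while the object sits in the cache */
      r->result = 0;
  }

  static struct req *alloc_req(struct req_cache *c)
  {
      if (c->free)
          goto got_req;                  /* common case, handled like the rest */
      for (int i = 0; i < 4; i++) {      /* stand-in for the bulk refill */
          struct req *r = malloc(sizeof(*r));
          if (!r)
              break;
          preinit_req(r, c->ctx);
          c->reqs[c->free++] = r;
      }
      if (!c->free)
          return NULL;
  got_req:
      return c->reqs[--c->free];
  }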
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1abcba1f7b55dc53bf1dbe95036e345ffb1d5b01.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 66 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 620ab4c90dff..8892747bd173 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1702,6 +1702,19 @@ static void io_req_complete_failed(struct io_kiocb *req, long res) io_req_complete_post(req, res, 0); } +/* + * Don't initialise the fields below on every allocation, but do that in + * advance and keep them valid across allocations. + */ +static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + req->ctx = ctx; + req->link = NULL; + req->async_data = NULL; + /* not necessary, but safer to zero */ + req->result = 0; +} + static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, struct io_comp_state *cs) { @@ -1744,45 +1757,31 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + int ret, i; BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); - if (!state->free_reqs) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; - int ret, i; - - if (io_flush_cached_reqs(ctx)) - goto got_req; - - ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, - state->reqs); - - /* - * Bulk alloc is all-or-nothing. If we fail to get a batch, - * retry single alloc to be on the safe side. - */ - if (unlikely(ret <= 0)) { - state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); - if (!state->reqs[0]) - return NULL; - ret = 1; - } + if (likely(state->free_reqs || io_flush_cached_reqs(ctx))) + goto got_req; - /* - * Don't initialise the fields below on every allocation, but - * do that in advance and keep valid on free. - */ - for (i = 0; i < ret; i++) { - struct io_kiocb *req = state->reqs[i]; + ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, + state->reqs); - req->ctx = ctx; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - req->result = 0; - } - state->free_reqs = ret; + /* + * Bulk alloc is all-or-nothing. If we fail to get a batch, + * retry single alloc to be on the safe side. + */ + if (unlikely(ret <= 0)) { + state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); + if (!state->reqs[0]) + return NULL; + ret = 1; } + + for (i = 0; i < ret; i++) + io_preinit_req(state->reqs[i], ctx); + state->free_reqs = ret; got_req: state->free_reqs--; return state->reqs[state->free_reqs]; @@ -6569,6 +6568,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags; int personality, ret = 0; + /* req is partially pre-initialised, see io_preinit_req() */ req->opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); -- cgit From a2416e1ec23c6b79010d03d69c0e4e035339b4ad Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:09 +0100 Subject: io_uring: don't halt iopoll too early IOPOLL users should care more about getting completions for requests they submitted, but not in "device did/completed something". 
Currently, io_do_iopoll() may return a positive number, which will instruct io_iopoll_check() to break the loop and end the syscall, even if there is not enough CQEs or none at all. Don't return positive numbers, so io_iopoll_check() exits only when it gets an actual error, need reschedule or got enough CQEs. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/641a88f751623b6758303b3171f0a4141f06726e.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8892747bd173..6a3079a038fb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2288,7 +2288,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, struct io_kiocb *req, *tmp; LIST_HEAD(done); bool spin; - int ret; /* * Only spin for completions if we don't have multiple devices hanging @@ -2296,9 +2295,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, */ spin = !ctx->poll_multi_queue && *nr_events < min; - ret = 0; list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { struct kiocb *kiocb = &req->rw.kiocb; + int ret; /* * Move completed and retryable entries to our local lists. @@ -2313,22 +2312,20 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, break; ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); - if (ret < 0) - break; + if (unlikely(ret < 0)) + return ret; + else if (ret) + spin = false; /* iopoll may have completed current req */ if (READ_ONCE(req->iopoll_completed)) list_move_tail(&req->inflight_entry, &done); - - if (ret && spin) - spin = false; - ret = 0; } if (!list_empty(&done)) io_iopoll_complete(ctx, nr_events, &done, resubmit); - return ret; + return 0; } /* -- cgit From 282cdc86937bd31cf0ea49978ad7a42cfe12ea35 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:10 +0100 Subject: io_uring: add more locking annotations for submit Add more annotations for submission path functions holding ->uring_lock. 
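[Editor's note] For context, a minimal sketch of what these annotations
are; the definition under __CHECKER__ matches the kernel's sparse
context attribute, while the struct below is purely illustrative. They
document the locking contract in the function signature and let a
sparse run (make C=1) flag callers that reach the function without the
lock held:

  #ifdef __CHECKER__
  #define __must_hold(x)  __attribute__((context(x, 1, 1)))
  #else
  #define __must_hold(x)
  #endif

  struct ctx { int uring_lock; };      /* stand-in for the real mutex */

  static void submit_flush_completions(struct ctx *ctx)
      __must_hold(&ctx->uring_lock)
  {
      /* body may assume uring_lock is held by the caller */
      (void)ctx;
  }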
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/128ec4185e26fbd661dd3a424aa66108ee8ff951.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6a3079a038fb..ff17c4e9aa42 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2133,6 +2133,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, } static void io_submit_flush_completions(struct io_ring_ctx *ctx) + __must_hold(&req->ctx->uring_lock) { struct io_comp_state *cs = &ctx->submit_state.comp; int i, nr = cs->nr; @@ -6473,6 +6474,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) } static void __io_queue_sqe(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) { struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); int ret; @@ -6516,6 +6518,7 @@ issue_sqe: } static inline void io_queue_sqe(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) { if (unlikely(req->ctx->drain_active) && io_drain_req(req)) return; @@ -6560,6 +6563,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) + __must_hold(&ctx->uring_lock) { struct io_submit_state *state; unsigned int sqe_flags; @@ -6623,6 +6627,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) + __must_hold(&ctx->uring_lock) { struct io_submit_link *link = &ctx->submit_state.link; int ret; @@ -6755,6 +6760,7 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) + __must_hold(&ctx->uring_lock) { struct io_uring_task *tctx; int submitted = 0; -- cgit From 90291099f24a82863e00de136d95ad7e73560107 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 09:07:32 -0600 Subject: io_uring: optimise io_cqring_wait() hot path Turns out we always init struct io_wait_queue in io_cqring_wait(), even if it's not used after, i.e. there are already enough of CQEs. And often it's exactly what happens, for instance, requests may have been completed inline, or in case of io_uring_enter(submit=N, wait=1). It shows up in my profiler, so optimise it by delaying the struct init. 
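[Editor's note] The change is easiest to see as a shape, sketched here
with placeholder types and helpers: the early-return check runs against
an uninitialised struct on the stack, and the wait-queue entry is only
filled in once we know we are actually going to sleep:

  #include <stdbool.h>

  struct waiter { void *ctx; unsigned cq_tail; unsigned nr_timeouts; };

  static bool enough_events(void *ctx, int min_events)
  {
      (void)ctx;
      return min_events <= 0;          /* stand-in for the real CQE check */
  }

  static int wait_for_events(void *ctx, int min_events)
  {
      struct waiter iowq;              /* deliberately not initialised yet */

      if (enough_events(ctx, min_events))
          return 0;                    /* inline completions: no init, no sleep */

      iowq.ctx = ctx;                  /* pay for the setup only on the slow path */
      iowq.cq_tail = (unsigned)min_events;
      iowq.nr_timeouts = 0;
      /* ... prepare_to_wait()/schedule() loop would go here ... */
      (void)iowq;
      return 0;
  }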
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6f1b81c60b947d165583dc333947869c3d85d037.1628471125.git.asml.silence@gmail.com [axboe: fixed up for new cqring wait] Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index ff17c4e9aa42..14aaeb87b149 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7063,14 +7063,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz, struct __kernel_timespec __user *uts) { - struct io_wait_queue iowq = { - .wq = { - .private = current, - .func = io_wake_function, - .entry = LIST_HEAD_INIT(iowq.wq.entry), - }, - .ctx = ctx, - }; + struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; signed long timeout = MAX_SCHEDULE_TIMEOUT; int ret; @@ -7104,8 +7097,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, timeout = timespec64_to_jiffies(&ts); } + init_waitqueue_func_entry(&iowq.wq, io_wake_function); + iowq.wq.private = current; + INIT_LIST_HEAD(&iowq.wq.entry); + iowq.ctx = ctx; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + trace_io_uring_cqring_wait(ctx, min_events); do { /* if we can't even flush overflow, don't wait for more */ -- cgit From e73c5c7cd3e21bb95032a9ed3593c000f17f9ab8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:12 +0100 Subject: io_uring: extract a helper for ctx quiesce Refactor __io_uring_register() by extracting a helper responsible for ctx queisce. Looks better and will make it easier to add more optimisations. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0339e0027504176be09237eefa7945bf9a6f153d.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 14aaeb87b149..03c32d392735 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10074,6 +10074,33 @@ static bool io_register_op_must_quiesce(int op) } } +static int io_ctx_quiesce(struct io_ring_ctx *ctx) +{ + long ret; + + percpu_ref_kill(&ctx->refs); + + /* + * Drop uring mutex before waiting for references to exit. If another + * thread is currently inside io_uring_enter() it might need to grab the + * uring_lock to make progress. If we hold it here across the drain + * wait, then we can deadlock. It's safe to drop the mutex here, since + * no new references will come in after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); + do { + ret = wait_for_completion_interruptible(&ctx->ref_comp); + if (!ret) + break; + ret = io_run_task_work_sig(); + } while (ret >= 0); + mutex_lock(&ctx->uring_lock); + + if (ret) + io_refs_resurrect(&ctx->refs, &ctx->ref_comp); + return ret; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -10098,31 +10125,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, } if (io_register_op_must_quiesce(opcode)) { - percpu_ref_kill(&ctx->refs); - - /* - * Drop uring mutex before waiting for references to exit. If - * another thread is currently inside io_uring_enter() it might - * need to grab the uring_lock to make progress. If we hold it - * here across the drain wait, then we can deadlock. 
It's safe - * to drop the mutex here, since no new references will come in - * after we've killed the percpu ref. - */ - mutex_unlock(&ctx->uring_lock); - do { - ret = wait_for_completion_interruptible(&ctx->ref_comp); - if (!ret) - break; - ret = io_run_task_work_sig(); - if (ret < 0) - break; - } while (1); - mutex_lock(&ctx->uring_lock); - - if (ret) { - io_refs_resurrect(&ctx->refs, &ctx->ref_comp); + ret = io_ctx_quiesce(ctx); + if (ret) return ret; - } } switch (opcode) { -- cgit From 6a290a1442b45afb55d6a87619b716e5031d7c3e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:13 +0100 Subject: io_uring: move io_put_task() definition Move the function in the source file as it is to get rid of forward declarations. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/33d917d69e4206557c75a5b98fe22bcdf77ce47d.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 03c32d392735..a536b2509d6d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1052,7 +1052,6 @@ static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req, int nr); static void io_dismantle_req(struct io_kiocb *req); -static void io_put_task(struct task_struct *task, int nr); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -1571,6 +1570,17 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + struct io_uring_task *tctx = task->io_uring; + + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); +} + static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, long res, unsigned int cflags) { @@ -1809,17 +1819,6 @@ static void io_dismantle_req(struct io_kiocb *req) } } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) -{ - struct io_uring_task *tctx = task->io_uring; - - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); -} - static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; -- cgit From b9bd2bea0f22f502019266dce368a9cd477ac721 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 09:09:47 -0600 Subject: io_uring: move io_rsrc_node_alloc() definition Move the function together with io_rsrc_node_ref_zero() in the source file as it is to get rid of forward declarations. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4d81f6f833e7d017860b24463a9a68b14a8a5ed2.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 89 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 44 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index a536b2509d6d..ee98aebb6580 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1045,7 +1045,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, long res, unsigned int cflags); @@ -7160,6 +7159,50 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) kfree(ref_node); } +static void io_rsrc_node_ref_zero(struct percpu_ref *ref) +{ + struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); + struct io_ring_ctx *ctx = node->rsrc_data->ctx; + unsigned long flags; + bool first_add = false; + + spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); + node->done = true; + + while (!list_empty(&ctx->rsrc_ref_list)) { + node = list_first_entry(&ctx->rsrc_ref_list, + struct io_rsrc_node, node); + /* recycle ref nodes in order */ + if (!node->done) + break; + list_del(&node->node); + first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); + } + spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); + + if (first_add) + mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); +} + +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) +{ + struct io_rsrc_node *ref_node; + + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; + + if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, + 0, GFP_KERNEL)) { + kfree(ref_node); + return NULL; + } + INIT_LIST_HEAD(&ref_node->node); + INIT_LIST_HEAD(&ref_node->rsrc_list); + ref_node->done = false; + return ref_node; +} + static void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill) { @@ -7674,50 +7717,6 @@ static void io_rsrc_put_work(struct work_struct *work) } } -static void io_rsrc_node_ref_zero(struct percpu_ref *ref) -{ - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); - struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; - bool first_add = false; - - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); - node->done = true; - - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, node); - /* recycle ref nodes in order */ - if (!node->done) - break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); - - if (first_add) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); -} - -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) -{ - struct io_rsrc_node *ref_node; - - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; - - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; - } - INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->rsrc_list); - ref_node->done = false; - return ref_node; -} - static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { 
-- cgit From 543af3a13da308f2cea954644b43c2c9f864c350 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:15 +0100 Subject: io_uring: inline io_free_req_deferred Inline io_free_req_deferred(), there is no reason to keep it separated. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ce04b7180d4eac0d69dd00677b227eefe80c2cc5.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index ee98aebb6580..576100b23147 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2182,16 +2182,12 @@ static inline void io_put_req(struct io_kiocb *req) io_free_req(req); } -static void io_free_req_deferred(struct io_kiocb *req) -{ - req->io_task_work.func = io_free_req; - io_req_task_work_add(req); -} - static inline void io_put_req_deferred(struct io_kiocb *req, int refs) { - if (req_ref_sub_and_test(req, refs)) - io_free_req_deferred(req); + if (req_ref_sub_and_test(req, refs)) { + req->io_task_work.func = io_free_req; + io_req_task_work_add(req); + } } static unsigned io_cqring_events(struct io_ring_ctx *ctx) -- cgit From d3fddf6dddd84432161eb070ed8e34d14c8bf56a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:16 +0100 Subject: io_uring: deduplicate open iopoll check Move IORING_SETUP_IOPOLL check into __io_openat_prep(), so both openat and openat2 reuse it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9a73ce83e4ee60d011180ef177eecef8e87ff2a2.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 576100b23147..26a3ab9b6aed 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3789,6 +3789,8 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe const char __user *fname; int ret; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; if (unlikely(sqe->ioprio || sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) @@ -3813,12 +3815,9 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - u64 flags, mode; + u64 mode = READ_ONCE(sqe->len); + u64 flags = READ_ONCE(sqe->open_flags); - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - mode = READ_ONCE(sqe->len); - flags = READ_ONCE(sqe->open_flags); req->open.how = build_open_how(flags, mode); return __io_openat_prep(req, sqe); } @@ -3829,8 +3828,6 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); if (len < OPEN_HOW_SIZE_VER0) -- cgit From 58d3be2c60d2cf4e6bb65bb6200fa39a7bc477f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:17 +0100 Subject: io_uring: improve ctx hang handling If io_ring_exit_work() can't get it done in 5 minutes, something is going very wrong, don't keep spinning at HZ / 20 rate, it doesn't help and it may take much of CPU time if there is a lot of workers stuck as such. 
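[Editor's note] A user-space sketch of the new exit-work pacing; the
timings match the description, the helpers are dummies. Keep the 50ms
poll while there is hope, and once the five-minute warning deadline has
passed, drop to checking once a minute instead of spinning at 20 Hz:

  #include <stdbool.h>
  #include <time.h>

  static bool refs_gone(void)              /* stand-in for the ref completion */
  {
      static int polls;
      return ++polls > 3;
  }

  static void ring_exit_work(void)
  {
      time_t deadline = time(NULL) + 5 * 60;
      struct timespec interval = { .tv_sec = 0, .tv_nsec = 50L * 1000 * 1000 };

      do {
          /* cancellation nudges for workers/SQPOLL would go here */
          if (time(NULL) > deadline) {
              /* there is little hope left, don't run it too often */
              interval.tv_sec = 60;
              interval.tv_nsec = 0;
          }
          nanosleep(&interval, NULL);
      } while (!refs_gone());
  }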
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9e2d1ca81d569f6bc628af1a42ff6663bff7ce9c.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 26a3ab9b6aed..af9472e3f17b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8795,6 +8795,7 @@ static void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); unsigned long timeout = jiffies + HZ * 60 * 5; + unsigned long interval = HZ / 20; struct io_tctx_exit exit; struct io_tctx_node *node; int ret; @@ -8819,8 +8820,11 @@ static void io_ring_exit_work(struct work_struct *work) io_sq_thread_unpark(sqd); } - WARN_ON_ONCE(time_after(jiffies, timeout)); - } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); + if (WARN_ON_ONCE(time_after(jiffies, timeout))) { + /* there is little hope left, don't run it too often */ + interval = HZ * 60; + } + } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); init_completion(&exit.completion); init_task_work(&exit.task_work, io_tctx_exit_cb); -- cgit From bbbca0948989aa1a8a75b99bcdece677ad06dfe6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:18 +0100 Subject: io_uring: kill unused IO_IOPOLL_BATCH IO_IOPOLL_BATCH is not used, delete it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b2bdf19dbee2c9fc8865bbab9412135a14e24a64.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index af9472e3f17b..8cdcd645f3d8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -295,7 +295,6 @@ struct io_sq_data { struct completion exited; }; -#define IO_IOPOLL_BATCH 8 #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -- cgit From af066f31eb3dac2a11516315d47a286a7b3b07df Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:19 +0100 Subject: io_uring: drop exec checks from io_req_task_submit In case of on-exec io_uring cancellations, tasks already wait for all submitted requests to get completed/cancelled, so we don't need to check for ->in_execve separately. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/be8707049f10df9d20ca03dc4ca3316239b5e8e0.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8cdcd645f3d8..cf75de0d74d2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2046,7 +2046,7 @@ static void io_req_task_submit(struct io_kiocb *req) /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ mutex_lock(&ctx->uring_lock); - if (!(req->task->flags & PF_EXITING) && !req->task->in_execve) + if (likely(!(req->task->flags & PF_EXITING))) __io_queue_sqe(req); else io_req_complete_failed(req, -EFAULT); -- cgit From e9dbe221f5d1c974c853da94eee456803239cab5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:20 +0100 Subject: io_uring: optimise putting task struct We cache all the reference to task + tctx, so if io_put_task() is called by the corresponding task itself, we can save on atomics and return the refs right back into the cache. 
It's beneficial for all inline completions, and also iopolling, when polling and submissions are done by the same task, including SQPOLL|IOPOLL. Note: io_uring_cancel_generic() can return refs to the cache as well, so those should be flushed in the loop for tctx_inflight() to work right. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6fe9646b3cb70e46aca1f58426776e368c8926b3.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index cf75de0d74d2..2eeaf4a15367 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2102,10 +2102,12 @@ static inline void io_init_req_batch(struct req_batch *rb) static void io_req_free_batch_finish(struct io_ring_ctx *ctx, struct req_batch *rb) { - if (rb->task) - io_put_task(rb->task, rb->task_refs); if (rb->ctx_refs) percpu_ref_put_many(&ctx->refs, rb->ctx_refs); + if (rb->task == current) + current->io_uring->cached_refs += rb->task_refs; + else if (rb->task) + io_put_task(rb->task, rb->task_refs); } static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, @@ -9138,9 +9140,11 @@ static void io_uring_drop_tctx_refs(struct task_struct *task) struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; - tctx->cached_refs = 0; - percpu_counter_sub(&tctx->inflight, refs); - put_task_struct_many(task, refs); + if (refs) { + tctx->cached_refs = 0; + percpu_counter_sub(&tctx->inflight, refs); + put_task_struct_many(task, refs); + } } /* @@ -9161,9 +9165,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) if (tctx->io_wq) io_wq_exit_start(tctx->io_wq); - io_uring_drop_tctx_refs(current); atomic_inc(&tctx->in_idle); do { + io_uring_drop_tctx_refs(current); /* read completions before cancelations */ inflight = tctx_inflight(tctx, !cancel_all); if (!inflight) @@ -9187,6 +9191,7 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) } prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); + io_uring_drop_tctx_refs(current); /* * If we've seen completions, retry without waiting. This * avoids a race where a completion comes in before we did -- cgit From f56165e62fae78200292857628e4f1d8d12a0ed0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:07 +0100 Subject: io_uring: move io_fallback_req_func() Move io_fallback_req_func() to kill yet another forward declaration. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d0a8f9d9a0057ed761d6237167d51c9378798d2d.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2eeaf4a15367..86162ebd6bae 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1067,8 +1067,6 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx); static bool io_poll_remove_waitqs(struct io_kiocb *req); static int io_req_prep_async(struct io_kiocb *req); -static void io_fallback_req_func(struct work_struct *unused); - static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -1145,6 +1143,19 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req) return !req->timeout.off; } +static void io_fallback_req_func(struct work_struct *work) +{ + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + fallback_work.work); + struct llist_node *node = llist_del_all(&ctx->fallback_llist); + struct io_kiocb *req, *tmp; + + percpu_ref_get(&ctx->refs); + llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) + req->io_task_work.func(req); + percpu_ref_put(&ctx->refs); +} + static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; @@ -2468,19 +2479,6 @@ static bool io_rw_should_reissue(struct io_kiocb *req) } #endif -static void io_fallback_req_func(struct work_struct *work) -{ - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, - fallback_work.work); - struct llist_node *node = llist_del_all(&ctx->fallback_llist); - struct io_kiocb *req, *tmp; - - percpu_ref_get(&ctx->refs); - llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) - req->io_task_work.func(req); - percpu_ref_put(&ctx->refs); -} - static void __io_complete_rw(struct io_kiocb *req, long res, long res2, unsigned int issue_flags) { -- cgit From c34b025f2d2149d4351b994a923fa687a32478f8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:08 +0100 Subject: io_uring: cache __io_free_req()'d requests Don't kfree requests in __io_free_req() but put them back into the internal request cache. That makes allocations more sustainable and will be used for refcounting optimisations. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9f4950fbe7771c8d41799366d0a3a08ac3040236.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 86162ebd6bae..cc2d3de16423 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1831,11 +1831,16 @@ static void io_dismantle_req(struct io_kiocb *req) static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; io_dismantle_req(req); io_put_task(req->task, 1); - kmem_cache_free(req_cachep, req); + spin_lock_irqsave(&ctx->completion_lock, flags); + list_add(&req->compl.list, &ctx->locked_free_list); + ctx->locked_free_nr++; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + percpu_ref_put(&ctx->refs); } -- cgit From 7255834ed6ef9658b9e7fb192da6a323a64eac98 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:09 +0100 Subject: io_uring: remove redundant args from cache_free We don't use @tsk argument of io_req_cache_free(), remove it. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6a28b4a58ee0aaf0db98e2179b9c9f06f9b0cca1.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index cc2d3de16423..c6f07a3d8e0d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8622,13 +8622,11 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) __io_remove_buffers(ctx, buf, index, -1U); } -static void io_req_cache_free(struct list_head *list, struct task_struct *tsk) +static void io_req_cache_free(struct list_head *list) { struct io_kiocb *req, *nxt; list_for_each_entry_safe(req, nxt, list, compl.list) { - if (tsk && req->task != tsk) - continue; list_del(&req->compl.list); kmem_cache_free(req_cachep, req); } @@ -8648,7 +8646,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) } io_flush_cached_locked_reqs(ctx, cs); - io_req_cache_free(&cs->free_list, NULL); + io_req_cache_free(&cs->free_list); mutex_unlock(&ctx->uring_lock); } -- cgit From bb943b8265c84e9553903161bc39ff45f427d00d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:10 +0100 Subject: io_uring: use inflight_entry instead of compl.list req->compl.list is used to cache freed requests, and so can't overlap in time with req->inflight_entry. So, use inflight_entry to link requests and remove compl.list. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e430e79d22d70a190d718831bda7bfed1daf8976.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index c6f07a3d8e0d..f05ec6e07eec 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -670,7 +670,6 @@ struct io_unlink { struct io_completion { struct file *file; - struct list_head list; u32 cflags; }; @@ -1670,7 +1669,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, } io_dismantle_req(req); io_put_task(req->task, 1); - list_add(&req->compl.list, &ctx->locked_free_list); + list_add(&req->inflight_entry, &ctx->locked_free_list); ctx->locked_free_nr++; } else { if (!percpu_ref_tryget(&ctx->refs)) @@ -1761,9 +1760,9 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) nr = state->free_reqs; while (!list_empty(&cs->free_list)) { struct io_kiocb *req = list_first_entry(&cs->free_list, - struct io_kiocb, compl.list); + struct io_kiocb, inflight_entry); - list_del(&req->compl.list); + list_del(&req->inflight_entry); state->reqs[nr++] = req; if (nr == ARRAY_SIZE(state->reqs)) break; @@ -1837,7 +1836,7 @@ static void __io_free_req(struct io_kiocb *req) io_put_task(req->task, 1); spin_lock_irqsave(&ctx->completion_lock, flags); - list_add(&req->compl.list, &ctx->locked_free_list); + list_add(&req->inflight_entry, &ctx->locked_free_list); ctx->locked_free_nr++; spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -2144,7 +2143,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, if (state->free_reqs != ARRAY_SIZE(state->reqs)) state->reqs[state->free_reqs++] = req; else - list_add(&req->compl.list, &state->comp.free_list); + list_add(&req->inflight_entry, &state->comp.free_list); } static void io_submit_flush_completions(struct io_ring_ctx *ctx) @@ -8626,8 +8625,8 @@ static void io_req_cache_free(struct list_head *list) { struct io_kiocb *req, *nxt; - list_for_each_entry_safe(req, nxt, list, compl.list) { - 
list_del(&req->compl.list); + list_for_each_entry_safe(req, nxt, list, inflight_entry) { + list_del(&req->inflight_entry); kmem_cache_free(req_cachep, req); } } -- cgit From cd0ca2e048dc0ddea4f59354b0b8ce4548a76a91 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:11 +0100 Subject: io_uring: inline struct io_comp_state Inline struct io_comp_state into struct io_submit_state. They are already coupled tightly, together with mixed responsibilities it only brings confusion having them separately. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e55bba77426b399e3a2e54e3c6c267c6a0fc4b57.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 61 ++++++++++++++++++++++++++--------------------------------- 1 file changed, 27 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index f05ec6e07eec..a927e2173d98 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -299,13 +299,6 @@ struct io_sq_data { #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -struct io_comp_state { - struct io_kiocb *reqs[IO_COMPL_BATCH]; - unsigned int nr; - /* inline/task_work completion list, under ->uring_lock */ - struct list_head free_list; -}; - struct io_submit_link { struct io_kiocb *head; struct io_kiocb *last; @@ -326,7 +319,10 @@ struct io_submit_state { /* * Batch completion logic */ - struct io_comp_state comp; + struct io_kiocb *compl_reqs[IO_COMPL_BATCH]; + unsigned int compl_nr; + /* inline/task_work completion list, under ->uring_lock */ + struct list_head free_list; /* * File reference cache @@ -1208,7 +1204,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_llist_head(&ctx->rsrc_put_llist); INIT_LIST_HEAD(&ctx->tctx_list); - INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); + INIT_LIST_HEAD(&ctx->submit_state.free_list); INIT_LIST_HEAD(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); return ctx; @@ -1734,10 +1730,10 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) } static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, - struct io_comp_state *cs) + struct io_submit_state *state) { spin_lock_irq(&ctx->completion_lock); - list_splice_init(&ctx->locked_free_list, &cs->free_list); + list_splice_init(&ctx->locked_free_list, &state->free_list); ctx->locked_free_nr = 0; spin_unlock_irq(&ctx->completion_lock); } @@ -1746,7 +1742,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - struct io_comp_state *cs = &state->comp; int nr; /* @@ -1755,11 +1750,11 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) * side cache. 
*/ if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) - io_flush_cached_locked_reqs(ctx, cs); + io_flush_cached_locked_reqs(ctx, state); nr = state->free_reqs; - while (!list_empty(&cs->free_list)) { - struct io_kiocb *req = list_first_entry(&cs->free_list, + while (!list_empty(&state->free_list)) { + struct io_kiocb *req = list_first_entry(&state->free_list, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); @@ -1946,7 +1941,7 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx) { if (!ctx) return; - if (ctx->submit_state.comp.nr) { + if (ctx->submit_state.compl_nr) { mutex_lock(&ctx->uring_lock); io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); @@ -2143,19 +2138,19 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, if (state->free_reqs != ARRAY_SIZE(state->reqs)) state->reqs[state->free_reqs++] = req; else - list_add(&req->inflight_entry, &state->comp.free_list); + list_add(&req->inflight_entry, &state->free_list); } static void io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&req->ctx->uring_lock) { - struct io_comp_state *cs = &ctx->submit_state.comp; - int i, nr = cs->nr; + struct io_submit_state *state = &ctx->submit_state; + int i, nr = state->compl_nr; struct req_batch rb; spin_lock_irq(&ctx->completion_lock); for (i = 0; i < nr; i++) { - struct io_kiocb *req = cs->reqs[i]; + struct io_kiocb *req = state->compl_reqs[i]; __io_cqring_fill_event(ctx, req->user_data, req->result, req->compl.cflags); @@ -2166,7 +2161,7 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) io_init_req_batch(&rb); for (i = 0; i < nr; i++) { - struct io_kiocb *req = cs->reqs[i]; + struct io_kiocb *req = state->compl_reqs[i]; /* submission and completion refs */ if (req_ref_sub_and_test(req, 2)) @@ -2174,7 +2169,7 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) } io_req_free_batch_finish(ctx, &rb); - cs->nr = 0; + state->compl_nr = 0; } /* @@ -6484,10 +6479,10 @@ issue_sqe: /* drop submission reference */ if (req->flags & REQ_F_COMPLETE_INLINE) { struct io_ring_ctx *ctx = req->ctx; - struct io_comp_state *cs = &ctx->submit_state.comp; + struct io_submit_state *state = &ctx->submit_state; - cs->reqs[cs->nr++] = req; - if (cs->nr == ARRAY_SIZE(cs->reqs)) + state->compl_reqs[state->compl_nr++] = req; + if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) io_submit_flush_completions(ctx); } else { io_put_req(req); @@ -6690,7 +6685,7 @@ static void io_submit_state_end(struct io_submit_state *state, { if (state->link.head) io_queue_sqe(state->link.head); - if (state->comp.nr) + if (state->compl_nr) io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); @@ -8633,19 +8628,17 @@ static void io_req_cache_free(struct list_head *list) static void io_req_caches_free(struct io_ring_ctx *ctx) { - struct io_submit_state *submit_state = &ctx->submit_state; - struct io_comp_state *cs = &ctx->submit_state.comp; + struct io_submit_state *state = &ctx->submit_state; mutex_lock(&ctx->uring_lock); - if (submit_state->free_reqs) { - kmem_cache_free_bulk(req_cachep, submit_state->free_reqs, - submit_state->reqs); - submit_state->free_reqs = 0; + if (state->free_reqs) { + kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); + state->free_reqs = 0; } - io_flush_cached_locked_reqs(ctx, cs); - io_req_cache_free(&cs->free_list); + io_flush_cached_locked_reqs(ctx, state); + io_req_cache_free(&state->free_list); mutex_unlock(&ctx->uring_lock); } -- cgit From 
90f67366cb8871951399fb5bcf182e902b896615 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:12 +0100 Subject: io_uring: remove extra argument for overflow flush Unlike __io_cqring_overflow_flush(), nobody does forced flushing with io_cqring_overflow_flush(), so removed the argument from it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7594f869ca41b7cfb5a35a3c7c2d402242834e9e.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index a927e2173d98..e2718ce6dfbc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1523,7 +1523,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) return all_flushed; } -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) { bool ret = true; @@ -1531,7 +1531,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); - ret = __io_cqring_overflow_flush(ctx, force); + ret = __io_cqring_overflow_flush(ctx, false); if (ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&ctx->uring_lock); } @@ -7058,7 +7058,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, int ret; do { - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx); if (io_cqring_events(ctx) >= min_events) return 0; if (!io_run_task_work()) @@ -7096,7 +7096,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, trace_io_uring_cqring_wait(ctx, min_events); do { /* if we can't even flush overflow, don't wait for more */ - if (!io_cqring_overflow_flush(ctx, false)) { + if (!io_cqring_overflow_flush(ctx)) { ret = -EBUSY; break; } @@ -9365,7 +9365,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx); if (unlikely(ctx->sq_data->thread == NULL)) { ret = -EOWNERDEAD; -- cgit From 5d70904367b45b74dab9da5c023b6629f511e48f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:13 +0100 Subject: io_uring: inline io_poll_remove_waitqs Inline io_poll_remove_waitqs() into its only user and clean it up. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2f1a91a19ffcd591531dc4c61e2f11c64a2d6a6d.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index e2718ce6dfbc..a8cef973a601 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1059,7 +1059,6 @@ static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); static void io_submit_flush_completions(struct io_ring_ctx *ctx); -static bool io_poll_remove_waitqs(struct io_kiocb *req); static int io_req_prep_async(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -5251,34 +5250,24 @@ static bool __io_poll_remove_one(struct io_kiocb *req, return do_complete; } -static bool io_poll_remove_waitqs(struct io_kiocb *req) +static bool io_poll_remove_one(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { + int refs; bool do_complete; io_poll_remove_double(req); do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); - if (req->opcode != IORING_OP_POLL_ADD && do_complete) { - /* non-poll requests have submit ref still */ - req_ref_put(req); - } - return do_complete; -} - -static bool io_poll_remove_one(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - bool do_complete; - - do_complete = io_poll_remove_waitqs(req); if (do_complete) { io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); io_commit_cqring(req->ctx); req_set_fail(req); - io_put_req_deferred(req, 1); - } + /* non-poll requests have submit ref still */ + refs = 1 + (req->opcode != IORING_OP_POLL_ADD); + io_put_req_deferred(req, refs); + } return do_complete; } -- cgit From 6294f3686b4d77771ab8b161304ada546e71d36a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 10 Aug 2021 17:53:55 +0100 Subject: io_uring: clean up tctx_task_work() After recent fixes, tctx_task_work() always does proper spinlocking before looking into ->task_list, so now we don't need atomics for ->task_state, replace it with non-atomic task_running using the critical section. Tide it up, combine two separate block with spinlocking, and always try to splice in there, so we do less locking when new requests are arriving during the function execution. 
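The ->task_state/->task_running conversion above rests on a simple rule: once every reader and writer of a flag already serialises on the same lock, the flag no longer needs to be atomic, and the "do I need to schedule a run?" decision can be folded into the enqueue under that lock. A rough userspace sketch of the same structure, with made-up names and a pthread mutex in place of the tctx spinlock:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct work {
    struct work *next;
    void (*fn)(struct work *);
};

struct worklist {
    pthread_mutex_t lock;
    struct work *head, **tail;
    bool running;               /* plain bool: only touched under ->lock */
};

static void worklist_init(struct worklist *wl)
{
    pthread_mutex_init(&wl->lock, NULL);
    wl->head = NULL;
    wl->tail = &wl->head;
    wl->running = false;
}

/* Enqueue; returns true if the caller must arrange for worklist_run(). */
static bool worklist_add(struct worklist *wl, struct work *w)
{
    bool need_run;

    pthread_mutex_lock(&wl->lock);
    w->next = NULL;
    *wl->tail = w;
    wl->tail = &w->next;
    need_run = !wl->running;
    wl->running = true;
    pthread_mutex_unlock(&wl->lock);
    return need_run;
}

/* Drain: splice the list out, run it, and only clear 'running' once the
 * list has been seen empty under the lock. */
static void worklist_run(struct worklist *wl)
{
    for (;;) {
        struct work *node;

        pthread_mutex_lock(&wl->lock);
        node = wl->head;
        wl->head = NULL;
        wl->tail = &wl->head;
        if (!node)
            wl->running = false;
        pthread_mutex_unlock(&wl->lock);

        if (!node)
            break;
        do {
            struct work *next = node->next;

            node->fn(node);
            node = next;
        } while (node);
    }
}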
Signed-off-by: Pavel Begunkov [axboe: fix missing ->task_running reset on task_work_add() failure] Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index a8cef973a601..60313502a234 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -476,8 +476,8 @@ struct io_uring_task { spinlock_t task_lock; struct io_wq_work_list task_list; - unsigned long task_state; struct callback_head task_work; + bool task_running; }; /* @@ -1960,9 +1960,13 @@ static void tctx_task_work(struct callback_head *cb) spin_lock_irq(&tctx->task_lock); node = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); + if (!node) + tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); + if (!node) + break; - while (node) { + do { struct io_wq_work_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1974,19 +1978,8 @@ static void tctx_task_work(struct callback_head *cb) } req->io_task_work.func(req); node = next; - } - if (wq_list_empty(&tctx->task_list)) { - spin_lock_irq(&tctx->task_lock); - clear_bit(0, &tctx->task_state); - if (wq_list_empty(&tctx->task_list)) { - spin_unlock_irq(&tctx->task_lock); - break; - } - spin_unlock_irq(&tctx->task_lock); - /* another tctx_task_work() is enqueued, yield */ - if (test_and_set_bit(0, &tctx->task_state)) - break; - } + } while (node); + cond_resched(); } @@ -2000,16 +1993,19 @@ static void io_req_task_work_add(struct io_kiocb *req) enum task_work_notify_mode notify; struct io_wq_work_node *node; unsigned long flags; + bool running; WARN_ON_ONCE(!tctx); spin_lock_irqsave(&tctx->task_lock, flags); wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); + running = tctx->task_running; + if (!running) + tctx->task_running = true; spin_unlock_irqrestore(&tctx->task_lock, flags); /* task_work already pending, we're done */ - if (test_bit(0, &tctx->task_state) || - test_and_set_bit(0, &tctx->task_state)) + if (running) return; /* @@ -2024,8 +2020,8 @@ static void io_req_task_work_add(struct io_kiocb *req) return; } - clear_bit(0, &tctx->task_state); spin_lock_irqsave(&tctx->task_lock, flags); + tctx->task_running = false; node = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); -- cgit From 62906e89e63ba497105c0e3558089a10365f4f33 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 10 Aug 2021 14:52:47 +0100 Subject: io_uring: remove file batch-get optimisation For requests with non-fixed files, instead of grabbing just one reference, we get by the number of left requests, so the following requests using the same file can take it without atomics. However, it's not all win. If there is one request in the middle not using files or having a fixed file, we'll need to put back the left references. Even worse if an application submits requests dealing with different files, it will do a put for each new request, so doubling the number of atomics needed. Also, even if not used, it's still takes some cycles in the submission path. If a file used many times, it rather makes sense to pre-register it, if not, we may fall in the described pitfall. So, this optimisation is a matter of use case. Go with the simpliest code-wise way, remove it. 
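For example, with two unregistered files used alternately, every submission after the first has to drop its unused cached references (one fput_many()) and grab a fresh batch (one fget_many()), so roughly two atomics per request on the submission side where a plain fget() costs one; the per-request fput() at completion is the same either way. That is the "doubling" referred to above, and it is why the cache only pays off when consecutive requests really do hit the same non-fixed file.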
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 ++++------------------------------------------------- 1 file changed, 4 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 60313502a234..64241e2d3dd9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -324,12 +324,6 @@ struct io_submit_state { /* inline/task_work completion list, under ->uring_lock */ struct list_head free_list; - /* - * File reference cache - */ - struct file *file; - unsigned int fd; - unsigned int file_refs; unsigned int ios_left; }; @@ -1052,7 +1046,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); static struct file *io_file_get(struct io_ring_ctx *ctx, - struct io_submit_state *state, struct io_kiocb *req, int fd, bool fixed); static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); @@ -2583,40 +2576,6 @@ static void io_iopoll_req_issued(struct io_kiocb *req) } } -static inline void io_state_file_put(struct io_submit_state *state) -{ - if (state->file_refs) { - fput_many(state->file, state->file_refs); - state->file_refs = 0; - } -} - -/* - * Get as many references to a file as we have IOs left in this submission, - * assuming most submissions are for one file, or at least that each file - * has more than one submission. - */ -static struct file *__io_file_get(struct io_submit_state *state, int fd) -{ - if (!state) - return fget(fd); - - if (state->file_refs) { - if (state->fd == fd) { - state->file_refs--; - return state->file; - } - io_state_file_put(state); - } - state->file = fget_many(fd, state->ios_left); - if (unlikely(!state->file)) - return NULL; - - state->fd = fd; - state->file_refs = state->ios_left - 1; - return state->file; -} - static bool io_bdev_nowait(struct block_device *bdev) { return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); @@ -3618,8 +3577,7 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; - sp->file_in = io_file_get(req->ctx, NULL, req, - READ_ONCE(sqe->splice_fd_in), + sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in), (sp->flags & SPLICE_F_FD_IN_FIXED)); if (!sp->file_in) return -EBADF; @@ -6356,10 +6314,9 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, } static struct file *io_file_get_normal(struct io_ring_ctx *ctx, - struct io_submit_state *state, struct io_kiocb *req, int fd) { - struct file *file = __io_file_get(state, fd); + struct file *file = fget(fd); trace_io_uring_file_get(ctx, fd); @@ -6370,13 +6327,12 @@ static struct file *io_file_get_normal(struct io_ring_ctx *ctx, } static inline struct file *io_file_get(struct io_ring_ctx *ctx, - struct io_submit_state *state, struct io_kiocb *req, int fd, bool fixed) { if (fixed) return io_file_get_fixed(ctx, req, fd); else - return io_file_get_normal(ctx, state, req, fd); + return io_file_get_normal(ctx, req, fd); } static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) @@ -6589,7 +6545,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } if (io_op_defs[req->opcode].needs_file) { - req->file = io_file_get(ctx, state, req, READ_ONCE(sqe->fd), + req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), (sqe_flags & IOSQE_FIXED_FILE)); if (unlikely(!req->file)) ret = -EBADF; @@ -6674,7 +6630,6 @@ static void io_submit_state_end(struct io_submit_state *state, 
io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); - io_state_file_put(state); } /* -- cgit From 89850fce16a1a75caacca77cfa0c829aeea4f886 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Aug 2021 15:11:51 -0600 Subject: io_uring: run timeouts from task_work This is in preparation to making the completion lock work outside of hard/soft IRQ context. Add a timeout_lock to handle the ordering of timeout completions or cancelations with the timeouts actually triggering. Signed-off-by: Jens Axboe --- fs/io_uring.c | 54 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 64241e2d3dd9..b8d09565d3ba 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -409,6 +409,8 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; + spinlock_t timeout_lock; + /* * ->iopoll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. @@ -1188,6 +1190,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); + spin_lock_init(&ctx->timeout_lock); INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -1328,6 +1331,7 @@ static void io_queue_async_work(struct io_kiocb *req) static void io_kill_timeout(struct io_kiocb *req, int status) __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; @@ -1355,9 +1359,12 @@ static void io_queue_deferred(struct io_ring_ctx *ctx) } static void io_flush_timeouts(struct io_ring_ctx *ctx) + __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + unsigned long flags; + spin_lock_irqsave(&ctx->timeout_lock, flags); while (!list_empty(&ctx->timeout_list)) { u32 events_needed, events_got; struct io_kiocb *req = list_first_entry(&ctx->timeout_list, @@ -1382,6 +1389,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; + spin_unlock_irqrestore(&ctx->timeout_lock, flags); } static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -5455,6 +5463,20 @@ err: return 0; } +static void io_req_task_timeout(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + req_set_fail(req); + io_put_req(req); +} + static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -5463,24 +5485,20 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&req->timeout.list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); + spin_unlock_irqrestore(&ctx->timeout_lock, flags); - io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); - io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - - io_cqring_ev_posted(ctx); - req_set_fail(req); - io_put_req(req); + req->io_task_work.func = io_req_task_timeout; + io_req_task_work_add(req); return HRTIMER_NORESTART; } 
static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, __u64 user_data) - __must_hold(&ctx->completion_lock) + __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; struct io_kiocb *req; @@ -5502,7 +5520,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, } static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) - __must_hold(&ctx->completion_lock) + __must_hold(&ctx->timeout_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); @@ -5517,7 +5535,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->completion_lock) + __must_hold(&ctx->timeout_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); struct io_timeout_data *data; @@ -5576,13 +5594,15 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; int ret; - spin_lock_irq(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) ret = io_timeout_cancel(ctx, tr->addr); else ret = io_timeout_update(ctx, tr->addr, &tr->ts, io_translate_timeout_mode(tr->flags)); + spin_unlock_irq(&ctx->timeout_lock); + spin_lock_irq(&ctx->completion_lock); io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); @@ -5637,7 +5657,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) struct list_head *entry; u32 tail, off = req->timeout.off; - spin_lock_irq(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this @@ -5676,7 +5696,7 @@ add: list_add(&req->timeout.list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock_irq(&ctx->timeout_lock); return 0; } @@ -5730,7 +5750,9 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, spin_lock_irqsave(&ctx->completion_lock, flags); if (ret != -ENOENT) goto done; + spin_lock(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); + spin_unlock(&ctx->timeout_lock); if (ret != -ENOENT) goto done; ret = io_poll_cancel(ctx, sqe_addr, false); @@ -5772,7 +5794,9 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) spin_lock_irq(&ctx->completion_lock); if (ret != -ENOENT) goto done; + spin_lock(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); + spin_unlock(&ctx->timeout_lock); if (ret != -ENOENT) goto done; ret = io_poll_cancel(ctx, sqe_addr, false); @@ -8801,12 +8825,14 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, int canceled = 0; spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->timeout_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { if (io_match_task(req, tsk, cancel_all)) { io_kill_timeout(req, -ECANCELED); canceled++; } } + spin_unlock(&ctx->timeout_lock); if (canceled != 0) io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); -- cgit From 89b263f6d56e683ddcf7643140271ef6e36c72b9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Aug 2021 15:14:18 -0600 Subject: io_uring: run linked timeouts from task_work This is in preparation to making the completion lock work outside of hard/soft IRQ context. 
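Both timeout patches have the same shape: the hrtimer callback no longer posts the completion from hard IRQ context; it only records the outcome (and, for linked timeouts, the target request), sets ->io_task_work.func and queues the request so the completion runs later from task context. A loose userspace analogue of that hand-off, with invented names and a mutex/condvar queue standing in for task_work (an illustration, not the kernel code):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* One deferred item: what the "timer" wants run in task context. */
struct task_work {
    struct task_work *next;
    void (*func)(struct task_work *);
};

static pthread_mutex_t tw_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t tw_cond = PTHREAD_COND_INITIALIZER;
static struct task_work *tw_list;

/* Restricted context: only record and queue, never complete here. */
static void task_work_add(struct task_work *tw)
{
    pthread_mutex_lock(&tw_lock);
    tw->next = tw_list;
    tw_list = tw;
    pthread_cond_signal(&tw_cond);
    pthread_mutex_unlock(&tw_lock);
}

static void handle_timeout(struct task_work *tw)
{
    (void)tw;
    printf("timeout completed in task context\n");
}

/* Stand-in for the timer callback firing asynchronously. */
static void *timer_fn(void *arg)
{
    sleep(1);
    task_work_add(arg);
    return NULL;
}

int main(void)
{
    struct task_work tw = { .func = handle_timeout };
    struct task_work *node;
    pthread_t t;

    pthread_create(&t, NULL, timer_fn, &tw);

    /* "Task" context: wait for queued work, then run it where sleeping
     * and ordinary locking are allowed. */
    pthread_mutex_lock(&tw_lock);
    while (!tw_list)
        pthread_cond_wait(&tw_cond, &tw_lock);
    node = tw_list;
    tw_list = NULL;
    pthread_mutex_unlock(&tw_lock);

    while (node) {
        struct task_work *next = node->next;

        node->func(node);
        node = next;
    }
    pthread_join(t, NULL);
    return 0;
}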
Signed-off-by: Jens Axboe --- fs/io_uring.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index b8d09565d3ba..8a1c461559ac 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -538,6 +538,8 @@ struct io_timeout { struct list_head list; /* head of the link, used by linked timeouts only */ struct io_kiocb *head; + /* for linked completions */ + struct io_kiocb *prev; }; struct io_timeout_rem { @@ -1848,6 +1850,7 @@ static inline void io_remove_next_linked(struct io_kiocb *req) static bool io_kill_linked_timeout(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) { struct io_kiocb *link = req->link; @@ -1892,8 +1895,13 @@ static bool io_disarm_next(struct io_kiocb *req) { bool posted = false; - if (likely(req->flags & REQ_F_LINK_TIMEOUT)) + if (likely(req->flags & REQ_F_LINK_TIMEOUT)) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->timeout_lock); posted = io_kill_linked_timeout(req); + spin_unlock_irq(&ctx->timeout_lock); + } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) { posted |= (req->link != NULL); @@ -6359,6 +6367,20 @@ static inline struct file *io_file_get(struct io_ring_ctx *ctx, return io_file_get_normal(ctx, req, fd); } +static void io_req_task_link_timeout(struct io_kiocb *req) +{ + struct io_kiocb *prev = req->timeout.prev; + struct io_ring_ctx *ctx = req->ctx; + + if (prev) { + io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); + io_put_req(prev); + io_put_req(req); + } else { + io_req_complete_post(req, -ETIME, 0); + } +} + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -6367,7 +6389,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock_irqsave(&ctx->timeout_lock, flags); prev = req->timeout.head; req->timeout.head = NULL; @@ -6380,15 +6402,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!req_ref_inc_not_zero(prev)) prev = NULL; } - spin_unlock_irqrestore(&ctx->completion_lock, flags); + req->timeout.prev = prev; + spin_unlock_irqrestore(&ctx->timeout_lock, flags); - if (prev) { - io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); - io_put_req_deferred(prev, 1); - io_put_req_deferred(req, 1); - } else { - io_req_complete_post(req, -ETIME, 0); - } + req->io_task_work.func = io_req_task_link_timeout; + io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -6396,7 +6414,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -6408,7 +6426,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req) hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } -- cgit From 8ef12efe26c8e44323011e57753b8c0e87af1582 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Aug 2021 15:15:25 -0600 Subject: io_uring: run regular file completions from task_work This is in preparation to making the completion lock work outside of hard/soft IRQ 
context. Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a1c461559ac..6477b0d52c19 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2482,31 +2482,48 @@ static bool io_rw_should_reissue(struct io_kiocb *req) } #endif -static void __io_complete_rw(struct io_kiocb *req, long res, long res2, - unsigned int issue_flags) +static bool __io_complete_rw_common(struct io_kiocb *req, long res) { - int cflags = 0; - if (req->rw.kiocb.ki_flags & IOCB_WRITE) kiocb_end_write(req); if (res != req->result) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; - return; + return true; } req_set_fail(req); + req->result = res; } + return false; +} + +static void io_req_task_complete(struct io_kiocb *req) +{ + int cflags = 0; + if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); - __io_req_complete(req, issue_flags, res, cflags); + __io_req_complete(req, 0, req->result, cflags); +} + +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + unsigned int issue_flags) +{ + if (__io_complete_rw_common(req, res)) + return; + io_req_task_complete(req); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - __io_complete_rw(req, res, res2, 0); + if (__io_complete_rw_common(req, res)) + return; + req->result = res; + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) -- cgit From 79ebeaee8a21a00417d89f1a02019f79840d9bad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Aug 2021 15:18:27 -0600 Subject: io_uring: remove IRQ aspect of io_ring_ctx completion lock We have no hard/soft IRQ users of this lock left, remove any IRQ disabling/saving and restoring when grabbing this lock. This is straight forward with no users entering with IRQs disabled anymore, the only thing to look out for is the waitqueue poll head lock which nests inside the completion lock. That needs IRQs disabled, and hence we have to do that now instead of relying on the outer lock doing so. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 154 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 74 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6477b0d52c19..52cad59c0739 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1292,10 +1292,10 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); @@ -1364,9 +1364,8 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + spin_lock_irq(&ctx->timeout_lock); while (!list_empty(&ctx->timeout_list)) { u32 events_needed, events_got; struct io_kiocb *req = list_first_entry(&ctx->timeout_list, @@ -1391,7 +1390,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + spin_unlock_irq(&ctx->timeout_lock); } static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -1484,14 +1483,13 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { - unsigned long flags; bool all_flushed, posted; if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false; posted = false; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe = io_get_cqe(ctx); struct io_overflow_cqe *ocqe; @@ -1519,7 +1517,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (posted) io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); return all_flushed; @@ -1648,9 +1646,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res, unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); __io_cqring_fill_event(ctx, req->user_data, res, cflags); /* * If we're the last reference to this request, add to our locked @@ -1674,7 +1671,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, req = NULL; } io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); if (req) { io_cqring_ev_posted(ctx); @@ -1734,10 +1731,10 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, struct io_submit_state *state) { - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); list_splice_init(&ctx->locked_free_list, &state->free_list); ctx->locked_free_nr = 0; - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } /* Returns true IFF there are requests in the cache */ @@ -1827,15 +1824,14 @@ static void io_dismantle_req(struct io_kiocb *req) static void __io_free_req(struct io_kiocb *req) { 
struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; io_dismantle_req(req); io_put_task(req->task, 1); - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); list_add(&req->inflight_entry, &ctx->locked_free_list); ctx->locked_free_nr++; - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); percpu_ref_put(&ctx->refs); } @@ -1922,14 +1918,13 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) */ if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; bool posted; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); posted = io_disarm_next(req); if (posted) io_commit_cqring(req->ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); } @@ -2152,7 +2147,7 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) int i, nr = state->compl_nr; struct req_batch rb; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); for (i = 0; i < nr; i++) { struct io_kiocb *req = state->compl_reqs[i]; @@ -2160,7 +2155,7 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) req->compl.cflags); } io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); io_init_req_batch(&rb); @@ -4865,7 +4860,7 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) req->result = vfs_poll(req->file, &pt) & poll->events; } - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (!req->result && !READ_ONCE(poll->canceled)) { add_wait_queue(poll->head, &poll->wait); return true; @@ -4899,12 +4894,12 @@ static void io_poll_remove_double(struct io_kiocb *req) if (poll && poll->head) { struct wait_queue_head *head = poll->head; - spin_lock(&head->lock); + spin_lock_irq(&head->lock); list_del_init(&poll->wait.entry); if (poll->wait.private) req_ref_put(req); poll->head = NULL; - spin_unlock(&head->lock); + spin_unlock_irq(&head->lock); } } @@ -4940,7 +4935,7 @@ static void io_poll_task_func(struct io_kiocb *req) struct io_kiocb *nxt; if (io_poll_rewait(req, &req->poll)) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } else { bool done; @@ -4952,7 +4947,7 @@ static void io_poll_task_func(struct io_kiocb *req) req->result = 0; add_wait_queue(req->poll.head, &req->poll.wait); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); if (done) { @@ -4969,6 +4964,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, struct io_kiocb *req = wait->private; struct io_poll_iocb *poll = io_poll_get_single(req); __poll_t mask = key_to_poll(key); + unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) @@ -4981,13 +4977,13 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, if (poll->head) { bool done; - spin_lock(&poll->head->lock); + spin_lock_irqsave(&poll->head->lock, flags); done = list_empty(&poll->wait.entry); if (!done) list_del_init(&poll->wait.entry); /* make sure double remove sees this as being gone */ wait->private = NULL; - spin_unlock(&poll->head->lock); + spin_unlock_irqrestore(&poll->head->lock, flags); if (!done) { /* use wait func handler, so it matches the rq type */ poll->wait.func(&poll->wait, 
mode, sync, key); @@ -5075,13 +5071,13 @@ static void io_async_task_func(struct io_kiocb *req) trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); if (io_poll_rewait(req, &apoll->poll)) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); return; } hash_del(&req->hash_node); io_poll_remove_double(req); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) io_req_task_submit(req); @@ -5133,11 +5129,11 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, if (unlikely(!ipt->nr_entries) && !ipt->error) ipt->error = -EINVAL; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (ipt->error || (mask && (poll->events & EPOLLONESHOT))) io_poll_remove_double(req); if (likely(poll->head)) { - spin_lock(&poll->head->lock); + spin_lock_irq(&poll->head->lock); if (unlikely(list_empty(&poll->wait.entry))) { if (ipt->error) cancel = true; @@ -5150,7 +5146,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ io_poll_req_insert(req); - spin_unlock(&poll->head->lock); + spin_unlock_irq(&poll->head->lock); } return mask; @@ -5206,12 +5202,12 @@ static int io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); if (ret || ipt.error) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (ret) return IO_APOLL_READY; return IO_APOLL_ABORTED; } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, mask, apoll->poll.events); return IO_APOLL_OK; @@ -5225,14 +5221,14 @@ static bool __io_poll_remove_one(struct io_kiocb *req, if (!poll->head) return false; - spin_lock(&poll->head->lock); + spin_lock_irq(&poll->head->lock); if (do_cancel) WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); do_complete = true; } - spin_unlock(&poll->head->lock); + spin_unlock_irq(&poll->head->lock); hash_del(&req->hash_node); return do_complete; } @@ -5268,7 +5264,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_kiocb *req; int posted = 0, i; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct hlist_head *list; @@ -5278,7 +5274,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, posted += io_poll_remove_one(req); } } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -5416,7 +5412,7 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.error = 0; io_poll_complete(req, mask); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (mask) { io_cqring_ev_posted(ctx); @@ -5433,7 +5429,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) bool completing; int ret; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); preq = io_poll_find(ctx, req->poll_update.old_user_data, true); if (!preq) { ret = -ENOENT; @@ -5460,7 +5456,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) ret = 0; err: if (ret < 0) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); req_set_fail(req); 
io_req_complete(req, ret); return 0; @@ -5473,7 +5469,7 @@ err: } if (req->poll_update.update_user_data) preq->user_data = req->poll_update.new_user_data; - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); /* complete update request, we're done with it */ io_req_complete(req, ret); @@ -5492,10 +5488,10 @@ static void io_req_task_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); req_set_fail(req); @@ -5627,10 +5623,10 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) io_translate_timeout_mode(tr->flags)); spin_unlock_irq(&ctx->timeout_lock); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); if (ret < 0) req_set_fail(req); @@ -5768,16 +5764,15 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req, __u64 sqe_addr, int success_ret) { - unsigned long flags; int ret; ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); if (ret != -ENOENT) goto done; - spin_lock(&ctx->timeout_lock); + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); - spin_unlock(&ctx->timeout_lock); + spin_unlock_irq(&ctx->timeout_lock); if (ret != -ENOENT) goto done; ret = io_poll_cancel(ctx, sqe_addr, false); @@ -5786,7 +5781,7 @@ done: ret = success_ret; io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); if (ret < 0) @@ -5816,18 +5811,18 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) /* tasks should wait for their io-wq threads, so safe w/o sync */ ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (ret != -ENOENT) goto done; - spin_lock(&ctx->timeout_lock); + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); - spin_unlock(&ctx->timeout_lock); + spin_unlock_irq(&ctx->timeout_lock); if (ret != -ENOENT) goto done; ret = io_poll_cancel(ctx, sqe_addr, false); if (ret != -ENOENT) goto done; - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); @@ -5841,11 +5836,11 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) } io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); done: io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); if (ret < 0) @@ -6061,9 +6056,9 @@ fail: return true; } - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); 
kfree(de); io_queue_async_work(req); return true; @@ -6073,7 +6068,7 @@ fail: de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); return true; } @@ -6813,18 +6808,18 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd) static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) { /* Tell userspace we may need a wakeup call */ - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_flags, ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) { - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_flags, ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) @@ -7671,11 +7666,11 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; io_ring_submit_lock(ctx, lock_ring); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); io_cqring_fill_event(ctx, prsrc->tag, 0, 0); ctx->cq_extra++; io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); io_ring_submit_unlock(ctx, lock_ring); } @@ -8846,8 +8841,8 @@ static void io_ring_exit_work(struct work_struct *work) mutex_lock(&ctx->uring_lock); } mutex_unlock(&ctx->uring_lock); - spin_lock_irq(&ctx->completion_lock); - spin_unlock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_ring_ctx_free(ctx); } @@ -8859,18 +8854,18 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_kiocb *req, *tmp; int canceled = 0; - spin_lock_irq(&ctx->completion_lock); - spin_lock(&ctx->timeout_lock); + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { if (io_match_task(req, tsk, cancel_all)) { io_kill_timeout(req, -ECANCELED); canceled++; } } - spin_unlock(&ctx->timeout_lock); + spin_unlock_irq(&ctx->timeout_lock); if (canceled != 0) io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (canceled != 0) io_cqring_ev_posted(ctx); return canceled != 0; @@ -8926,13 +8921,12 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) bool ret; if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) { - unsigned long flags; struct io_ring_ctx *ctx = req->ctx; /* protect against races with linked timeouts */ - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); ret = io_match_task(req, cancel->task, cancel->all); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); } else { ret = io_match_task(req, cancel->task, cancel->all); } @@ -8945,14 +8939,14 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct io_defer_entry *de; LIST_HEAD(list); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { if (io_match_task(de->req, task, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } - 
spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; @@ -9502,7 +9496,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) io_uring_show_cred(m, index, cred); } seq_printf(m, "PollList:\n"); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct hlist_head *list = &ctx->cancel_hash[i]; struct io_kiocb *req; @@ -9511,7 +9505,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, req->task->task_works != NULL); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (has_lock) mutex_unlock(&ctx->uring_lock); } -- cgit From 21c843d5825b949332fe58495007ca531ef6ae91 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Aug 2021 19:28:27 +0100 Subject: io_uring: move req_ref_get() and friends Move all request refcount helpers to avoid forward declarations in the future. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/89fd36f6f3fe5b733dfe4546c24725eee40df605.1628705069.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 70 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 52cad59c0739..1b79f6e2da2e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1078,6 +1078,41 @@ EXPORT_SYMBOL(io_uring_get_socket); #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) +/* + * Shamelessly stolen from the mm implementation of page reference checking, + * see commit f958d7b528b1 for details. + */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct io_kiocb *req) +{ + return atomic_inc_not_zero(&req->refs); +} + +static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_sub_and_test(refs, &req->refs); +} + +static inline bool req_ref_put_and_test(struct io_kiocb *req) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + +static inline void req_ref_put(struct io_kiocb *req) +{ + WARN_ON_ONCE(req_ref_put_and_test(req)); +} + +static inline void req_ref_get(struct io_kiocb *req) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_inc(&req->refs); +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1539,41 +1574,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) return ret; } -/* - * Shamelessly stolen from the mm implementation of page reference checking, - * see commit f958d7b528b1 for details. 
- */ -#define req_ref_zero_or_close_to_overflow(req) \ - ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) - -static inline bool req_ref_inc_not_zero(struct io_kiocb *req) -{ - return atomic_inc_not_zero(&req->refs); -} - -static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs) -{ - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_sub_and_test(refs, &req->refs); -} - -static inline bool req_ref_put_and_test(struct io_kiocb *req) -{ - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_dec_and_test(&req->refs); -} - -static inline void req_ref_put(struct io_kiocb *req) -{ - WARN_ON_ONCE(req_ref_put_and_test(req)); -} - -static inline void req_ref_get(struct io_kiocb *req) -{ - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - atomic_inc(&req->refs); -} - /* must to be called somewhat shortly after putting a request */ static inline void io_put_task(struct task_struct *task, int nr) { -- cgit From 91c2f6978311afe1f49094fdd90fd6ab29b66223 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Aug 2021 19:28:28 +0100 Subject: io_uring: remove req_ref_sub_and_test() Soon, we won't need to put several references at once, remove req_ref_sub_and_test() and @nr argument from io_put_req_deferred(), and put the rest of the references by hand. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1868c7554108bff9194fb5757e77be23fadf7fc0.1628705069.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1b79f6e2da2e..9d1c18865ef4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1041,7 +1041,7 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, long res, unsigned int cflags); static void io_put_req(struct io_kiocb *req); -static void io_put_req_deferred(struct io_kiocb *req, int nr); +static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); @@ -1090,12 +1090,6 @@ static inline bool req_ref_inc_not_zero(struct io_kiocb *req) return atomic_inc_not_zero(&req->refs); } -static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs) -{ - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_sub_and_test(refs, &req->refs); -} - static inline bool req_ref_put_and_test(struct io_kiocb *req) { WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); @@ -1377,7 +1371,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); io_cqring_fill_event(req->ctx, req->user_data, status, 0); - io_put_req_deferred(req, 1); + io_put_req_deferred(req); } } @@ -1862,7 +1856,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) if (hrtimer_try_to_cancel(&io->timer) != -1) { io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); - io_put_req_deferred(link, 1); + io_put_req_deferred(link); return true; } } @@ -1881,7 +1875,9 @@ static void io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); - io_put_req_deferred(link, 2); + + io_put_req(link); + io_put_req_deferred(link); link = nxt; } } @@ -2163,7 
+2159,8 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = state->compl_reqs[i]; /* submission and completion refs */ - if (req_ref_sub_and_test(req, 2)) + io_put_req(req); + if (req_ref_put_and_test(req)) io_req_free_batch(&rb, req, &ctx->submit_state); } @@ -2192,9 +2189,9 @@ static inline void io_put_req(struct io_kiocb *req) io_free_req(req); } -static inline void io_put_req_deferred(struct io_kiocb *req, int refs) +static inline void io_put_req_deferred(struct io_kiocb *req) { - if (req_ref_sub_and_test(req, refs)) { + if (req_ref_put_and_test(req)) { req->io_task_work.func = io_free_req; io_req_task_work_add(req); } @@ -5236,7 +5233,6 @@ static bool __io_poll_remove_one(struct io_kiocb *req, static bool io_poll_remove_one(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { - int refs; bool do_complete; io_poll_remove_double(req); @@ -5248,8 +5244,9 @@ static bool io_poll_remove_one(struct io_kiocb *req) req_set_fail(req); /* non-poll requests have submit ref still */ - refs = 1 + (req->opcode != IORING_OP_POLL_ADD); - io_put_req_deferred(req, refs); + if (req->opcode != IORING_OP_POLL_ADD) + io_put_req(req); + io_put_req_deferred(req); } return do_complete; } @@ -5550,7 +5547,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) req_set_fail(req); io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); - io_put_req_deferred(req, 1); + io_put_req_deferred(req); return 0; } -- cgit From 5d5901a3434064e98c1dbb3047b9f9793825ea42 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Aug 2021 19:28:29 +0100 Subject: io_uring: remove submission references Requests are by default given with two references, submission and completion. Completion references are straightforward, they represent request ownership and are put when a request is completed or so. Submission references are a bit more trickier. They're needed when io_issue_sqe() followed deep into the submission stack (e.g. in fs, block, drivers, etc.), request may have given away for concurrent execution or already completed, and the code unwinding back to io_issue_sqe() may be accessing some pieces of our requests, e.g. file or iov. Now, we prevent such async/in-depth completions by pushing requests through task_work. Punting to io-wq is also done through task_works, apart from a couple of cases with a pretty well known context. So, there're two cases: 1) io_issue_sqe() from the task context and protected by ->uring_lock. Either requests return back to io_uring or handed to task_work, which won't be executed because we're currently controlling that task. So, we can be sure that requests are staying alive all the time and we don't need submission references to pin them. 2) io_issue_sqe() from io-wq, which doesn't hold the mutex. The role of submission reference is played by io-wq reference, which is put by io_wq_submit_work(). Hence, it should be fine. Considering that, we can carefully kill the submission reference. 
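
To make those two cases concrete, here is a small userspace model of the resulting scheme (illustrative only; the names and printfs are made up and this is not the kernel code): the inline path relies on single-owner execution and never needs a submission reference, while the io-wq path compensates by taking its own reference up front and dropping it when the work item is handed back.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct req { atomic_int refs; };

static bool req_put_and_test(struct req *r)
{
    return atomic_fetch_sub(&r->refs, 1) == 1;
}

/* case 1: issued inline under the ring lock, the task owns the request */
static void issue_inline(struct req *r)
{
    /* ...issue; nothing can free the request underneath us... */
    if (req_put_and_test(r))
        printf("freed after inline completion\n");
}

/* case 2: issued from io-wq, the worker pins the request itself */
static void issue_from_iowq(struct req *r)
{
    atomic_fetch_add(&r->refs, 1);   /* worker reference, taken up front */
    req_put_and_test(r);             /* request completes while running */
    if (req_put_and_test(r))         /* worker drops its ref on the way out */
        printf("freed when io-wq hands the work item back\n");
}

int main(void)
{
    struct req a = { 1 }, b = { 1 };   /* a single reference at init now */

    issue_inline(&a);
    issue_from_iowq(&b);
    return 0;
}
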
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6b68f1c763229a590f2a27148aee77767a8d7750.1628705069.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9d1c18865ef4..6ad6c56e7f4d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1705,7 +1705,6 @@ static inline void io_req_complete(struct io_kiocb *req, long res) static void io_req_complete_failed(struct io_kiocb *req, long res) { req_set_fail(req); - io_put_req(req); io_req_complete_post(req, res, 0); } @@ -1760,7 +1759,14 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) return nr != 0; } +/* + * A request might get retired back into the request caches even before opcode + * handlers and io_issue_sqe() are done with it, e.g. inline completion path. + * Because of that, io_alloc_req() should be called only under ->uring_lock + * and with extra caution to not get a request that is still worked on. + */ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; @@ -1875,8 +1881,6 @@ static void io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); - - io_put_req(link); io_put_req_deferred(link); link = nxt; } @@ -2158,8 +2162,6 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) for (i = 0; i < nr; i++) { struct io_kiocb *req = state->compl_reqs[i]; - /* submission and completion refs */ - io_put_req(req); if (req_ref_put_and_test(req)) io_req_free_batch(&rb, req, &ctx->submit_state); } @@ -2264,7 +2266,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, if (READ_ONCE(req->result) == -EAGAIN && resubmit && !(req->flags & REQ_F_DONT_REISSUE)) { req->iopoll_completed = 0; - req_ref_get(req); io_req_task_queue_reissue(req); continue; } @@ -2762,7 +2763,6 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, if (check_reissue && (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; if (io_resubmit_prep(req)) { - req_ref_get(req); io_req_task_queue_reissue(req); } else { int cflags = 0; @@ -3188,9 +3188,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; list_del_init(&wait->entry); - - /* submit ref gets dropped, acquire a new one */ - req_ref_get(req); io_req_task_queue(req); return 1; } @@ -5242,10 +5239,6 @@ static bool io_poll_remove_one(struct io_kiocb *req) io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); io_commit_cqring(req->ctx); req_set_fail(req); - - /* non-poll requests have submit ref still */ - if (req->opcode != IORING_OP_POLL_ADD) - io_put_req(req); io_put_req_deferred(req); } return do_complete; @@ -6280,6 +6273,9 @@ static void io_wq_submit_work(struct io_wq_work *work) struct io_kiocb *timeout; int ret = 0; + /* will be dropped by ->io_free_work() after returning to io-wq */ + req_ref_get(req); + timeout = io_prep_linked_timeout(req); if (timeout) io_queue_linked_timeout(timeout); @@ -6302,11 +6298,8 @@ static void io_wq_submit_work(struct io_wq_work *work) } /* avoid locking problems by failing it from a clean context */ - if (ret) { - /* io-wq is going to take one down */ - req_ref_get(req); + if (ret) io_req_task_queue_fail(req, ret); - } } static inline struct 
io_fixed_file *io_fixed_file_slot(struct io_file_table *table, @@ -6448,6 +6441,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; + /* linked timeouts should have two refs once prep'ed */ + req_ref_get(nxt); nxt->timeout.head = req; nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; req->flags |= REQ_F_LINK_TIMEOUT; @@ -6468,7 +6463,6 @@ issue_sqe: * doesn't support non-blocking read/write attempts */ if (likely(!ret)) { - /* drop submission reference */ if (req->flags & REQ_F_COMPLETE_INLINE) { struct io_ring_ctx *ctx = req->ctx; struct io_submit_state *state = &ctx->submit_state; @@ -6476,8 +6470,6 @@ issue_sqe: state->compl_reqs[state->compl_nr++] = req; if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) io_submit_flush_completions(ctx); - } else { - io_put_req(req); } } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { switch (io_arm_poll_handler(req)) { @@ -6557,8 +6549,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->fixed_rsrc_refs = NULL; - /* one is dropped after submission, the other at completion */ - atomic_set(&req->refs, 2); + atomic_set(&req->refs, 1); req->task = current; /* enforce forwards compatibility on users */ -- cgit From 20e60a3832089741d6b25c13d291050c5d00b4e7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Aug 2021 19:28:30 +0100 Subject: io_uring: skip request refcounting As submission references are gone, there is only one initial reference left. Instead of actually doing atomic refcounting, add a flag indicating whether we're going to take more refs or doing any other sync magic. The flag should be set before the request may get used in parallel. Together with the previous patch it saves 2 refcount atomics per request for IOPOLL and IRQ completions, and 1 atomic per req for inline completions, with some exceptions. In particular, currently, there are three cases, when the refcounting have to be enabled: - Polling, including apoll. Because double poll entries takes a ref. Might get relaxed in the near future. - Link timeouts, enabled for both, the timeout and the request it's bound to, because they work in-parallel and we need to synchronise to cancel one of them on completion. - When a request gets in io-wq, because it doesn't hold uring_lock and we need guarantees of submission references. 
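
A small userspace model of the flag (illustrative only, not the kernel code): the atomic is only touched once a request has opted in to refcounting, and opting in has to happen before the request can be observed by another context.

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define REQ_F_REFCOUNT (1u << 0)

struct req {
    unsigned int flags;
    atomic_int refs;
};

/* must run before the request is visible to poll, io-wq, etc. */
static void req_opt_in_refcount(struct req *r, int nr)
{
    if (!(r->flags & REQ_F_REFCOUNT)) {
        r->flags |= REQ_F_REFCOUNT;
        atomic_store(&r->refs, nr);
    }
}

static bool req_put_and_test(struct req *r)
{
    /* untracked requests have a single owner: every put is the last one */
    if (!(r->flags & REQ_F_REFCOUNT))
        return true;
    return atomic_fetch_sub(&r->refs, 1) == 1;
}

int main(void)
{
    struct req plain = { 0 };          /* never shared: no atomics at all */
    assert(req_put_and_test(&plain));

    struct req shared = { 0 };         /* about to be shared: opt in first */
    req_opt_in_refcount(&shared, 2);
    assert(!req_put_and_test(&shared));
    assert(req_put_and_test(&shared));
    puts("ok");
    return 0;
}
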
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8b204b6c5f6643062270a1913d6d3a7f8f795fd9.1628705069.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6ad6c56e7f4d..dae87c576943 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -710,6 +710,7 @@ enum { REQ_F_REISSUE_BIT, REQ_F_DONT_REISSUE_BIT, REQ_F_CREDS_BIT, + REQ_F_REFCOUNT_BIT, /* keep async read/write and isreg together and in order */ REQ_F_NOWAIT_READ_BIT, REQ_F_NOWAIT_WRITE_BIT, @@ -765,6 +766,8 @@ enum { REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* has creds assigned */ REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), + /* skip refcounting if not set */ + REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), }; struct async_poll { @@ -1087,26 +1090,40 @@ EXPORT_SYMBOL(io_uring_get_socket); static inline bool req_ref_inc_not_zero(struct io_kiocb *req) { + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); return atomic_inc_not_zero(&req->refs); } static inline bool req_ref_put_and_test(struct io_kiocb *req) { + if (likely(!(req->flags & REQ_F_REFCOUNT))) + return true; + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); return atomic_dec_and_test(&req->refs); } static inline void req_ref_put(struct io_kiocb *req) { + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); WARN_ON_ONCE(req_ref_put_and_test(req)); } static inline void req_ref_get(struct io_kiocb *req) { + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); atomic_inc(&req->refs); } +static inline void io_req_refcount(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_REFCOUNT)) { + req->flags |= REQ_F_REFCOUNT; + atomic_set(&req->refs, 1); + } +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -5192,6 +5209,7 @@ static int io_arm_poll_handler(struct io_kiocb *req) req->apoll = apoll; req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; + io_req_refcount(req); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); @@ -5382,6 +5400,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; + io_req_refcount(req); poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -6273,6 +6292,7 @@ static void io_wq_submit_work(struct io_wq_work *work) struct io_kiocb *timeout; int ret = 0; + io_req_refcount(req); /* will be dropped by ->io_free_work() after returning to io-wq */ req_ref_get(req); @@ -6442,7 +6462,10 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return NULL; /* linked timeouts should have two refs once prep'ed */ + io_req_refcount(req); + io_req_refcount(nxt); req_ref_get(nxt); + nxt->timeout.head = req; nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; req->flags |= REQ_F_LINK_TIMEOUT; @@ -6549,7 +6572,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->fixed_rsrc_refs = NULL; - atomic_set(&req->refs, 1); req->task = current; /* enforce forwards compatibility on users */ -- cgit From fd08e5309bba8672c1190362dff6c92bfd59218d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Aug 2021 19:28:31 +0100 Subject: io_uring: optimise hot path of ltimeout prep io_prep_linked_timeout() grew too heavy and compiler now refuse to inline the function. Help it by splitting in two and annotating with inline. 
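
The shape of the split, as a stand-alone sketch with hypothetical names: the cheap test stays in a small wrapper the compiler can always inline, and the heavy part moves to a separate function that may stay out of line.

#include <stdio.h>

struct req { int has_ltimeout; int armed; };

/* cold path: flag juggling, refcounts, timer setup, ... */
static void __prep_linked_timeout(struct req *r)
{
    r->armed = 1;
}

/* hot path: one branch, trivially inlined at every call site */
static inline void prep_linked_timeout(struct req *r)
{
    if (!r->has_ltimeout)
        return;
    __prep_linked_timeout(r);
}

int main(void)
{
    struct req plain = { 0 }, linked = { .has_ltimeout = 1 };

    prep_linked_timeout(&plain);
    prep_linked_timeout(&linked);
    printf("plain=%d linked=%d\n", plain.armed, linked.armed);
    return 0;
}
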
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/560636717a32e9513724f09b9ecaace942dde4d4.1628705069.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index dae87c576943..4f5a00707644 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1046,7 +1046,6 @@ static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); -static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, @@ -1299,6 +1298,31 @@ static void io_req_track_inflight(struct io_kiocb *req) } } +static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) +{ + struct io_kiocb *nxt = req->link; + + if (req->flags & REQ_F_LINK_TIMEOUT) + return NULL; + + /* linked timeouts should have two refs once prep'ed */ + io_req_refcount(req); + io_req_refcount(nxt); + req_ref_get(nxt); + + nxt->timeout.head = req; + nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; + req->flags |= REQ_F_LINK_TIMEOUT; + return nxt; +} + +static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) +{ + if (likely(!req->link || req->link->opcode != IORING_OP_LINK_TIMEOUT)) + return NULL; + return __io_prep_linked_timeout(req); +} + static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -6453,25 +6477,6 @@ static void io_queue_linked_timeout(struct io_kiocb *req) io_put_req(req); } -static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *nxt = req->link; - - if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) || - nxt->opcode != IORING_OP_LINK_TIMEOUT) - return NULL; - - /* linked timeouts should have two refs once prep'ed */ - io_req_refcount(req); - io_req_refcount(nxt); - req_ref_get(nxt); - - nxt->timeout.head = req; - nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; - req->flags |= REQ_F_LINK_TIMEOUT; - return nxt; -} - static void __io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { -- cgit From f552a27afe67f05c47bb0c33b92af2a23b684c31 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 12 Aug 2021 12:14:35 +0800 Subject: io_uring: remove files pointer in cancellation functions When doing cancellation, we use a parameter to indicate where it's from do_exit or exec. So a boolean value is good enough for this, remove the struct files* as it is not necessary. 
Signed-off-by: Hao Xu [axboe: fixup io_uring_files_cancel for !CONFIG_IO_URING] Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- include/linux/io_uring.h | 10 +++++----- kernel/exit.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4f5a00707644..7626cad93f60 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9213,9 +9213,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) } } -void __io_uring_cancel(struct files_struct *files) +void __io_uring_cancel(bool cancel_all) { - io_uring_cancel_generic(!files, NULL); + io_uring_cancel_generic(cancel_all, NULL); } static void *io_uring_validate_mmap_request(struct file *file, diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index ed13304e764c..649a4d7c241b 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -7,18 +7,18 @@ #if defined(CONFIG_IO_URING) struct sock *io_uring_get_socket(struct file *file); -void __io_uring_cancel(struct files_struct *files); +void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); -static inline void io_uring_files_cancel(struct files_struct *files) +static inline void io_uring_files_cancel(void) { if (current->io_uring) - __io_uring_cancel(files); + __io_uring_cancel(false); } static inline void io_uring_task_cancel(void) { if (current->io_uring) - __io_uring_cancel(NULL); + __io_uring_cancel(true); } static inline void io_uring_free(struct task_struct *tsk) { @@ -33,7 +33,7 @@ static inline struct sock *io_uring_get_socket(struct file *file) static inline void io_uring_task_cancel(void) { } -static inline void io_uring_files_cancel(struct files_struct *files) +static inline void io_uring_files_cancel(void) { } static inline void io_uring_free(struct task_struct *tsk) diff --git a/kernel/exit.c b/kernel/exit.c index 9a89e7f36acb..91a43e57a32e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -777,7 +777,7 @@ void __noreturn do_exit(long code) schedule(); } - io_uring_files_cancel(tsk->files); + io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ /* sync mm's RSS info before statistics gathering */ -- cgit From 41a5169c23ebe85fdd0b64a0b6381f486a34ef3c Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 12 Aug 2021 15:47:02 +0800 Subject: io_uring: code clean for completion_lock in io_arm_poll_handler() We can merge two spin_unlock() operations to one since we removed some code not long ago. Signed-off-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7626cad93f60..c581c7cbb8b2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5237,13 +5237,10 @@ static int io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); - if (ret || ipt.error) { - spin_unlock(&ctx->completion_lock); - if (ret) - return IO_APOLL_READY; - return IO_APOLL_ABORTED; - } spin_unlock(&ctx->completion_lock); + if (ret || ipt.error) + return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; + trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, mask, apoll->poll.events); return IO_APOLL_OK; -- cgit From a141dd896f544df9627502cfb3fc1a73fb6587e4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Aug 2021 12:48:34 -0600 Subject: io_uring: correct __must_hold annotation io_req_free_batch() has a __must_hold annotation referencing a request being passed in, but we're passing in the context. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index c581c7cbb8b2..e1ca427e183a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2182,7 +2182,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, } static void io_submit_flush_completions(struct io_ring_ctx *ctx) - __must_hold(&req->ctx->uring_lock) + __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; int i, nr = state->compl_nr; -- cgit From 48dcd38d73c22b22bf9dc1c01b0ca0b8414b31da Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:18 +0100 Subject: io_uring: optimise iowq refcounting If a requests is forwarded into io-wq, there is a good chance it hasn't been refcounted yet and we can save one req_ref_get() by setting the refcount number to the right value directly. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2d53f4449faaf73b4a4c5de667fc3c176d974860.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index e1ca427e183a..86466e12c74d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1115,14 +1115,19 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } -static inline void io_req_refcount(struct io_kiocb *req) +static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) { if (!(req->flags & REQ_F_REFCOUNT)) { req->flags |= REQ_F_REFCOUNT; - atomic_set(&req->refs, 1); + atomic_set(&req->refs, nr); } } +static inline void io_req_set_refcount(struct io_kiocb *req) +{ + __io_req_set_refcount(req, 1); +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1306,8 +1311,8 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) return NULL; /* linked timeouts should have two refs once prep'ed */ - io_req_refcount(req); - io_req_refcount(nxt); + io_req_set_refcount(req); + io_req_set_refcount(nxt); req_ref_get(nxt); nxt->timeout.head = req; @@ -5233,7 +5238,7 @@ static int io_arm_poll_handler(struct io_kiocb *req) req->apoll = apoll; req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; - io_req_refcount(req); + io_req_set_refcount(req); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); @@ -5421,7 +5426,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; - io_req_refcount(req); + io_req_set_refcount(req); poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -6313,9 +6318,11 @@ static void io_wq_submit_work(struct io_wq_work *work) struct io_kiocb *timeout; int ret = 0; - io_req_refcount(req); - /* will be dropped by ->io_free_work() after returning to io-wq */ - req_ref_get(req); + /* one will be dropped by ->io_free_work() after returning to io-wq */ + if (!(req->flags & REQ_F_REFCOUNT)) 
+ __io_req_set_refcount(req, 2); + else + req_ref_get(req); timeout = io_prep_linked_timeout(req); if (timeout) -- cgit From 761bcac1573efc99042d59add94d468bf17127f0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:19 +0100 Subject: io_uring: don't inflight-track linked timeouts Tracking linked timeouts as infligh was needed to make sure that io-wq is not destroyed by io_uring_cancel_generic() racing with io_async_cancel_one() accessing it. Now, cancellations issued by linked timeouts are done in the task context, so it's already synchronised. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e1b05cf47cb69df2305efdbee8cf7ba36f46c1a3.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 86466e12c74d..5b95ab661f31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5701,8 +5701,6 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - if (is_timeout_link) - io_req_track_inflight(req); return 0; } -- cgit From fb6820998f57a3e63a382a322530fa28522a2bba Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:20 +0100 Subject: io_uring: optimise initial ltimeout refcounting Linked timeouts are never refcounted when it comes to the first call to __io_prep_linked_timeout(), so save an io_ref_get() and set the desired value directly. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/177b24cc62ffbb42d915d6eb9e8876266e4c0d5a.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5b95ab661f31..005fc06f89b9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1312,8 +1312,7 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) /* linked timeouts should have two refs once prep'ed */ io_req_set_refcount(req); - io_req_set_refcount(nxt); - req_ref_get(nxt); + __io_req_set_refcount(nxt, 2); nxt->timeout.head = req; nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; -- cgit From a8576af9d1b03a1b8aba7228e938ab0817fdbda6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:21 +0100 Subject: io_uring: kill not necessary resubmit switch 773af69121ecc ("io_uring: always reissue from task_work context") makes all resubmission to be made from task_work, so we don't need that hack with resubmit/not-resubmit switch anymore. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/47fa177cca04e5ffd308a35227966c8e15d8525b.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 005fc06f89b9..d6e0e7e317da 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2293,7 +2293,7 @@ static inline bool io_run_task_work(void) * Find and free completed poll iocbs */ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, - struct list_head *done, bool resubmit) + struct list_head *done) { struct req_batch rb; struct io_kiocb *req; @@ -2308,7 +2308,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); - if (READ_ONCE(req->result) == -EAGAIN && resubmit && + if (READ_ONCE(req->result) == -EAGAIN && !(req->flags & REQ_F_DONT_REISSUE)) { req->iopoll_completed = 0; io_req_task_queue_reissue(req); @@ -2331,7 +2331,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, - long min, bool resubmit) + long min) { struct io_kiocb *req, *tmp; LIST_HEAD(done); @@ -2371,7 +2371,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, } if (!list_empty(&done)) - io_iopoll_complete(ctx, nr_events, &done, resubmit); + io_iopoll_complete(ctx, nr_events, &done); return 0; } @@ -2389,7 +2389,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) while (!list_empty(&ctx->iopoll_list)) { unsigned int nr_events = 0; - io_do_iopoll(ctx, &nr_events, 0, false); + io_do_iopoll(ctx, &nr_events, 0); /* let it sleep and repeat later if can't complete a request */ if (nr_events == 0) @@ -2451,7 +2451,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, &nr_events, min, true); + ret = io_do_iopoll(ctx, &nr_events, min); } while (!ret && nr_events < min && !need_resched()); out: mutex_unlock(&ctx->uring_lock); @@ -6857,7 +6857,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) mutex_lock(&ctx->uring_lock); if (!list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, &nr_events, 0, true); + io_do_iopoll(ctx, &nr_events, 0); /* * Don't submit if refs are dying, good for io_uring_register(), -- cgit From 8cb01fac982a3f8622a46821af1eb68136f936ca Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:22 +0100 Subject: io_uring: deduplicate cancellation code IORING_OP_ASYNC_CANCEL and IORING_OP_LINK_TIMEOUT have enough of overlap, so extract a helper for request cancellation and use in both. Also, removes some amount of ugliness because of success_ret. 
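
For reference, this is the path as userspace exercises it with IORING_OP_ASYNC_CANCEL; a minimal sketch assuming liburing's io_uring_prep_timeout() and io_uring_prep_cancel() helpers, with error handling omitted. The cancel request targets its victim by user_data, which is exactly what io_try_cancel_userdata() resolves on the kernel side.

#include <liburing.h>
#include <stdio.h>

int main(void)
{
    struct io_uring ring;
    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;
    struct __kernel_timespec ts = { .tv_sec = 30 };

    io_uring_queue_init(8, &ring, 0);

    sqe = io_uring_get_sqe(&ring);          /* something long-running */
    io_uring_prep_timeout(sqe, &ts, 0, 0);
    sqe->user_data = 1;

    sqe = io_uring_get_sqe(&ring);          /* cancel it by user_data */
    io_uring_prep_cancel(sqe, (void *)1, 0);
    sqe->user_data = 2;

    io_uring_submit(&ring);

    for (int i = 0; i < 2; i++) {
        io_uring_wait_cqe(&ring, &cqe);
        /* expect -ECANCELED for the timeout, 0 for the cancel itself */
        printf("user_data=%llu res=%d\n",
               (unsigned long long)cqe->user_data, cqe->res);
        io_uring_cqe_seen(&ring, cqe);
    }
    io_uring_queue_exit(&ring);
    return 0;
}
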
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/900122b588e65b637e71bfec80a260726c6a54d6.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 46 ++++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index d6e0e7e317da..a23b6089cbf6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5792,32 +5792,24 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, return ret; } -static void io_async_find_and_cancel(struct io_ring_ctx *ctx, - struct io_kiocb *req, __u64 sqe_addr, - int success_ret) +static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) + __acquires(&req->ctx->completion_lock) { + struct io_ring_ctx *ctx = req->ctx; int ret; + WARN_ON_ONCE(req->task != current); + ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); spin_lock(&ctx->completion_lock); if (ret != -ENOENT) - goto done; + return ret; spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); spin_unlock_irq(&ctx->timeout_lock); if (ret != -ENOENT) - goto done; - ret = io_poll_cancel(ctx, sqe_addr, false); -done: - if (!ret) - ret = success_ret; - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - - if (ret < 0) - req_set_fail(req); + return ret; + return io_poll_cancel(ctx, sqe_addr, false); } static int io_async_cancel_prep(struct io_kiocb *req, @@ -5841,17 +5833,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) struct io_tctx_node *node; int ret; - /* tasks should wait for their io-wq threads, so safe w/o sync */ - ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock(&ctx->completion_lock); - if (ret != -ENOENT) - goto done; - spin_lock_irq(&ctx->timeout_lock); - ret = io_timeout_cancel(ctx, sqe_addr); - spin_unlock_irq(&ctx->timeout_lock); - if (ret != -ENOENT) - goto done; - ret = io_poll_cancel(ctx, sqe_addr, false); + ret = io_try_cancel_userdata(req, sqe_addr); if (ret != -ENOENT) goto done; spin_unlock(&ctx->completion_lock); @@ -6418,9 +6400,17 @@ static void io_req_task_link_timeout(struct io_kiocb *req) { struct io_kiocb *prev = req->timeout.prev; struct io_ring_ctx *ctx = req->ctx; + int ret; if (prev) { - io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); + ret = io_try_cancel_userdata(req, prev->user_data); + if (!ret) + ret = -ETIME; + io_cqring_fill_event(ctx, req->user_data, ret, 0); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + io_put_req(prev); io_put_req(req); } else { -- cgit From b97e736a4b553ff18963019c7ca91cd684f83709 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:23 +0100 Subject: io_uring: kill REQ_F_LTIMEOUT_ACTIVE Instead of handling double consecutive linked timeouts through tricky flag combinations, just check the submit_state.link during timeout_prep and fail that case in advance. 
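
At the SQE level the rule is simple; a minimal liburing sketch of the valid shape (error handling omitted) is below. The guarded request carries IOSQE_IO_LINK and the IORING_OP_LINK_TIMEOUT follows it directly; queueing a second link timeout straight after the first is the "req -> link t-out -> link t-out" shape that is now refused with -EINVAL at prep time instead of being untangled via flags later.

#include <liburing.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    struct io_uring ring;
    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;
    struct __kernel_timespec ts = { .tv_sec = 1 };
    char buf[64];

    io_uring_queue_init(8, &ring, 0);

    sqe = io_uring_get_sqe(&ring);                /* head of the link */
    io_uring_prep_read(sqe, STDIN_FILENO, buf, sizeof(buf), 0);
    sqe->flags |= IOSQE_IO_LINK;
    sqe->user_data = 1;

    sqe = io_uring_get_sqe(&ring);                /* its linked timeout */
    io_uring_prep_link_timeout(sqe, &ts, 0);
    sqe->user_data = 2;

    /* a third SQE with IORING_OP_LINK_TIMEOUT chained right here would
     * now complete with -EINVAL */

    io_uring_submit(&ring);
    for (int i = 0; i < 2; i++) {
        io_uring_wait_cqe(&ring, &cqe);
        printf("user_data=%llu res=%d\n",
               (unsigned long long)cqe->user_data, cqe->res);
        io_uring_cqe_seen(&ring, cqe);
    }
    io_uring_queue_exit(&ring);
    return 0;
}
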
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/04150760b0dc739522264b8abd309409f7421a06.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index a23b6089cbf6..abf8bb781d40 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -705,7 +705,6 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, - REQ_F_LTIMEOUT_ACTIVE_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_DONT_REISSUE_BIT, @@ -750,8 +749,6 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), - /* linked timeout is active, i.e. prepared by link's head */ - REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ @@ -1315,7 +1312,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) __io_req_set_refcount(nxt, 2); nxt->timeout.head = req; - nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; req->flags |= REQ_F_LINK_TIMEOUT; return nxt; } @@ -1895,11 +1891,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) { struct io_kiocb *link = req->link; - /* - * Can happen if a linked timeout fired and link had been like - * req -> link t-out -> link t-out [-> ...] - */ - if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) { + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { struct io_timeout_data *io = link->async_data; io_remove_next_linked(req); @@ -5700,6 +5692,15 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + + if (is_timeout_link) { + struct io_submit_link *link = &req->ctx->submit_state.link; + + if (!link->head) + return -EINVAL; + if (link->last->opcode == IORING_OP_LINK_TIMEOUT) + return -EINVAL; + } return 0; } -- cgit From 4d13d1a4d1e1807e04b846b48934e87016027f90 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:24 +0100 Subject: io_uring: simplify io_prep_linked_timeout The link test in io_prep_linked_timeout() is pretty bulky, replace it with a flag. It's better for normal path and linked requests, and also will be used further for request failing. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3703770bfae8bc1ff370e43ef5767940202cab42.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index abf8bb781d40..097ba596c819 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -710,6 +710,7 @@ enum { REQ_F_DONT_REISSUE_BIT, REQ_F_CREDS_BIT, REQ_F_REFCOUNT_BIT, + REQ_F_ARM_LTIMEOUT_BIT, /* keep async read/write and isreg together and in order */ REQ_F_NOWAIT_READ_BIT, REQ_F_NOWAIT_WRITE_BIT, @@ -765,6 +766,8 @@ enum { REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), /* skip refcounting if not set */ REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + /* there is a linked timeout that has to be armed */ + REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), }; struct async_poll { @@ -1302,23 +1305,18 @@ static void io_req_track_inflight(struct io_kiocb *req) static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { - struct io_kiocb *nxt = req->link; - - if (req->flags & REQ_F_LINK_TIMEOUT) - return NULL; + req->flags &= ~REQ_F_ARM_LTIMEOUT; + req->flags |= REQ_F_LINK_TIMEOUT; /* linked timeouts should have two refs once prep'ed */ io_req_set_refcount(req); - __io_req_set_refcount(nxt, 2); - - nxt->timeout.head = req; - req->flags |= REQ_F_LINK_TIMEOUT; - return nxt; + __io_req_set_refcount(req->link, 2); + return req->link; } static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { - if (likely(!req->link || req->link->opcode != IORING_OP_LINK_TIMEOUT)) + if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) return NULL; return __io_prep_linked_timeout(req); } @@ -5700,6 +5698,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EINVAL; if (link->last->opcode == IORING_OP_LINK_TIMEOUT) return -EINVAL; + req->timeout.head = link->last; + link->last->flags |= REQ_F_ARM_LTIMEOUT; } return 0; } -- cgit From 0756a8691017518ceeca4c083e7a359107186498 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:25 +0100 Subject: io_uring: cancel not-armed linked touts separately Adjust io_disarm_next(), so it can detect if there is a linked but not-yet-armed timeout and complete/cancel it separately. Will be used in the following patch. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ae228cde2c0df3d92d29d5e4852ed9fa8a2a97db.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 097ba596c819..8eea3a1e8c21 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1034,6 +1034,9 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_UNLINKAT] = {}, }; +/* requests with any of those set should undergo io_disarm_next() */ +#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) + static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, @@ -1688,7 +1691,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, */ if (req_ref_put_and_test(req)) { if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) + if (req->flags & IO_DISARM_MASK) io_disarm_next(req); if (req->link) { io_req_task_queue(req->link); @@ -1926,7 +1929,17 @@ static bool io_disarm_next(struct io_kiocb *req) { bool posted = false; - if (likely(req->flags & REQ_F_LINK_TIMEOUT)) { + if (req->flags & REQ_F_ARM_LTIMEOUT) { + struct io_kiocb *link = req->link; + + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { + io_remove_next_linked(req); + io_cqring_fill_event(link->ctx, link->user_data, + -ECANCELED, 0); + io_put_req_deferred(link); + posted = true; + } + } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); @@ -1951,7 +1964,7 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) { + if (req->flags & IO_DISARM_MASK) { struct io_ring_ctx *ctx = req->ctx; bool posted; -- cgit From 906c6caaf586180261ea581915e1cf8bc466bd69 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:26 +0100 Subject: io_uring: optimise io_prep_linked_timeout() Linked timeout handling during issuing is heavy, it adds extra instructions and forces to save the next linked timeout before io_issue_sqe(). Follwing the same reasoning as in refcounting patches, a request can't be freed by the time it returns from io_issue_sqe(), so now we don't need to do io_prep_linked_timeout() in advance, and it can be delayed to colder paths optimising the generic path. Also, it should also save quite a lot for requests with linked timeouts and completed inline on timeout spinlocking + hrtimer_start() + hrtimer_try_to_cancel() and so on. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/19bfc9a0d26c5c5f1e359f7650afe807ca8ef879.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8eea3a1e8c21..d78a05ecbf68 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1306,8 +1306,16 @@ static void io_req_track_inflight(struct io_kiocb *req) } } +static inline void io_unprep_linked_timeout(struct io_kiocb *req) +{ + req->flags &= ~REQ_F_LINK_TIMEOUT; +} + static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { + if (WARN_ON_ONCE(!req->link)) + return NULL; + req->flags &= ~REQ_F_ARM_LTIMEOUT; req->flags |= REQ_F_LINK_TIMEOUT; @@ -1932,6 +1940,7 @@ static bool io_disarm_next(struct io_kiocb *req) if (req->flags & REQ_F_ARM_LTIMEOUT) { struct io_kiocb *link = req->link; + req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); io_cqring_fill_event(link->ctx, link->user_data, @@ -6485,7 +6494,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req) static void __io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; int ret; issue_sqe: @@ -6503,10 +6512,19 @@ issue_sqe: state->compl_reqs[state->compl_nr++] = req; if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) io_submit_flush_completions(ctx); + return; } + + linked_timeout = io_prep_linked_timeout(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + linked_timeout = io_prep_linked_timeout(req); + switch (io_arm_poll_handler(req)) { case IO_APOLL_READY: + if (linked_timeout) + io_unprep_linked_timeout(req); goto issue_sqe; case IO_APOLL_ABORTED: /* @@ -6516,11 +6534,12 @@ issue_sqe: io_queue_async_work(req); break; } + + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); } else { io_req_complete_failed(req, ret); } - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } static inline void io_queue_sqe(struct io_kiocb *req) -- cgit From ae421d9350b51cba1daa28ee6eb14fbce7517eca Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 17 Aug 2021 20:28:08 +0100 Subject: io_uring: better encapsulate buffer select for rw Make io_put_rw_kbuf() to do the REQ_F_BUFFER_SELECTED check, so all the callers don't need to hand code it. The number of places where we call io_put_rw_kbuf() is growing, so saves some pain. 
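
The kbuf being dropped here belongs to io_uring's provided-buffer ("buffer select") feature; for readers who have not used it, a minimal liburing sketch of the userspace side (error handling omitted): a group of buffers is handed to the kernel, the read opts in with IOSQE_BUFFER_SELECT, and the CQE reports which buffer id was picked. REQ_F_BUFFER_SELECTED is the kernel-side marker that such a buffer still has to be returned, which is the check io_put_rw_kbuf() now performs itself.

#include <liburing.h>
#include <stdio.h>
#include <unistd.h>

#define GROUP_ID  7
#define BUF_LEN   256
#define NR_BUFS   4

int main(void)
{
    static char bufs[NR_BUFS][BUF_LEN];
    struct io_uring ring;
    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;

    io_uring_queue_init(8, &ring, 0);

    /* hand a group of buffers to the kernel */
    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_provide_buffers(sqe, bufs, BUF_LEN, NR_BUFS, GROUP_ID, 0);
    io_uring_submit(&ring);
    io_uring_wait_cqe(&ring, &cqe);
    io_uring_cqe_seen(&ring, cqe);

    /* let the kernel pick one of them for the read */
    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_read(sqe, STDIN_FILENO, NULL, BUF_LEN, 0);
    sqe->flags |= IOSQE_BUFFER_SELECT;
    sqe->buf_group = GROUP_ID;
    io_uring_submit(&ring);

    io_uring_wait_cqe(&ring, &cqe);
    if (cqe->res >= 0 && (cqe->flags & IORING_CQE_F_BUFFER))
        printf("read %d bytes into buffer id %u\n", cqe->res,
               cqe->flags >> IORING_CQE_BUFFER_SHIFT);
    io_uring_cqe_seen(&ring, cqe);
    io_uring_queue_exit(&ring);
    return 0;
}
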
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3df3919e5e7efe03420c44ab4d9317a81a9cf398.1629228203.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index d78a05ecbf68..4b6003bc731a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2286,6 +2286,8 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) { struct io_buffer *kbuf; + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return 0; kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; return io_put_kbuf(req, kbuf); } @@ -2315,8 +2317,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_init_req_batch(&rb); while (!list_empty(done)) { - int cflags = 0; - req = list_first_entry(done, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); @@ -2327,10 +2327,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, continue; } - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - - __io_cqring_fill_event(ctx, req->user_data, req->result, cflags); + __io_cqring_fill_event(ctx, req->user_data, req->result, + io_put_rw_kbuf(req)); (*nr_events)++; if (req_ref_put_and_test(req)) @@ -2550,11 +2548,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static void io_req_task_complete(struct io_kiocb *req) { - int cflags = 0; - - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - __io_req_complete(req, 0, req->result, cflags); + __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req)); } static void __io_complete_rw(struct io_kiocb *req, long res, long res2, @@ -2822,12 +2816,9 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, if (io_resubmit_prep(req)) { io_req_task_queue_reissue(req); } else { - int cflags = 0; - req_set_fail(req); - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - __io_req_complete(req, issue_flags, ret, cflags); + __io_req_complete(req, issue_flags, ret, + io_put_rw_kbuf(req)); } } } -- cgit From 505657bc6c52b01304d8a7c79b2f98878e3d83db Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 17 Aug 2021 20:28:09 +0100 Subject: io_uring: reuse io_req_complete_post() We have io_req_complete_post() to post a CQE and put the request. It takes care of all synchronisation and is more concise and efficent, so replace all hancoded occurrences of "lock; post CQE; unlock; + put_req()" with io_req_complete_post(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2c83463458a613f9d870e5147eb134da2aa70779.1629228203.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 48 +++++++++++------------------------------------- 1 file changed, 11 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4b6003bc731a..6f2668b6b867 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5523,16 +5523,8 @@ err: static void io_req_task_timeout(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, -ETIME, 0); } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) @@ -5660,14 +5652,9 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) io_translate_timeout_mode(tr->flags)); spin_unlock_irq(&ctx->timeout_lock); - spin_lock(&ctx->completion_lock); - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); if (ret < 0) req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, ret, 0); return 0; } @@ -5807,7 +5794,6 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, } static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) - __acquires(&req->ctx->completion_lock) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5815,15 +5801,19 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) WARN_ON_ONCE(req->task != current); ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock(&ctx->completion_lock); if (ret != -ENOENT) return ret; + + spin_lock(&ctx->completion_lock); spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); spin_unlock_irq(&ctx->timeout_lock); if (ret != -ENOENT) - return ret; - return io_poll_cancel(ctx, sqe_addr, false); + goto out; + ret = io_poll_cancel(ctx, sqe_addr, false); +out: + spin_unlock(&ctx->completion_lock); + return ret; } static int io_async_cancel_prep(struct io_kiocb *req, @@ -5850,7 +5840,6 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) ret = io_try_cancel_userdata(req, sqe_addr); if (ret != -ENOENT) goto done; - spin_unlock(&ctx->completion_lock); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); @@ -5863,17 +5852,10 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) break; } io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); - - spin_lock(&ctx->completion_lock); done: - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - if (ret < 0) req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, ret, 0); return 0; } @@ -6413,20 +6395,12 @@ static inline struct file *io_file_get(struct io_ring_ctx *ctx, static void io_req_task_link_timeout(struct io_kiocb *req) { struct io_kiocb *prev = req->timeout.prev; - struct io_ring_ctx *ctx = req->ctx; int ret; if (prev) { ret = io_try_cancel_userdata(req, prev->user_data); - if (!ret) - ret = -ETIME; - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - + io_req_complete_post(req, ret ?: -ETIME, 
0); io_put_req(prev); - io_put_req(req); } else { io_req_complete_post(req, -ETIME, 0); } -- cgit From 23a65db83b3f4549e5eee1fb5517c3365f627699 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 17 Aug 2021 20:28:11 +0100 Subject: io_uring: improve same wq polling Move earlier the check for whether __io_queue_proc() tries to poll already polled waitqueue, and do the same for the second poll entry, if any. Shouldn't really matter, but at least it would have a more predictable behaviour. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8cb428cfe8ade0fd055859fabb878db8777d4c2f.1629228203.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f2668b6b867..71d54841ecbe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5062,8 +5062,13 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, if (unlikely(pt->nr_entries)) { struct io_poll_iocb *poll_one = poll; + /* double add on the same waitqueue head, ignore */ + if (poll_one->head == head) + return; /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { + if ((*poll_ptr)->head == head) + return; pt->error = -EINVAL; return; } @@ -5073,9 +5078,6 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, */ if (!(poll_one->events & EPOLLONESHOT)) poll_one->events |= EPOLLONESHOT; - /* double add on the same waitqueue head, ignore */ - if (poll_one->head == head) - return; poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; -- cgit From ec3c3d0f3a271b5c7422449262970e7eb98f2126 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 18 Aug 2021 10:50:52 +0100 Subject: io_uring: fix io_timeout_remove locking io_timeout_cancel() posts CQEs so needs ->completion_lock to be held, so grab it in io_timeout_remove(). 
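
A stand-alone model of the resulting lock nesting (userspace sketch with pthreads, names are illustrative only): the CQE-posting lock is the outer one, the timeout-list lock nests inside it, and io_timeout_cancel() style helpers run with both already held.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t completion_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t timeout_lock    = PTHREAD_MUTEX_INITIALIZER;

/* posts a CQE, so the caller must hold both locks, outer one first */
static int timeout_cancel(void)
{
    return 0;
}

static void timeout_remove(void)
{
    pthread_mutex_lock(&completion_lock);   /* outer: CQE posting */
    pthread_mutex_lock(&timeout_lock);      /* inner: timeout list */
    timeout_cancel();
    pthread_mutex_unlock(&timeout_lock);
    pthread_mutex_unlock(&completion_lock);
}

int main(void)
{
    timeout_remove();
    puts("locks taken and released in a consistent order");
    return 0;
}
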
Fixes: 48ecb6369f1f2 ("io_uring: run timeouts from task_work") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d6f03d653a4d7bf693ef6f39b6a426b6d97fd96f.1629280204.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 71d54841ecbe..08546332f1b9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5572,6 +5572,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, } static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) + __must_hold(&ctx->completion_lock) __must_hold(&ctx->timeout_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); @@ -5646,13 +5647,18 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; int ret; - spin_lock_irq(&ctx->timeout_lock); - if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) + if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, tr->addr); - else + spin_unlock_irq(&ctx->timeout_lock); + spin_unlock(&ctx->completion_lock); + } else { + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_update(ctx, tr->addr, &tr->ts, io_translate_timeout_mode(tr->flags)); - spin_unlock_irq(&ctx->timeout_lock); + spin_unlock_irq(&ctx->timeout_lock); + } if (ret < 0) req_set_fail(req); -- cgit From 79dca1846fe979304ad0b998e56b20326e2e5a72 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Tue, 10 Aug 2021 20:55:54 +0800 Subject: io-wq: move nr_running and worker_refs out of wqe->lock protection We don't need to protect nr_running and worker_refs by wqe->lock, so narrow the range of raw_spin_lock_irq - raw_spin_unlock_irq Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20210810125554.99229-1-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 4ce83bb48021..8da9bb103916 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -256,16 +256,17 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) raw_spin_lock_irq(&wqe->lock); if (acct->nr_workers < acct->max_workers) { - atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); if (!acct->nr_workers) first = true; acct->nr_workers++; do_create = true; } raw_spin_unlock_irq(&wqe->lock); - if (do_create) + if (do_create) { + atomic_inc(&acct->nr_running); + atomic_inc(&wqe->wq->worker_refs); create_io_worker(wqe->wq, wqe, acct->index, first); + } } } -- cgit From 316319e82f7342ef327223a23199648bfabeadcd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Aug 2021 09:41:42 -0600 Subject: io_uring: add comments on why PF_EXITING checking is safe We have two checks of task->flags & PF_EXITING left: 1) In io_req_task_submit(), which is called in task_work and hence always in the context of the original task. That means that req->task == current, and hence checking ->flags is totally fine. 2) In io_poll_rewait(), where we need to stop re-arming poll to prevent it interfering with cancelation. This is only run from task_work as well, and hence for this case too req->task == current. Add a comment to both spots detailing that. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 08546332f1b9..78aa4daf7969 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2112,6 +2112,7 @@ static void io_req_task_submit(struct io_kiocb *req) /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ mutex_lock(&ctx->uring_lock); + /* req->task == current here, checking PF_EXITING is safe */ if (likely(!(req->task->flags & PF_EXITING))) __io_queue_sqe(req); else @@ -4893,6 +4894,7 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) { struct io_ring_ctx *ctx = req->ctx; + /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) WRITE_ONCE(poll->canceled, true); -- cgit From e98e49b2bbf777f91732dc916d7ad33876c663c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 18 Aug 2021 17:01:43 +0100 Subject: io_uring: extend task put optimisations Now with IRQ completions done via IRQ, almost all requests freeing are done from the context of submitter task, so it makes sense to extend task_put optimisation from io_req_free_batch_finish() to cover all the cases including task_work by moving it into io_put_task(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/824a7cbd745ddeee4a0f3ff85c558a24fd005872.1629302453.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 78aa4daf7969..9107cd78863d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1623,10 +1623,14 @@ static inline void io_put_task(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); + if (likely(task == current)) { + tctx->cached_refs += nr; + } else { + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); + } } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, @@ -2171,9 +2175,7 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, { if (rb->ctx_refs) percpu_ref_put_many(&ctx->refs, rb->ctx_refs); - if (rb->task == current) - current->io_uring->cached_refs += rb->task_refs; - else if (rb->task) + if (rb->task) io_put_task(rb->task, rb->task_refs); } -- cgit From 187f08c12cd1d81f000cdc9c0119ef6e0a6f47e3 Mon Sep 17 00:00:00 2001 From: wangyangbo Date: Thu, 19 Aug 2021 13:56:57 +0800 Subject: io_uring: Add register support for non-4k PAGE_SIZE Now allocated rsrc table uses PAGE_SIZE as the size of 2nd-level, and accessing this table relies on each level index from fixed TABLE_SHIFT (12 - 3) in 4k page case. In order to correctly work in non-4k page, define TABLE_SHIFT as non-fixed (PAGE_SHIFT - shift of data) for 2nd-level table entry number. 
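
A quick numeric check of the new definition (a throwaway userspace calculation, not kernel code): each second-level table is one page of u64 tags, so it holds PAGE_SIZE / 8 entries and the shift is PAGE_SHIFT - 3, which reduces to the old hard-coded 9 only in the 4k case.

#include <stdio.h>

int main(void)
{
    const unsigned int page_shift[] = { 12, 14, 16 };   /* 4K, 16K, 64K */

    for (unsigned int i = 0; i < 3; i++) {
        unsigned int shift = page_shift[i] - 3;          /* PAGE_SHIFT - 3 */
        printf("PAGE_SIZE=%2uK -> TABLE_SHIFT=%2u, %5u tags per table\n",
               1u << (page_shift[i] - 10), shift, 1u << shift);
    }
    return 0;
}
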
Signed-off-by: wangyangbo Link: https://lore.kernel.org/r/20210819055657.27327-1-wangyangbo@uniontech.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9107cd78863d..5d3df4f913a8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -92,12 +92,12 @@ #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 -/* 512 entries per page on 64-bit archs, 64 pages max */ +/* only define max */ #define IORING_MAX_FIXED_FILES (1U << 15) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -#define IO_RSRC_TAG_TABLE_SHIFT 9 +#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) -- cgit From 99c8bc52d1321ab3a711eba2941eadbe7425230f Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Sat, 21 Aug 2021 06:19:54 +0800 Subject: io_uring: fix lack of protection for compl_nr coml_nr in ctx_flush_and_put() is not protected by uring_lock, this may cause problems when accessing in parallel: say coml_nr > 0 ctx_flush_and put other context if (compl_nr) get mutex coml_nr > 0 do flush coml_nr = 0 release mutex get mutex do flush (*) release mutex in (*) place, we call io_cqring_ev_posted() and users likely get no events there. To avoid spurious events, re-check the value when under the lock. Fixes: 2c32395d8111 ("io_uring: fix __tctx_task_work() ctx race") Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20210820221954.61815-1-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5d3df4f913a8..706ac8c03b95 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2007,7 +2007,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx) return; if (ctx->submit_state.compl_nr) { mutex_lock(&ctx->uring_lock); - io_submit_flush_completions(ctx); + if (ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); } percpu_ref_put(&ctx->refs); -- cgit From 3a1b8a4e843f96b636431450d8d79061605cf74b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 20 Aug 2021 10:36:35 +0100 Subject: io_uring: limit fixed table size by RLIMIT_NOFILE Limit the number of files in io_uring fixed tables by RLIMIT_NOFILE, that's the first and the simpliest restriction that we should impose. Cc: stable@vger.kernel.org Suggested-by: Jens Axboe Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b2756c340aed7d6c0b302c26dab50c6c5907f4ce.1629451684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 706ac8c03b95..9a021fe6c461 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7733,6 +7733,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + if (nr_args > rlimit(RLIMIT_NOFILE)) + return -EMFILE; ret = io_rsrc_node_switch_start(ctx); if (ret) return ret; -- cgit From 0bea96f59ba40e63c0ae93ad6a02417b95f22f4d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 20 Aug 2021 10:36:36 +0100 Subject: io_uring: place fixed tables under memcg limits Fixed tables may be large enough, place all of them together with allocated tags under memcg limits. 
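
Neither patch changes how the table is used, only how big it may get and who pays for it; a minimal liburing registration sketch (error handling omitted) is shown for reference. Asking for more slots than RLIMIT_NOFILE, or than the 32768 hard cap, now fails with -EMFILE, and the table allocation itself is charged to the caller's memory cgroup.

#include <liburing.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    struct io_uring ring;
    int fds[2] = { STDIN_FILENO, STDOUT_FILENO };
    int ret;

    io_uring_queue_init(8, &ring, 0);

    ret = io_uring_register_files(&ring, fds, 2);   /* well under the cap */
    printf("register_files: %d\n", ret);

    /* requests would now reference slots 0/1 via IOSQE_FIXED_FILE */

    io_uring_unregister_files(&ring);
    io_uring_queue_exit(&ring);
    return 0;
}
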
Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b3ac9f5da9821bb59837b5fe25e8ef4be982218c.1629451684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9a021fe6c461..c6f1afa5ec04 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7138,14 +7138,14 @@ static void **io_alloc_page_table(size_t size) size_t init_size = size; void **table; - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL); + table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); if (!table) return NULL; for (i = 0; i < nr_tables; i++) { unsigned int this_size = min_t(size_t, size, PAGE_SIZE); - table[i] = kzalloc(this_size, GFP_KERNEL); + table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); if (!table[i]) { io_free_page_table(table, init_size); return NULL; @@ -7336,7 +7336,8 @@ fail: static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { - table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL); + table->files = kvcalloc(nr_files, sizeof(table->files[0]), + GFP_KERNEL_ACCOUNT); return !!table->files; } -- cgit From 2c5d763c1939fbd130452ee0d4d1a44b5dd97bb7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 21 Aug 2021 07:21:19 -0600 Subject: io_uring: add clarifying comment for io_cqring_ev_posted() We've previously had an issue where overflow flush unconditionally calls io_cqring_ev_posted() even if it didn't flush any events to the ring, causing wake and eventfd increment where no new events are available. Some applications don't like that, see commit b18032bb0a88 for details. This came up in discussion for another patch recently, hence add a comment detailing what the relationship between calling the events posted helper and CQ ring entries is. Link: https://lore.kernel.org/io-uring/77a44fce-c831-16a6-8e80-9aee77f496a2@kernel.dk/ Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index c6f1afa5ec04..8d263209b508 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1526,6 +1526,13 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) return !ctx->eventfd_async || io_wq_current_is_worker(); } +/* + * This should only get called when at least one event has been posted. + * Some applications rely on the eventfd notification count only changing + * IFF a new CQE has been added to the CQ ring. There's no depedency on + * 1:1 relationship between how many times this function is called (and + * hence the eventfd count) and number of CQEs posted to the CQ ring. + */ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { /* -- cgit From 26578cda3db983b17cabe4e577af26306beb9987 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 20 Aug 2021 10:36:37 +0100 Subject: io_uring: add ->splice_fd_in checks ->splice_fd_in is used only by splice/tee, but no other request checks it for validity. Add the check for most of request types excluding reads/writes/sends/recvs, we don't want overhead for them and can leave them be as is until the field is actually used. 
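The practical consequence for userspace, sketched below rather than taken from the patch itself, is that SQEs should be fully zeroed before the fields a request actually uses are filled in; with these checks, stale data in splice_fd_in (or the other now-checked fields) makes prep fail with -EINVAL. liburing's prep helpers generally take care of this already; the sketch shows the raw-SQE case with a hypothetical helper:

#include <string.h>
#include <linux/io_uring.h>

/* zero everything first, then set only what fsync needs */
static void prep_fsync_sqe(struct io_uring_sqe *sqe, int fd, unsigned long long tag)
{
        memset(sqe, 0, sizeof(*sqe));   /* clears ioprio, buf_index, splice_fd_in, ... */
        sqe->opcode = IORING_OP_FSYNC;
        sqe->fd = fd;
        sqe->user_data = tag;
}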
Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f44bc2acd6777d932de3d71a5692235b5b2b7397.1629451684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 52 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8d263209b508..3898f7ab14f6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3511,7 +3511,7 @@ static int io_renameat_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3562,7 +3562,8 @@ static int io_unlinkat_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3608,8 +3609,8 @@ static int io_shutdown_prep(struct io_kiocb *req, #if defined(CONFIG_NET) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || - sqe->buf_index) + if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->shutdown.how = READ_ONCE(sqe->len); @@ -3757,7 +3758,8 @@ static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || + sqe->splice_fd_in)) return -EINVAL; req->sync.flags = READ_ONCE(sqe->fsync_flags); @@ -3790,7 +3792,8 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || + sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3823,7 +3826,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3942,7 +3945,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off) + if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -4013,7 +4017,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags) + if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -4100,7 +4104,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & 
IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4146,7 +4150,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - if (sqe->ioprio || sqe->buf_index || sqe->off) + if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4181,7 +4185,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->addr) + if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4219,7 +4223,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -4255,7 +4259,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || - sqe->rw_flags || sqe->buf_index) + sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -4316,7 +4320,8 @@ static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || + sqe->splice_fd_in)) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -4743,7 +4748,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index) + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -4791,7 +4796,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || + sqe->splice_fd_in) return -EINVAL; conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -5387,7 +5393,7 @@ static int io_poll_update_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | @@ -5626,7 +5632,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len) + if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; tr->addr = READ_ONCE(sqe->addr); @@ -5687,7 +5693,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 
return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len != 1) + if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || + sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) return -EINVAL; @@ -5843,7 +5850,8 @@ static int io_async_cancel_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) + if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || + sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); @@ -5884,7 +5892,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, { if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->rw_flags) + if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->rsrc_update.offset = READ_ONCE(sqe->off); -- cgit From 5636c00d3e8ef1f6d1291e71edb48f727ba5a999 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 18 Aug 2021 12:42:45 +0100 Subject: io_uring: flush completions for fallbacks io_fallback_req_func() doesn't expect anyone creating inline completions, and no one currently does that. Teach the function to flush completions preparing for further changes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8b941516921f72e1a64d58932d671736892d7fff.1629286357.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3898f7ab14f6..926df5b00b31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1197,6 +1197,11 @@ static void io_fallback_req_func(struct work_struct *work) percpu_ref_get(&ctx->refs); llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) req->io_task_work.func(req); + + mutex_lock(&ctx->uring_lock); + if (ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } -- cgit From f237c30a5610d35a584f3296d397b93d80ce374e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 18 Aug 2021 12:42:46 +0100 Subject: io_uring: batch task work locking Many task_work handlers either grab ->uring_lock, or may benefit from having it. Move locking logic out of individual handlers to a lazy approach controlled by tctx_task_work(), so we don't keep doing tons of mutex lock/unlock. 
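To make the new scheme easier to follow, here is a rough userspace analogue of the pattern (plain pthreads, names and structure are mine, not the kernel code): handlers take the lock lazily through a shared flag, and the loop driving a batch unlocks once at the end instead of every handler doing its own lock/unlock pair.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t batch_lock = PTHREAD_MUTEX_INITIALIZER;

/* analogue of io_tw_lock(): lock on first use only */
static void tw_lock(bool *locked)
{
        if (!*locked) {
                pthread_mutex_lock(&batch_lock);
                *locked = true;
        }
}

/* analogue of a task_work handler that wants the lock */
static void handle_one(int item, bool *locked)
{
        tw_lock(locked);
        (void)item;     /* ... complete the item under the lock ... */
}

/* analogue of tctx_task_work(): one unlock for the whole batch */
static void run_batch(const int *items, int nr)
{
        bool locked = false;

        for (int i = 0; i < nr; i++)
                handle_one(items[i], &locked);
        if (locked)
                pthread_mutex_unlock(&batch_lock);
}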
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d6a34e147f2507a2f3e2fa1e38a9c541dcad3929.1629286357.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 80 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 926df5b00b31..cb98a831586a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -775,7 +775,7 @@ struct async_poll { struct io_poll_iocb *double_poll; }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); struct io_task_work { union { @@ -1080,6 +1080,14 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) +{ + if (!*locked) { + mutex_lock(&ctx->uring_lock); + *locked = true; + } +} + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) @@ -1193,16 +1201,19 @@ static void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; + bool locked = false; percpu_ref_get(&ctx->refs); llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) - req->io_task_work.func(req); + req->io_task_work.func(req, &locked); - mutex_lock(&ctx->uring_lock); - if (ctx->submit_state.compl_nr) - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); + if (locked) { + if (ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); + } percpu_ref_put(&ctx->refs); + } static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) @@ -1386,12 +1397,15 @@ static void io_prep_async_link(struct io_kiocb *req) } } -static void io_queue_async_work(struct io_kiocb *req) +static void io_queue_async_work(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; + /* must not take the lock, NULL it as a precaution */ + locked = NULL; + BUG_ON(!tctx); BUG_ON(!tctx->io_wq); @@ -2013,21 +2027,22 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } -static void ctx_flush_and_put(struct io_ring_ctx *ctx) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; - if (ctx->submit_state.compl_nr) { - mutex_lock(&ctx->uring_lock); + if (*locked) { if (ctx->submit_state.compl_nr) io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); + *locked = false; } percpu_ref_put(&ctx->refs); } static void tctx_task_work(struct callback_head *cb) { + bool locked = false; struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); @@ -2050,18 +2065,18 @@ static void tctx_task_work(struct callback_head *cb) io_task_work.node); if (req->ctx != ctx) { - ctx_flush_and_put(ctx); + ctx_flush_and_put(ctx, &locked); ctx = req->ctx; percpu_ref_get(&ctx->refs); } - req->io_task_work.func(req); + req->io_task_work.func(req, &locked); node = next; } while (node); cond_resched(); } - ctx_flush_and_put(ctx); + ctx_flush_and_put(ctx, &locked); } static void io_req_task_work_add(struct io_kiocb *req) @@ -2113,28 +2128,26 @@ static void io_req_task_work_add(struct io_kiocb *req) } } -static void io_req_task_cancel(struct io_kiocb *req) +static void 
io_req_task_cancel(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; /* ctx is guaranteed to stay alive while we hold uring_lock */ - mutex_lock(&ctx->uring_lock); + io_tw_lock(ctx, locked); io_req_complete_failed(req, req->result); - mutex_unlock(&ctx->uring_lock); } -static void io_req_task_submit(struct io_kiocb *req) +static void io_req_task_submit(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ - mutex_lock(&ctx->uring_lock); + io_tw_lock(ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ if (likely(!(req->task->flags & PF_EXITING))) __io_queue_sqe(req); else io_req_complete_failed(req, -EFAULT); - mutex_unlock(&ctx->uring_lock); } static void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -2170,6 +2183,11 @@ static void io_free_req(struct io_kiocb *req) __io_free_req(req); } +static void io_free_req_work(struct io_kiocb *req, bool *locked) +{ + io_free_req(req); +} + struct req_batch { struct task_struct *task; int task_refs; @@ -2267,7 +2285,7 @@ static inline void io_put_req(struct io_kiocb *req) static inline void io_put_req_deferred(struct io_kiocb *req) { if (req_ref_put_and_test(req)) { - req->io_task_work.func = io_free_req; + req->io_task_work.func = io_free_req_work; io_req_task_work_add(req); } } @@ -2562,7 +2580,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) return false; } -static void io_req_task_complete(struct io_kiocb *req) +static void io_req_task_complete(struct io_kiocb *req, bool *locked) { __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req)); } @@ -2572,7 +2590,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, { if (__io_complete_rw_common(req, res)) return; - io_req_task_complete(req); + __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req)); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) @@ -4994,7 +5012,7 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) return !(flags & IORING_CQE_F_MORE); } -static void io_poll_task_func(struct io_kiocb *req) +static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt; @@ -5018,7 +5036,7 @@ static void io_poll_task_func(struct io_kiocb *req) if (done) { nxt = io_put_req_find_next(req); if (nxt) - io_req_task_submit(nxt); + io_req_task_submit(nxt, locked); } } } @@ -5130,7 +5148,7 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } -static void io_async_task_func(struct io_kiocb *req) +static void io_async_task_func(struct io_kiocb *req, bool *locked) { struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; @@ -5147,7 +5165,7 @@ static void io_async_task_func(struct io_kiocb *req) spin_unlock(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) - io_req_task_submit(req); + io_req_task_submit(req, locked); else io_req_complete_failed(req, -ECANCELED); } @@ -5546,7 +5564,7 @@ err: return 0; } -static void io_req_task_timeout(struct io_kiocb *req) +static void io_req_task_timeout(struct io_kiocb *req, bool *locked) { req_set_fail(req); io_req_complete_post(req, -ETIME, 0); @@ -6103,7 +6121,7 @@ fail: if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock(&ctx->completion_lock); kfree(de); - io_queue_async_work(req); + io_queue_async_work(req, NULL); return true; 
} @@ -6425,7 +6443,7 @@ static inline struct file *io_file_get(struct io_ring_ctx *ctx, return io_file_get_normal(ctx, req, fd); } -static void io_req_task_link_timeout(struct io_kiocb *req) +static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { struct io_kiocb *prev = req->timeout.prev; int ret; @@ -6529,7 +6547,7 @@ issue_sqe: * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ - io_queue_async_work(req); + io_queue_async_work(req, NULL); break; } @@ -6554,7 +6572,7 @@ static inline void io_queue_sqe(struct io_kiocb *req) if (unlikely(ret)) io_req_complete_failed(req, ret); else - io_queue_async_work(req); + io_queue_async_work(req, NULL); } } -- cgit From 126180b95f27ef6cc536da57115e06665254b0d7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 18 Aug 2021 12:42:47 +0100 Subject: io_uring: IRQ rw completion batching Employ inline completion logic for read/write completions done via io_req_task_complete(). If ->uring_lock is contended, just do normal request completion, but if not, make tctx_task_work() to grab the lock and do batched inline completions in io_req_task_complete(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/94589c3ce69eaed86a21bb1ec696407a54fab1aa.1629286357.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index cb98a831586a..827e60ae4909 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2067,6 +2067,8 @@ static void tctx_task_work(struct callback_head *cb) if (req->ctx != ctx) { ctx_flush_and_put(ctx, &locked); ctx = req->ctx; + /* if not contended, grab and improve batching */ + locked = mutex_trylock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } req->io_task_work.func(req, &locked); @@ -2582,7 +2584,20 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static void io_req_task_complete(struct io_kiocb *req, bool *locked) { - __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req)); + unsigned int cflags = io_put_rw_kbuf(req); + long res = req->result; + + if (*locked) { + struct io_ring_ctx *ctx = req->ctx; + struct io_submit_state *state = &ctx->submit_state; + + io_req_complete_state(req, res, cflags); + state->compl_reqs[state->compl_nr++] = req; + if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) + io_submit_flush_completions(ctx); + } else { + io_req_complete_post(req, res, cflags); + } } static void __io_complete_rw(struct io_kiocb *req, long res, long res2, -- cgit From dadebc350da2bef62593b1df007a6e0b90baf42a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Aug 2021 13:30:44 +0100 Subject: io_uring: fix io_try_cancel_userdata race for iowq WARNING: CPU: 1 PID: 5870 at fs/io_uring.c:5975 io_try_cancel_userdata+0x30f/0x540 fs/io_uring.c:5975 CPU: 0 PID: 5870 Comm: iou-wrk-5860 Not tainted 5.14.0-rc6-next-20210820-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:io_try_cancel_userdata+0x30f/0x540 fs/io_uring.c:5975 Call Trace: io_async_cancel fs/io_uring.c:6014 [inline] io_issue_sqe+0x22d5/0x65a0 fs/io_uring.c:6407 io_wq_submit_work+0x1dc/0x300 fs/io_uring.c:6511 io_worker_handle_work+0xa45/0x1840 fs/io-wq.c:533 io_wqe_worker+0x2cc/0xbb0 fs/io-wq.c:582 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 io_try_cancel_userdata() can be called from io_async_cancel() executing in the io-wq context, so the 
warning fires, which is there to alert anyone accessing task->io_uring->io_wq in a racy way. However, io_wq_put_and_exit() always first waits for all threads to complete, so the only detail left is to zero tctx->io_wq after the context is removed. note: one little assumption is that when IO_WQ_WORK_CANCEL, the executor won't touch ->io_wq, because io_wq_destroy() might cancel left pending requests in such a way. Cc: stable@vger.kernel.org Reported-by: syzbot+b0c9d1588ae92866515f@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dfdd37a80cfa9ffd3e59538929c99cdd55d8699e.1629721757.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 827e60ae4909..6859438c4e09 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5863,7 +5863,7 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) struct io_ring_ctx *ctx = req->ctx; int ret; - WARN_ON_ONCE(req->task != current); + WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); if (ret != -ENOENT) @@ -6369,6 +6369,7 @@ static void io_wq_submit_work(struct io_wq_work *work) if (timeout) io_queue_linked_timeout(timeout); + /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) ret = -ECANCELED; @@ -9184,8 +9185,8 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) * Must be after io_uring_del_task_file() (removes nodes under * uring_lock) to avoid race with io_uring_try_cancel_iowq(). */ - tctx->io_wq = NULL; io_wq_put_and_exit(wq); + tctx->io_wq = NULL; } } -- cgit From b9445598d8c60a1379887b957024b71343965f74 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Aug 2021 12:25:45 +0100 Subject: io_uring: openat directly into fixed fd table Instead of opening a file into a process's file table as usual and then registering the fd within io_uring, some users may want to skip the first step and place it directly into io_uring's fixed file table. This patch adds such a capability for IORING_OP_OPENAT and IORING_OP_OPENAT2. The behaviour is controlled by setting sqe->file_index, where 0 implies the old behaviour using normal file tables. If non-zero value is specified, then it will behave as described and place the file into a fixed file slot sqe->file_index - 1. A file table should be already created, the slot should be valid and empty, otherwise the operation will fail. Keep the error codes consistent with IORING_OP_FILES_UPDATE, ENXIO and EINVAL on inappropriate fixed tables, and return EBADF on collision with already registered file. Note: IOSQE_FIXED_FILE can't be used to switch between modes, because accept takes a file, and it already uses the flag with a different meaning. 
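For reference, a hedged userspace sketch of how this is driven (raw SQE fields, assuming a uapi header that already carries the file_index member added below, and a file table previously registered with IORING_REGISTER_FILES):

#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <linux/io_uring.h>

/* open `path` read-only directly into fixed-file slot `slot` */
static void prep_openat_fixed(struct io_uring_sqe *sqe, const char *path,
                              unsigned int slot)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_OPENAT;
        sqe->fd = AT_FDCWD;                              /* dirfd */
        sqe->addr = (unsigned long long)(uintptr_t)path; /* pathname */
        sqe->open_flags = O_RDONLY;                      /* O_CLOEXEC is rejected in this mode */
        sqe->file_index = slot + 1;                      /* 0 keeps the classic fd-table behaviour */
}

The resulting file never appears in the process's fd table; later requests reference it with IOSQE_FIXED_FILE and sqe->fd set to the slot index.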
Suggested-by: Josh Triplett Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/e9b33d1163286f51ea707f87d95bd596dada1e65.1629888991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 74 ++++++++++++++++++++++++++++++++++++++----- include/uapi/linux/io_uring.h | 5 ++- 2 files changed, 70 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6859438c4e09..62da427d614d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -580,6 +580,7 @@ struct io_sr_msg { struct io_open { struct file *file; int dfd; + u32 file_slot; struct filename *filename; struct open_how how; unsigned long nofile; @@ -1063,6 +1064,9 @@ static void io_req_task_queue(struct io_kiocb *req); static void io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -3864,7 +3868,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)) + if (unlikely(sqe->ioprio || sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3881,6 +3885,11 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe req->open.filename = NULL; return ret; } + + req->open.file_slot = READ_ONCE(sqe->file_index); + if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) + return -EINVAL; + req->open.nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; @@ -3918,8 +3927,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; struct file *file; - bool nonblock_set; - bool resolve_nonblock; + bool resolve_nonblock, nonblock_set; + bool fixed = !!req->open.file_slot; int ret; ret = build_open_flags(&req->open.how, &op); @@ -3938,9 +3947,11 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) op.open_flag |= O_NONBLOCK; } - ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); - if (ret < 0) - goto err; + if (!fixed) { + ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); + if (ret < 0) + goto err; + } file = do_filp_open(req->open.dfd, req->open.filename, &op); if (IS_ERR(file)) { @@ -3949,7 +3960,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) * marginal gain for something that is now known to be a slower * path. So just put it, and we'll get a new one when we retry. 
*/ - put_unused_fd(ret); + if (!fixed) + put_unused_fd(ret); ret = PTR_ERR(file); /* only retry if RESOLVE_CACHED wasn't already set by application */ @@ -3962,7 +3974,12 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) file->f_flags &= ~O_NONBLOCK; fsnotify_open(file); - fd_install(ret, file); + + if (!fixed) + fd_install(ret, file); + else + ret = io_install_fixed_file(req, file, issue_flags, + req->open.file_slot - 1); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -7899,6 +7916,46 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) +{ + struct io_ring_ctx *ctx = req->ctx; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct io_fixed_file *file_slot; + int ret = -EBADF; + + io_ring_submit_lock(ctx, !force_nonblock); + if (file->f_op == &io_uring_fops) + goto err; + ret = -ENXIO; + if (!ctx->file_data) + goto err; + ret = -EINVAL; + if (slot_index >= ctx->nr_user_files) + goto err; + + slot_index = array_index_nospec(slot_index, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); + ret = -EBADF; + if (file_slot->file_ptr) + goto err; + + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + ret = io_sqe_file_register(ctx, file, slot_index); + if (ret) { + file_slot->file_ptr = 0; + goto err; + } + + ret = 0; +err: + io_ring_submit_unlock(ctx, !force_nonblock); + if (ret) + fput(file); + return ret; +} + static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { @@ -10366,6 +10423,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(40, __u16, buf_group); BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); + BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 79126d5cd289..45a4f2373694 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -55,7 +55,10 @@ struct io_uring_sqe { } __attribute__((packed)); /* personality to use, if used */ __u16 personality; - __s32 splice_fd_in; + union { + __s32 splice_fd_in; + __u32 file_index; + }; __u64 __pad2[2]; }; -- cgit From a7083ad5e30767ede4ff49d7471ea9c078702db2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Aug 2021 12:25:46 +0100 Subject: io_uring: hand code io_accept() fd installing Make io_accept() to handle file descriptor allocations and installation. A preparation patch for bypassing file tables. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/5b73d204caa0ce979ccb98136695b60f52a3d98c.1629888991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 62da427d614d..ad0a61591ac7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4810,6 +4810,11 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); + + if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) + accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return 0; } @@ -4818,20 +4823,28 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct io_accept *accept = &req->accept; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; - int ret; + struct file *file; + int ret, fd; if (req->file->f_flags & O_NONBLOCK) req->flags |= REQ_F_NOWAIT; - ret = __sys_accept4_file(req->file, file_flags, accept->addr, - accept->addr_len, accept->flags, - accept->nofile); - if (ret == -EAGAIN && force_nonblock) - return -EAGAIN; - if (ret < 0) { + fd = __get_unused_fd_flags(accept->flags, accept->nofile); + if (unlikely(fd < 0)) + return fd; + + file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, + accept->flags); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); + } else { + fd_install(fd, file); + ret = fd; } __io_req_complete(req, issue_flags, ret, 0); return 0; -- cgit From aaa4db12ef7bdc3e343580d1d3c0b2a8874fc1fb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Aug 2021 12:25:47 +0100 Subject: io_uring: accept directly into fixed file table As done with open opcodes, allow accept to skip installing fd into processes' file tables and put it directly into io_uring's fixed file table. Same restrictions and design as for open. 
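A matching sketch for accept (again raw SQE fields and my own helper name, not from the patch); only the flag restrictions differ, since SOCK_CLOEXEC cannot be combined with a fixed slot:

#include <string.h>
#include <linux/io_uring.h>

/* accept a connection on `listen_fd` directly into fixed-file slot `slot` */
static void prep_accept_fixed(struct io_uring_sqe *sqe, int listen_fd,
                              unsigned int slot)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_ACCEPT;
        sqe->fd = listen_fd;
        sqe->accept_flags = 0;          /* SOCK_CLOEXEC here would get -EINVAL */
        sqe->file_index = slot + 1;     /* 1-based; 0 means install a normal fd */
}

As with the open case, the accepted socket is then addressed by slot number plus IOSQE_FIXED_FILE in follow-up requests.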
Suggested-by: Josh Triplett Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/6d16163f376fac7ac26a656de6b42199143e9721.1629888991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index ad0a61591ac7..e321cb0d00c4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -515,6 +515,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + u32 file_slot; unsigned long nofile; }; @@ -4803,7 +4804,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->splice_fd_in) + if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -4811,6 +4812,10 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); + accept->file_slot = READ_ONCE(sqe->file_index); + if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || + (accept->flags & SOCK_CLOEXEC))) + return -EINVAL; if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) @@ -4823,28 +4828,35 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct io_accept *accept = &req->accept; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; + bool fixed = !!accept->file_slot; struct file *file; int ret, fd; if (req->file->f_flags & O_NONBLOCK) req->flags |= REQ_F_NOWAIT; - fd = __get_unused_fd_flags(accept->flags, accept->nofile); - if (unlikely(fd < 0)) - return fd; - + if (!fixed) { + fd = __get_unused_fd_flags(accept->flags, accept->nofile); + if (unlikely(fd < 0)) + return fd; + } file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, accept->flags); if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); ret = PTR_ERR(file); if (ret == -EAGAIN && force_nonblock) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); - } else { + } else if (!fixed) { fd_install(fd, file); ret = fd; + } else { + ret = io_install_fixed_file(req, file, issue_flags, + accept->file_slot - 1); } __io_req_complete(req, issue_flags, ret, 0); return 0; -- cgit From 0c6e1d7fd5e7560fdc4bb3418c2c0f0d7a95bf76 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 26 Aug 2021 01:58:56 +0800 Subject: io_uring: don't free request to slab It's not necessary to free the request back to slab when we fail to get sqe, just move it to state->free_list. 
Signed-off-by: Hao Xu Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20210825175856.194299-1-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index e321cb0d00c4..be3c3aea6398 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6889,7 +6889,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) } sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { - kmem_cache_free(req_cachep, req); + list_add(&req->inflight_entry, &ctx->submit_state.free_list); break; } /* will complete beyond this point, count as submitted */ -- cgit From 14afdd6ee3a0db7bcae887d1951ed21c4d1539cd Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Fri, 27 Aug 2021 17:46:08 +0800 Subject: io_uring: remove redundant req_set_fail() req_set_fail() in io_submit_sqe() is redundant, remove it. Signed-off-by: Hao Xu Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20210827094609.36052-2-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index be3c3aea6398..48cc6328eb06 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6734,7 +6734,6 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, fail_req: if (link->head) { /* fail even hard links since we don't submit */ - req_set_fail(link->head); io_req_complete_failed(link->head, -ECANCELED); link->head = NULL; } -- cgit From a8295b982c46d4a7c259a4cdd58a2681929068a9 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Fri, 27 Aug 2021 17:46:09 +0800 Subject: io_uring: fix failed linkchain code logic Given a linkchain like this: req0(link_flag)-->req1(link_flag)-->...-->reqn(no link_flag) There is a problem: - if some intermediate linked req like req1 's submittion fails, reqs after it won't be cancelled. - sqpoll disabled: maybe it's ok since users can get the error info of req1 and stop submitting the following sqes. - sqpoll enabled: definitely a problem, the following sqes will be submitted in the next round. The solution is to refactor the code logic to: - if a linked req's submittion fails, just mark it and the head(if it exists) as REQ_F_FAIL. Leverage req->result to indicate whether it is failed or cancelled. - submit or fail the whole chain when we come to the end of it. 
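As a hedged illustration of the user-visible effect (a liburing-based toy, not part of the patch; exact error codes may vary, but every chain member now gets a completion):

#include <stdio.h>
#include <liburing.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_cqe *cqe;
        struct io_uring_sqe *sqe;

        if (io_uring_queue_init(8, &ring, 0) < 0)
                return 1;

        sqe = io_uring_get_sqe(&ring);          /* req0: valid, linked */
        io_uring_prep_nop(sqe);
        sqe->flags |= IOSQE_IO_LINK;
        sqe->user_data = 0;

        sqe = io_uring_get_sqe(&ring);          /* req1: bogus opcode, fails at submit */
        io_uring_prep_nop(sqe);
        sqe->opcode = 0xff;
        sqe->flags |= IOSQE_IO_LINK;
        sqe->user_data = 1;

        sqe = io_uring_get_sqe(&ring);          /* req2: valid tail of the chain */
        io_uring_prep_nop(sqe);
        sqe->user_data = 2;

        io_uring_submit(&ring);

        /* with the fix, req1 reports its own error and the rest of the chain is
         * cancelled, instead of req2 leaking into a later submission round */
        for (int i = 0; i < 3; i++) {
                if (io_uring_wait_cqe(&ring, &cqe))
                        break;
                printf("user_data=%llu res=%d\n",
                       (unsigned long long)cqe->user_data, cqe->res);
                io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return 0;
}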
Signed-off-by: Hao Xu Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20210827094609.36052-3-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 61 +++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 48cc6328eb06..1341b714ed12 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1188,6 +1188,12 @@ static inline void req_set_fail(struct io_kiocb *req) req->flags |= REQ_F_FAIL; } +static inline void req_fail_link_node(struct io_kiocb *req, int res) +{ + req_set_fail(req); + req->result = res; +} + static void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); @@ -1957,11 +1963,16 @@ static void io_fail_links(struct io_kiocb *req) req->link = NULL; while (link) { + long res = -ECANCELED; + + if (link->flags & REQ_F_FAIL) + res = link->result; + nxt = link->link; link->link = NULL; trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); + io_cqring_fill_event(link->ctx, link->user_data, res, 0); io_put_req_deferred(link); link = nxt; } @@ -6622,8 +6633,10 @@ static inline void io_queue_sqe(struct io_kiocb *req) if (unlikely(req->ctx->drain_active) && io_drain_req(req)) return; - if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) { + if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) { __io_queue_sqe(req); + } else if (req->flags & REQ_F_FAIL) { + io_req_complete_failed(req, req->result); } else { int ret = io_req_prep_async(req); @@ -6732,19 +6745,34 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { fail_req: + /* fail even hard links since we don't submit */ if (link->head) { - /* fail even hard links since we don't submit */ - io_req_complete_failed(link->head, -ECANCELED); - link->head = NULL; + /* + * we can judge a link req is failed or cancelled by if + * REQ_F_FAIL is set, but the head is an exception since + * it may be set REQ_F_FAIL because of other req's failure + * so let's leverage req->result to distinguish if a head + * is set REQ_F_FAIL because of its failure or other req's + * failure so that we can set the correct ret code for it. + * init result here to avoid affecting the normal path. + */ + if (!(link->head->flags & REQ_F_FAIL)) + req_fail_link_node(link->head, -ECANCELED); + } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { + /* + * the current req is a normal req, we should return + * error and thus break the submittion loop. 
+ */ + io_req_complete_failed(req, ret); + return ret; } - io_req_complete_failed(req, ret); - return ret; + req_fail_link_node(req, ret); + } else { + ret = io_req_prep(req, sqe); + if (unlikely(ret)) + goto fail_req; } - ret = io_req_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; - /* don't need @sqe from now on */ trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, req->flags, true, @@ -6760,9 +6788,14 @@ fail_req: if (link->head) { struct io_kiocb *head = link->head; - ret = io_req_prep_async(req); - if (unlikely(ret)) - goto fail_req; + if (!(req->flags & REQ_F_FAIL)) { + ret = io_req_prep_async(req); + if (unlikely(ret)) { + req_fail_link_node(req, ret); + if (!(head->flags & REQ_F_FAIL)) + req_fail_link_node(head, -ECANCELED); + } + } trace_io_uring_link(ctx, req, head); link->last->link = req; link->last = req; -- cgit From 9a10867ae54e02a0f204d2eebea5a446fb7a86f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 27 Aug 2021 11:55:01 +0100 Subject: io_uring: add task-refs-get helper As we have a more complicated task referencing, which apart from normal task references includes taking tctx->inflight and caching all that, it would be a good idea to have all that isolated in helpers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d9114d037f1c195897aa13f38a496078eca2afdb.1630023531.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1341b714ed12..5059049da242 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1670,6 +1670,24 @@ static inline void io_put_task(struct task_struct *task, int nr) } } +static void io_task_refs_refill(struct io_uring_task *tctx) +{ + unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; + + percpu_counter_add(&tctx->inflight, refill); + refcount_add(refill, ¤t->usage); + tctx->cached_refs += refill; +} + +static inline void io_get_task_refs(int nr) +{ + struct io_uring_task *tctx = current->io_uring; + + tctx->cached_refs -= nr; + if (unlikely(tctx->cached_refs < 0)) + io_task_refs_refill(tctx); +} + static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, long res, unsigned int cflags) { @@ -6890,25 +6908,15 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { - struct io_uring_task *tctx; int submitted = 0; /* make sure SQ entry isn't read before tail */ nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; + io_get_task_refs(nr); - tctx = current->io_uring; - tctx->cached_refs -= nr; - if (unlikely(tctx->cached_refs < 0)) { - unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; - - percpu_counter_add(&tctx->inflight, refill); - refcount_add(refill, ¤t->usage); - tctx->cached_refs += refill; - } io_submit_state_start(&ctx->submit_state, nr); - while (submitted < nr) { const struct io_uring_sqe *sqe; struct io_kiocb *req; -- cgit From b18a1a4574d2d15f1b0c84658d4549ccbf241fee Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Aug 2021 20:51:39 +0100 Subject: io_uring: clarify io_req_task_cancel() locking It's too easy to forget and misjudge about synchronisation in io_req_task_cancel(), add a comment clarifying it. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/71099083835f983a1fd73d5a3da6391924da8300.1629920396.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5059049da242..09194f7276ba 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2168,7 +2168,7 @@ static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - /* ctx is guaranteed to stay alive while we hold uring_lock */ + /* not needed for normal modes, but SQPOLL depends on it */ io_tw_lock(ctx, locked); io_req_complete_failed(req, req->result); } @@ -2177,7 +2177,6 @@ static void io_req_task_submit(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ io_tw_lock(ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ if (likely(!(req->task->flags & PF_EXITING))) -- cgit From 90499ad00ca59320b5bb43392b7931e1bd84cad2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Aug 2021 20:51:40 +0100 Subject: io_uring: add build check for buf_index overflows req->buf_index is u16 and so we rely on registered buffers indexes fitting into it. Add a build check, so when the upper limit for the number of buffers is lifted we get a compliation fail but not lurking problems. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/787e8e1a17cea51ca6301426b1c4c4887b8bd676.1629920396.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 09194f7276ba..53326449d685 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10493,6 +10493,10 @@ static int __init io_uring_init(void) sizeof(struct io_uring_rsrc_update)); BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > sizeof(struct io_uring_rsrc_update2)); + + /* ->buf_index is u16 */ + BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); -- cgit From 2e480058ddc21ec53a10e8b41623e245e908bdbc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Aug 2021 11:33:19 -0600 Subject: io-wq: provide a way to limit max number of workers io-wq divides work into two categories: 1) Work that completes in a bounded time, like reading from a regular file or a block device. This type of work is limited based on the size of the SQ ring. 2) Work that may never complete, we call this unbounded work. The amount of workers here is just limited by RLIMIT_NPROC. For various uses cases, it's handy to have the kernel limit the maximum amount of pending workers for both categories. Provide a way to do with with a new IORING_REGISTER_IOWQ_MAX_WORKERS operation. IORING_REGISTER_IOWQ_MAX_WORKERS takes an array of two integers and sets the max worker count to what is being passed in for each category. The old values are returned into that same array. If 0 is being passed in for either category, it simply returns the current value. The value is capped at RLIMIT_NPROC. This actually isn't that important as it's more of a hint, if we're exceeding the value then our attempt to fork a new worker will fail. This happens naturally already if more than one node is in the system, as these values are per-node internally for io-wq. 
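For completeness, a userspace sketch of driving the new opcode (raw io_uring_register(2) syscall; the helper name and error handling are illustrative only):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* set new worker limits (0 means "leave unchanged"); prints the old ones */
static int set_iowq_limits(int ring_fd, unsigned int bounded, unsigned int unbounded)
{
        __u32 vals[2] = { bounded, unbounded };   /* [0] = bounded, [1] = unbounded */

        if (syscall(__NR_io_uring_register, ring_fd,
                    IORING_REGISTER_IOWQ_MAX_WORKERS, vals, 2) < 0)
                return -1;

        printf("previous limits: bounded=%u unbounded=%u\n", vals[0], vals[1]);
        return 0;
}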
Reported-by: Johannes Lundberg Link: https://github.com/axboe/liburing/issues/420 Signed-off-by: Jens Axboe --- fs/io-wq.c | 29 +++++++++++++++++++++++++++++ fs/io-wq.h | 1 + fs/io_uring.c | 32 ++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 3 +++ 4 files changed, 65 insertions(+) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 8da9bb103916..4b5fc621ab39 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1152,6 +1152,35 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) return 0; } +/* + * Set max number of unbounded workers, returns old value. If new_count is 0, + * then just return the old value. + */ +int io_wq_max_workers(struct io_wq *wq, int *new_count) +{ + int i, node, prev = 0; + + for (i = 0; i < 2; i++) { + if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) + new_count[i] = task_rlimit(current, RLIMIT_NPROC); + } + + rcu_read_lock(); + for_each_node(node) { + struct io_wqe_acct *acct; + + for (i = 0; i < 2; i++) { + acct = &wq->wqes[node]->acct[i]; + prev = max_t(int, acct->max_workers, prev); + if (new_count[i]) + acct->max_workers = new_count[i]; + new_count[i] = prev; + } + } + rcu_read_unlock(); + return 0; +} + static __init int io_wq_init(void) { int ret; diff --git a/fs/io-wq.h b/fs/io-wq.h index 308af3928424..bf5c4c533760 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -128,6 +128,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); +int io_wq_max_workers(struct io_wq *wq, int *new_count); static inline bool io_wq_is_hashed(struct io_wq_work *work) { diff --git a/fs/io_uring.c b/fs/io_uring.c index 53326449d685..edbda88142f9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10233,6 +10233,31 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) return io_wq_cpu_affinity(tctx->io_wq, NULL); } +static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, + void __user *arg) +{ + struct io_uring_task *tctx = current->io_uring; + __u32 new_count[2]; + int i, ret; + + if (!tctx || !tctx->io_wq) + return -EINVAL; + if (copy_from_user(new_count, arg, sizeof(new_count))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(new_count); i++) + if (new_count[i] > INT_MAX) + return -EINVAL; + + ret = io_wq_max_workers(tctx->io_wq, new_count); + if (ret) + return ret; + + if (copy_to_user(arg, new_count, sizeof(new_count))) + return -EFAULT; + + return 0; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -10250,6 +10275,7 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_BUFFERS_UPDATE: case IORING_REGISTER_IOWQ_AFF: case IORING_UNREGISTER_IOWQ_AFF: + case IORING_REGISTER_IOWQ_MAX_WORKERS: return false; default: return true; @@ -10406,6 +10432,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_iowq_aff(ctx); break; + case IORING_REGISTER_IOWQ_MAX_WORKERS: + ret = -EINVAL; + if (!arg || nr_args != 2) + break; + ret = io_register_iowq_max_workers(ctx, arg); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 45a4f2373694..64fe809c4e36 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -309,6 +309,9 @@ enum { IORING_REGISTER_IOWQ_AFF = 17, IORING_UNREGISTER_IOWQ_AFF = 18, + /* set/get max number of workers */ + IORING_REGISTER_IOWQ_MAX_WORKERS = 19, + /* this goes last */ IORING_REGISTER_LAST }; -- cgit From 
50c1df2b56e0f581b1dbf334dbf807d6fb8f77b2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Aug 2021 17:11:06 -0600 Subject: io_uring: support CLOCK_BOOTTIME/REALTIME for timeouts Certain use cases want to use CLOCK_BOOTTIME or CLOCK_REALTIME rather than CLOCK_MONOTONIC, instead of the default CLOCK_MONOTONIC. Add an IORING_TIMEOUT_BOOTTIME and IORING_TIMEOUT_REALTIME flag that allows timeouts and linked timeouts to use the selected clock source. Only one clock source may be selected, and we -EINVAL the request if more than one is given. If neither BOOTIME nor REALTIME are selected, the previous default of MONOTONIC is used. Link: https://github.com/axboe/liburing/issues/369 Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 ++++++++++++++++++++++++--- include/uapi/linux/io_uring.h | 3 +++ 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index edbda88142f9..1c99f0143b57 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -508,6 +508,7 @@ struct io_timeout_data { struct hrtimer timer; struct timespec64 ts; enum hrtimer_mode mode; + u32 flags; }; struct io_accept { @@ -5712,6 +5713,22 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return 0; } +static clockid_t io_timeout_get_clock(struct io_timeout_data *data) +{ + switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { + case IORING_TIMEOUT_BOOTTIME: + return CLOCK_BOOTTIME; + case IORING_TIMEOUT_REALTIME: + return CLOCK_REALTIME; + default: + /* can't happen, vetted at prep time */ + WARN_ON_ONCE(1); + fallthrough; + case 0: + return CLOCK_MONOTONIC; + } +} + static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) @@ -5725,7 +5742,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, req->timeout.off = 0; /* noseq */ data = req->async_data; list_add_tail(&req->timeout.list, &ctx->timeout_list); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); + hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); return 0; @@ -5807,7 +5824,10 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~IORING_TIMEOUT_ABS) + if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK)) + return -EINVAL; + /* more than one clock specified is invalid, obviously */ + if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; req->timeout.off = off; @@ -5819,12 +5839,13 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data = req->async_data; data->req = req; + data->flags = flags; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); if (is_timeout_link) { struct io_submit_link *link = &req->ctx->submit_state.link; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 64fe809c4e36..b6d28d927a3f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -151,6 +151,9 @@ enum { */ #define IORING_TIMEOUT_ABS (1U << 0) #define IORING_TIMEOUT_UPDATE (1U << 1) +#define IORING_TIMEOUT_BOOTTIME (1U << 2) +#define IORING_TIMEOUT_REALTIME (1U << 3) +#define 
IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) /* * sqe->splice_flags -- cgit From ef9dd637084d437463f5e9efa153dfc94e7e5f08 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 28 Aug 2021 19:54:38 -0600 Subject: io_uring: keep ltimeouts in a list A preparation patch. Keep all queued linked timeout in a list, so they may be found and updated. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1c99f0143b57..aa978292f34b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -375,6 +375,7 @@ struct io_ring_ctx { struct io_submit_state submit_state; struct list_head timeout_list; + struct list_head ltimeout_list; struct list_head cq_overflow_list; struct xarray io_buffers; struct xarray personalities; @@ -1277,6 +1278,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); + INIT_LIST_HEAD(&ctx->ltimeout_list); spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); @@ -1966,6 +1968,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) io_remove_next_linked(req); link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { + list_del(&link->timeout.list); io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); io_put_req_deferred(link); @@ -5830,6 +5833,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; + INIT_LIST_HEAD(&req->timeout.list); req->timeout.off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; @@ -6585,6 +6589,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!req_ref_inc_not_zero(prev)) prev = NULL; } + list_del(&req->timeout.list); req->timeout.prev = prev; spin_unlock_irqrestore(&ctx->timeout_lock, flags); @@ -6608,6 +6613,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req) data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + list_add_tail(&req->timeout.list, &ctx->ltimeout_list); } spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ @@ -8900,6 +8906,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) sock_release(ctx->ring_sock); } #endif + WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); -- cgit From f1042b6ccb887f07301f6b096b3d0cfcf9189323 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 28 Aug 2021 19:54:39 -0600 Subject: io_uring: allow updating linked timeouts We allow updating normal timeouts, add support for adjusting timings of linked timeouts as well. 
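A short userspace sketch covering both timeout additions together (raw SQE fields, my helper names; a linked timeout would be adjusted by passing the IORING_LINK_TIMEOUT_UPDATE flag used below in place of plain IORING_TIMEOUT_UPDATE):

#include <string.h>
#include <stdint.h>
#include <linux/io_uring.h>
#include <linux/time_types.h>

/* arm a relative timeout measured against CLOCK_BOOTTIME */
static void prep_boottime_timeout(struct io_uring_sqe *sqe,
                                  struct __kernel_timespec *ts, unsigned long long tag)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_TIMEOUT;
        sqe->addr = (uintptr_t)ts;
        sqe->len = 1;                                  /* exactly one timespec */
        sqe->timeout_flags = IORING_TIMEOUT_BOOTTIME;  /* default would be CLOCK_MONOTONIC */
        sqe->user_data = tag;
}

/* later: move the expiration of the timeout identified by `tag` */
static void prep_timeout_update(struct io_uring_sqe *sqe,
                                struct __kernel_timespec *new_ts, unsigned long long tag)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_TIMEOUT_REMOVE;
        sqe->addr = tag;                 /* user_data of the armed timeout */
        sqe->addr2 = (uintptr_t)new_ts;  /* new relative expiration */
        sqe->timeout_flags = IORING_TIMEOUT_UPDATE;
        sqe->user_data = tag + 1;        /* arbitrary distinct completion tag */
}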
Reported-by: Victor Stewart Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 44 +++++++++++++++++++++++++++++++++++++++---- include/uapi/linux/io_uring.h | 11 ++++++----- 2 files changed, 46 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index aa978292f34b..7cc458e0b636 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -552,6 +552,7 @@ struct io_timeout_rem { /* timeout update */ struct timespec64 ts; u32 flags; + bool ltimeout; }; struct io_rw { @@ -1069,6 +1070,7 @@ static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); +static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static struct kmem_cache *req_cachep; @@ -5732,6 +5734,31 @@ static clockid_t io_timeout_get_clock(struct io_timeout_data *data) } } +static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->timeout_lock) +{ + struct io_timeout_data *io; + struct io_kiocb *req; + bool found = false; + + list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { + found = user_data == req->user_data; + if (found) + break; + } + if (!found) + return -ENOENT; + + io = req->async_data; + if (hrtimer_try_to_cancel(&io->timer) == -1) + return -EALREADY; + hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); + io->timer.function = io_link_timeout_fn; + hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); + return 0; +} + static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) @@ -5763,10 +5790,15 @@ static int io_timeout_remove_prep(struct io_kiocb *req, if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; + tr->ltimeout = false; tr->addr = READ_ONCE(sqe->addr); tr->flags = READ_ONCE(sqe->timeout_flags); - if (tr->flags & IORING_TIMEOUT_UPDATE) { - if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) + if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { + if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) + return -EINVAL; + if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) + tr->ltimeout = true; + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL; if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) return -EFAULT; @@ -5800,9 +5832,13 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); } else { + enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); + spin_lock_irq(&ctx->timeout_lock); - ret = io_timeout_update(ctx, tr->addr, &tr->ts, - io_translate_timeout_mode(tr->flags)); + if (tr->ltimeout) + ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); + else + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); spin_unlock_irq(&ctx->timeout_lock); } diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b6d28d927a3f..3caec9199658 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -149,12 +149,13 @@ enum { /* * sqe->timeout_flags */ -#define IORING_TIMEOUT_ABS (1U << 0) -#define IORING_TIMEOUT_UPDATE (1U << 1) -#define IORING_TIMEOUT_BOOTTIME (1U << 2) -#define IORING_TIMEOUT_REALTIME (1U << 3) +#define IORING_TIMEOUT_ABS (1U << 0) +#define IORING_TIMEOUT_UPDATE (1U << 1) +#define IORING_TIMEOUT_BOOTTIME (1U 
<< 2) +#define IORING_TIMEOUT_REALTIME (1U << 3) +#define IORING_LINK_TIMEOUT_UPDATE (1U << 4) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) - +#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) /* * sqe->splice_flags * extends splice(2) flags -- cgit From ecc53c48c13d995e6fe5559e30ffee48d92784fd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 29 Aug 2021 16:13:03 -0600 Subject: io-wq: check max_worker limits if a worker transitions bound state For the two places where new workers are created, we diligently check if we are allowed to create a new worker. If we're currently at the limit of how many workers of a given type we can have, then we don't create any new ones. If you have a mixed workload with various types of bound and unbounded work, then it can happen that a worker finishes one type of work and is then transitioned to the other type. For this case, we don't check if we are actually allowed to do so. This can cause io-wq to temporarily exceed the allowed number of workers for a given type. When retrieving work, check that the types match. If they don't, check if we are allowed to transition to the other type. If not, then don't handle the new work. Cc: stable@vger.kernel.org Reported-by: Johannes Lundberg Signed-off-by: Jens Axboe --- fs/io-wq.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 4b5fc621ab39..da3ad45028f9 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -424,7 +424,28 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) spin_unlock(&wq->hash->wait.lock); } -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) +/* + * We can always run the work if the worker is currently the same type as + * the work (eg both are bound, or both are unbound). If they are not the + * same, only allow it if incrementing the worker count would be allowed. + */ +static bool io_worker_can_run_work(struct io_worker *worker, + struct io_wq_work *work) +{ + struct io_wqe_acct *acct; + + if (!(worker->flags & IO_WORKER_F_BOUND) != + !(work->flags & IO_WQ_WORK_UNBOUND)) + return true; + + /* not the same type, check if we'd go over the limit */ + acct = io_work_get_acct(worker->wqe, work); + return acct->nr_workers < acct->max_workers; +} + +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, + struct io_worker *worker, + bool *stalled) __must_hold(wqe->lock) { struct io_wq_work_node *node, *prev; @@ -436,6 +457,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) work = container_of(node, struct io_wq_work, list); + if (!io_worker_can_run_work(worker, work)) + break; + /* not hashed, can run anytime */ if (!io_wq_is_hashed(work)) { wq_list_del(&wqe->work_list, node, prev); @@ -462,6 +486,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) raw_spin_unlock(&wqe->lock); io_wait_on_hash(wqe, stall_hash); raw_spin_lock(&wqe->lock); + *stalled = true; } return NULL; @@ -501,6 +526,7 @@ static void io_worker_handle_work(struct io_worker *worker) do { struct io_wq_work *work; + bool stalled; get_next: /* * If we got some work, mark us as busy. If we didn't, but @@ -509,10 +535,11 @@ get_next: * can't make progress, any work completion or insertion will * clear the stalled flag. 
*/ - work = io_get_next_work(wqe); + stalled = false; + work = io_get_next_work(wqe, worker, &stalled); if (work) __io_worker_busy(wqe, worker, work); - else if (!wq_list_empty(&wqe->work_list)) + else if (stalled) wqe->flags |= IO_WQE_FLAG_STALLED; raw_spin_unlock_irq(&wqe->lock); -- cgit From a9a4aa9fbfc5b87f315c63d9a317648774a46879 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Aug 2021 06:33:08 -0600 Subject: io-wq: wqe and worker locks no longer need to be IRQ safe io_uring no longer queues async work off completion handlers that run in hard or soft interrupt context, and that use case was the only reason that io-wq had to use IRQ safe locks for wqe and worker locks. Signed-off-by: Jens Axboe --- fs/io-wq.c | 59 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 28 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index da3ad45028f9..13aeb48a0964 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -178,7 +178,7 @@ static void io_worker_exit(struct io_worker *worker) complete(&worker->ref_done); wait_for_completion(&worker->ref_done); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); @@ -188,7 +188,7 @@ static void io_worker_exit(struct io_worker *worker) worker->flags = 0; current->flags &= ~PF_IO_WORKER; preempt_enable(); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); kfree_rcu(worker, rcu); io_worker_ref_put(wqe->wq); @@ -254,14 +254,14 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) if (!ret) { bool do_create = false, first = false; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (acct->nr_workers < acct->max_workers) { if (!acct->nr_workers) first = true; acct->nr_workers++; do_create = true; } - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (do_create) { atomic_inc(&acct->nr_running); atomic_inc(&wqe->wq->worker_refs); @@ -289,14 +289,14 @@ static void create_worker_cb(struct callback_head *cb) wqe = worker->wqe; wq = wqe->wq; acct = &wqe->acct[worker->create_index]; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (acct->nr_workers < acct->max_workers) { if (!acct->nr_workers) first = true; acct->nr_workers++; do_create = true; } - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (do_create) { create_io_worker(wq, wqe, worker->create_index, first); } else { @@ -510,9 +510,9 @@ static void io_assign_current_work(struct io_worker *worker, cond_resched(); } - spin_lock_irq(&worker->lock); + spin_lock(&worker->lock); worker->cur_work = work; - spin_unlock_irq(&worker->lock); + spin_unlock(&worker->lock); } static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); @@ -542,7 +542,7 @@ get_next: else if (stalled) wqe->flags |= IO_WQE_FLAG_STALLED; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (!work) break; io_assign_current_work(worker, work); @@ -574,16 +574,16 @@ get_next: clear_bit(hash, &wq->hash->map); if (wq_has_sleeper(&wq->hash->wait)) wake_up(&wq->hash->wait); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); } } while (work); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); } while (1); } @@ -604,13 +604,13 @@ static int io_wqe_worker(void 
*data) set_current_state(TASK_INTERRUPTIBLE); loop: - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (io_wqe_run_queue(wqe)) { io_worker_handle_work(worker); goto loop; } __io_worker_idle(wqe, worker); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (io_flush_signals()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -629,7 +629,7 @@ loop: } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); io_worker_handle_work(worker); } @@ -671,9 +671,9 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - raw_spin_lock_irq(&worker->wqe->lock); + raw_spin_lock(&worker->wqe->lock); io_wqe_dec_running(worker); - raw_spin_unlock_irq(&worker->wqe->lock); + raw_spin_unlock(&worker->wqe->lock); } static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first) @@ -699,9 +699,9 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bo kfree(worker); fail: atomic_dec(&acct->nr_running); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); acct->nr_workers--; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); io_worker_ref_put(wq); return; } @@ -711,7 +711,7 @@ fail: set_cpus_allowed_ptr(tsk, wqe->cpu_mask); tsk->flags |= PF_NO_SETAFFINITY; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); list_add_tail_rcu(&worker->all_list, &wqe->all_list); worker->flags |= IO_WORKER_F_FREE; @@ -719,7 +719,7 @@ fail: worker->flags |= IO_WORKER_F_BOUND; if (first && (worker->flags & IO_WORKER_F_BOUND)) worker->flags |= IO_WORKER_F_FIXED; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); wake_up_new_task(tsk); } @@ -795,7 +795,6 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); int work_flags; - unsigned long flags; /* * If io-wq is exiting for this task, or if the request has explicitly @@ -808,10 +807,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) } work_flags = work->flags; - raw_spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock(&wqe->lock); io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; - raw_spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock(&wqe->lock); if ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running)) @@ -840,19 +839,18 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_cb_cancel_data *match = data; - unsigned long flags; /* * Hold the lock to avoid ->cur_work going out of scope, caller * may dereference the passed in work. 
*/ - spin_lock_irqsave(&worker->lock, flags); + spin_lock(&worker->lock); if (worker->cur_work && match->fn(worker->cur_work, match->data)) { set_notify_signal(worker->task); match->nr_running++; } - spin_unlock_irqrestore(&worker->lock, flags); + spin_unlock(&worker->lock); return match->nr_running && !match->cancel_all; } @@ -880,16 +878,15 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, { struct io_wq_work_node *node, *prev; struct io_wq_work *work; - unsigned long flags; retry: - raw_spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock(&wqe->lock); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; io_wqe_remove_pending(wqe, work, prev); - raw_spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock(&wqe->lock); io_run_cancel(work, wqe); match->nr_pending++; if (!match->cancel_all) @@ -898,7 +895,7 @@ retry: /* not safe to continue after unlock */ goto retry; } - raw_spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock(&wqe->lock); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, -- cgit From 87df7fb922d18e96992aa5e824aa34b2065fef59 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Aug 2021 07:45:47 -0600 Subject: io-wq: fix wakeup race when adding new work When new work is added, io_wqe_enqueue() checks if we need to wake or create a new worker. But that check is done outside the lock that otherwise synchronizes us with a worker going to sleep, so we can end up in the following situation:

CPU0                            CPU1
lock
insert work
unlock
atomic_read(nr_running) != 0
                                lock
                                atomic_dec(nr_running)
no wakeup needed

Hold the wqe lock around the "need to wakeup" check. Then we can also get rid of the temporary work_flags variable, as we know the work will remain valid as long as we hold the lock. Cc: stable@vger.kernel.org Reported-by: Andres Freund Signed-off-by: Jens Axboe --- fs/io-wq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 13aeb48a0964..cd9bd095fb1b 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -794,7 +794,7 @@ append: static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); - int work_flags; + bool do_wake; /* * If io-wq is exiting for this task, or if the request has explicitly @@ -806,14 +806,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; } - work_flags = work->flags; raw_spin_lock(&wqe->lock); io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; + do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) || + !atomic_read(&acct->nr_running); raw_spin_unlock(&wqe->lock); - if ((work_flags & IO_WQ_WORK_CONCURRENT) || - !atomic_read(&acct->nr_running)) + if (do_wake) io_wqe_wake_worker(wqe, acct); } -- cgit
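The pattern in that last fix, restated as a rough userspace sketch rather than kernel code (struct wq, enqueue_racy() and enqueue_fixed() are invented names for a hypothetical pthread-based worker pool whose idle transition is guarded by the same mutex): snapshot the wake-up condition inside the critical section instead of re-reading shared state after the unlock.

#include <pthread.h>
#include <stdbool.h>

struct wq {
	pthread_mutex_t lock;
	pthread_cond_t more_work;
	int nr_running;	/* workers currently busy, analogous to acct->nr_running */
	int nr_items;	/* queued work items, protected by ->lock */
};

/*
 * Racy shape, mirroring the old io_wqe_enqueue(): the wakeup decision
 * reads nr_running after the lock is dropped, so a worker can decrement
 * it and go idle in that window while the producer still sees the old
 * nonzero value and skips the wakeup.
 */
static void enqueue_racy(struct wq *q)
{
	pthread_mutex_lock(&q->lock);
	q->nr_items++;
	pthread_mutex_unlock(&q->lock);

	if (q->nr_running == 0)		/* unlocked check: lost-wakeup window */
		pthread_cond_signal(&q->more_work);
}

/*
 * Fixed shape, mirroring the patch: decide whether a wakeup is needed
 * while still holding the same lock the worker takes when it goes idle,
 * then act on the snapshot after unlocking.
 */
static void enqueue_fixed(struct wq *q)
{
	bool do_wake;

	pthread_mutex_lock(&q->lock);
	q->nr_items++;
	do_wake = (q->nr_running == 0);
	pthread_mutex_unlock(&q->lock);

	if (do_wake)
		pthread_cond_signal(&q->more_work);
}

In io-wq the "signal" is io_wqe_wake_worker() creating or waking a worker, and the sleeping side only re-checks state under wqe->lock, which is why the unlocked read could lose the only wakeup for the newly queued work.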