Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--   fs/io_uring.c   1865
1 file changed, 1088 insertions, 777 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index b42dfa0243bf..7e35283fc0b1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -205,6 +205,7 @@ struct fixed_file_ref_node {
struct list_head file_list;
struct fixed_file_data *file_data;
struct llist_node llist;
+ bool done;
};
struct fixed_file_data {
@@ -244,6 +245,8 @@ struct io_sq_data {
struct task_struct *thread;
struct wait_queue_head wait;
+
+ unsigned sq_thread_idle;
};
struct io_ring_ctx {
@@ -284,7 +287,6 @@ struct io_ring_ctx {
struct list_head timeout_list;
struct list_head cq_overflow_list;
- wait_queue_head_t inflight_wait;
struct io_uring_sqe *sq_sqes;
} ____cacheline_aligned_in_smp;
@@ -309,7 +311,6 @@ struct io_ring_ctx {
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
- struct wait_queue_entry sqo_wait_entry;
struct list_head sqd_list;
/*
@@ -394,16 +395,18 @@ struct io_ring_ctx {
*/
struct io_poll_iocb {
struct file *file;
- union {
- struct wait_queue_head *head;
- u64 addr;
- };
+ struct wait_queue_head *head;
__poll_t events;
bool done;
bool canceled;
struct wait_queue_entry wait;
};
+struct io_poll_remove {
+ struct file *file;
+ u64 addr;
+};
+
struct io_close {
struct file *file;
struct file *put_file;
@@ -443,11 +446,17 @@ struct io_timeout {
u32 off;
u32 target_seq;
struct list_head list;
+ /* head of the link, used by linked timeouts only */
+ struct io_kiocb *head;
};
struct io_timeout_rem {
struct file *file;
u64 addr;
+
+ /* timeout update */
+ struct timespec64 ts;
+ u32 flags;
};
struct io_rw {
@@ -478,6 +487,7 @@ struct io_sr_msg {
struct io_open {
struct file *file;
int dfd;
+ bool ignore_nonblock;
struct filename *filename;
struct open_how how;
unsigned long nofile;
@@ -539,6 +549,27 @@ struct io_statx {
struct statx __user *buffer;
};
+struct io_shutdown {
+ struct file *file;
+ int how;
+};
+
+struct io_rename {
+ struct file *file;
+ int old_dfd;
+ int new_dfd;
+ struct filename *oldpath;
+ struct filename *newpath;
+ int flags;
+};
+
+struct io_unlink {
+ struct file *file;
+ int dfd;
+ int flags;
+ struct filename *filename;
+};
+
struct io_completion {
struct file *file;
struct list_head list;
@@ -573,7 +604,6 @@ enum {
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
- REQ_F_LINK_HEAD_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
@@ -605,8 +635,6 @@ enum {
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
- /* head of a link */
- REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
@@ -649,6 +677,7 @@ struct io_kiocb {
struct file *file;
struct io_rw rw;
struct io_poll_iocb poll;
+ struct io_poll_remove poll_remove;
struct io_accept accept;
struct io_sync sync;
struct io_cancel cancel;
@@ -665,6 +694,9 @@ struct io_kiocb {
struct io_splice splice;
struct io_provide_buf pbuf;
struct io_statx statx;
+ struct io_shutdown shutdown;
+ struct io_rename rename;
+ struct io_unlink unlink;
/* use only after cleaning per-op data, see io_clean_op() */
struct io_completion compl;
};
@@ -684,15 +716,14 @@ struct io_kiocb {
struct task_struct *task;
u64 user_data;
- struct list_head link_list;
+ struct io_kiocb *link;
+ struct percpu_ref *fixed_file_refs;
/*
* 1. used with ctx->iopoll_list with reads/writes
* 2. to track reqs with ->files (see io_op_def::file_table)
*/
struct list_head inflight_entry;
-
- struct percpu_ref *fixed_file_refs;
struct callback_head task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
@@ -723,6 +754,8 @@ struct io_submit_state {
void *reqs[IO_IOPOLL_BATCH];
unsigned int free_reqs;
+ bool plug_started;
+
/*
* Batch completion logic
*/
@@ -733,7 +766,7 @@ struct io_submit_state {
*/
struct file *file;
unsigned int fd;
- unsigned int has_refs;
+ unsigned int file_refs;
unsigned int ios_left;
};
@@ -755,6 +788,8 @@ struct io_op_def {
unsigned buffer_select : 1;
/* must always have async data allocated */
unsigned needs_async_data : 1;
+ /* should block plug */
+ unsigned plug : 1;
/* size of async data needed, if any */
unsigned short async_size;
unsigned work_flags;
@@ -768,6 +803,7 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.buffer_select = 1,
.needs_async_data = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
@@ -777,6 +813,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_async_data = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
@@ -789,6 +826,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
@@ -797,6 +835,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
IO_WQ_WORK_MM,
@@ -816,8 +855,7 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
@@ -826,15 +864,17 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
.work_flags = IO_WQ_WORK_MM,
},
- [IORING_OP_TIMEOUT_REMOVE] = {},
+ [IORING_OP_TIMEOUT_REMOVE] = {
+ /* used by timeout updates' prep() */
+ .work_flags = IO_WQ_WORK_MM,
+ },
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -861,7 +901,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ IO_WQ_WORK_FS | IO_WQ_WORK_MM,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
@@ -880,6 +920,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
@@ -887,6 +928,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
@@ -913,7 +955,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT2] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
- IO_WQ_WORK_BLKCG,
+ IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
@@ -932,6 +974,17 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
+ [IORING_OP_SHUTDOWN] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_RENAMEAT] = {
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
+ [IORING_OP_UNLINKAT] = {
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
};
enum io_mem_account {
@@ -981,6 +1034,9 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+#define io_for_each_link(pos, head) \
+ for (pos = (head); pos; pos = pos->link)
+
static inline void io_clean_op(struct io_kiocb *req)
{
if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
@@ -988,35 +1044,127 @@ static inline void io_clean_op(struct io_kiocb *req)
__io_clean_op(req);
}
-static void io_sq_thread_drop_mm(void)
+static inline void io_set_resource_node(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!req->fixed_file_refs) {
+ req->fixed_file_refs = &ctx->file_data->node->refs;
+ percpu_ref_get(req->fixed_file_refs);
+ }
+}
+
+static bool io_match_task(struct io_kiocb *head,
+ struct task_struct *task,
+ struct files_struct *files)
{
+ struct io_kiocb *req;
+
+ if (task && head->task != task)
+ return false;
+ if (!files)
+ return true;
+
+ io_for_each_link(req, head) {
+ if ((req->flags & REQ_F_WORK_INITIALIZED) &&
+ (req->work.flags & IO_WQ_WORK_FILES) &&
+ req->work.identity->files == files)
+ return true;
+ }
+ return false;
+}
+
+static void io_sq_thread_drop_mm_files(void)
+{
+ struct files_struct *files = current->files;
struct mm_struct *mm = current->mm;
if (mm) {
kthread_unuse_mm(mm);
mmput(mm);
+ current->mm = NULL;
+ }
+ if (files) {
+ struct nsproxy *nsproxy = current->nsproxy;
+
+ task_lock(current);
+ current->files = NULL;
+ current->nsproxy = NULL;
+ task_unlock(current);
+ put_files_struct(files);
+ put_nsproxy(nsproxy);
}
}
-static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
+static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
{
- if (!current->mm) {
- if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
- !ctx->sqo_task->mm ||
- !mmget_not_zero(ctx->sqo_task->mm)))
- return -EFAULT;
- kthread_use_mm(ctx->sqo_task->mm);
- }
+ if (!current->files) {
+ struct files_struct *files;
+ struct nsproxy *nsproxy;
+ task_lock(ctx->sqo_task);
+ files = ctx->sqo_task->files;
+ if (!files) {
+ task_unlock(ctx->sqo_task);
+ return -EOWNERDEAD;
+ }
+ atomic_inc(&files->count);
+ get_nsproxy(ctx->sqo_task->nsproxy);
+ nsproxy = ctx->sqo_task->nsproxy;
+ task_unlock(ctx->sqo_task);
+
+ task_lock(current);
+ current->files = files;
+ current->nsproxy = nsproxy;
+ task_unlock(current);
+ }
return 0;
}
-static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
{
- if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
+ struct mm_struct *mm;
+
+ if (current->mm)
+ return 0;
+
+ /* Should never happen */
+ if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL)))
+ return -EFAULT;
+
+ task_lock(ctx->sqo_task);
+ mm = ctx->sqo_task->mm;
+ if (unlikely(!mm || !mmget_not_zero(mm)))
+ mm = NULL;
+ task_unlock(ctx->sqo_task);
+
+ if (mm) {
+ kthread_use_mm(mm);
return 0;
- return __io_sq_thread_acquire_mm(ctx);
+ }
+
+ return -EFAULT;
+}
+
+static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ int ret;
+
+ if (def->work_flags & IO_WQ_WORK_MM) {
+ ret = __io_sq_thread_acquire_mm(ctx);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
+ ret = __io_sq_thread_acquire_files(ctx);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ return 0;
}
static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
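With the doubly linked link_list replaced by a single ->link pointer, walking a
submission chain becomes a plain pointer chase through the new io_for_each_link()
helper, as io_match_task() above shows. A minimal sketch of such a walk, written
as if it lived in this file (io_link_count() is a hypothetical helper, not part of
this patch), mirroring the counting loop that io_get_sequence() switches to
further down:

static unsigned int io_link_count(struct io_kiocb *head)
{
	struct io_kiocb *cur;
	unsigned int nr = 0;

	/* the head itself is counted; ->link of the last request is NULL */
	io_for_each_link(cur, head)
		nr++;
	return nr;
}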
@@ -1159,7 +1307,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
- init_waitqueue_head(&ctx->inflight_wait);
spin_lock_init(&ctx->inflight_lock);
INIT_LIST_HEAD(&ctx->inflight_list);
INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
@@ -1269,14 +1416,17 @@ static bool io_identity_cow(struct io_kiocb *req)
*/
io_init_identity(id);
if (creds)
- req->work.identity->creds = creds;
+ id->creds = creds;
/* add one for this request */
refcount_inc(&id->count);
- /* drop old identity, assign new one. one ref for req, one for tctx */
- if (req->work.identity != tctx->identity &&
- refcount_sub_and_test(2, &req->work.identity->count))
+ /* drop tctx and req identity references, if needed */
+ if (tctx->identity != &tctx->__identity &&
+ refcount_dec_and_test(&tctx->identity->count))
+ kfree(tctx->identity);
+ if (req->work.identity != &tctx->__identity &&
+ refcount_dec_and_test(&req->work.identity->count))
kfree(req->work.identity);
req->work.identity = id;
@@ -1295,22 +1445,6 @@ static bool io_grab_identity(struct io_kiocb *req)
return false;
req->work.flags |= IO_WQ_WORK_FSIZE;
}
-
- if (!(req->work.flags & IO_WQ_WORK_FILES) &&
- (def->work_flags & IO_WQ_WORK_FILES) &&
- !(req->flags & REQ_F_NO_FILE_TABLE)) {
- if (id->files != current->files ||
- id->nsproxy != current->nsproxy)
- return false;
- atomic_inc(&id->files->count);
- get_nsproxy(id->nsproxy);
- req->flags |= REQ_F_INFLIGHT;
-
- spin_lock_irq(&ctx->inflight_lock);
- list_add(&req->inflight_entry, &ctx->inflight_list);
- spin_unlock_irq(&ctx->inflight_lock);
- req->work.flags |= IO_WQ_WORK_FILES;
- }
#ifdef CONFIG_BLK_CGROUP
if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
(def->work_flags & IO_WQ_WORK_BLKCG)) {
@@ -1352,6 +1486,21 @@ static bool io_grab_identity(struct io_kiocb *req)
}
spin_unlock(&current->fs->lock);
}
+ if (!(req->work.flags & IO_WQ_WORK_FILES) &&
+ (def->work_flags & IO_WQ_WORK_FILES) &&
+ !(req->flags & REQ_F_NO_FILE_TABLE)) {
+ if (id->files != current->files ||
+ id->nsproxy != current->nsproxy)
+ return false;
+ atomic_inc(&id->files->count);
+ get_nsproxy(id->nsproxy);
+ req->flags |= REQ_F_INFLIGHT;
+
+ spin_lock_irq(&ctx->inflight_lock);
+ list_add(&req->inflight_entry, &ctx->inflight_list);
+ spin_unlock_irq(&ctx->inflight_lock);
+ req->work.flags |= IO_WQ_WORK_FILES;
+ }
return true;
}
@@ -1365,6 +1514,9 @@ static void io_prep_async_work(struct io_kiocb *req)
io_req_init_async(req);
id = req->work.identity;
+ if (req->flags & REQ_F_FORCE_ASYNC)
+ req->work.flags |= IO_WQ_WORK_CONCURRENT;
+
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
@@ -1396,10 +1548,8 @@ static void io_prep_async_link(struct io_kiocb *req)
{
struct io_kiocb *cur;
- io_prep_async_work(req);
- if (req->flags & REQ_F_LINK_HEAD)
- list_for_each_entry(cur, &req->link_list, link_list)
- io_prep_async_work(cur);
+ io_for_each_link(cur, req)
+ io_prep_async_work(cur);
}
static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
@@ -1440,30 +1590,18 @@ static void io_kill_timeout(struct io_kiocb *req)
}
}
-static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!tsk || req->task == tsk)
- return true;
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (ctx->sq_data && req->task == ctx->sq_data->thread)
- return true;
- }
- return false;
-}
-
/*
* Returns true if we found and killed one or more timeouts
*/
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
+ struct files_struct *files)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
- if (io_task_match(req, tsk)) {
+ if (io_match_task(req, tsk, files)) {
io_kill_timeout(req);
canceled++;
}
@@ -1555,6 +1693,11 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
return io_wq_current_is_worker();
}
+static inline unsigned __io_cqring_events(struct io_ring_ctx *ctx)
+{
+ return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
+}
+
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
if (waitqueue_active(&ctx->wait))
@@ -1565,26 +1708,6 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
eventfd_signal(ctx->cq_ev_fd, 1);
}
-static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
-{
- if (list_empty(&ctx->cq_overflow_list)) {
- clear_bit(0, &ctx->sq_check_overflow);
- clear_bit(0, &ctx->cq_check_overflow);
- ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
- }
-}
-
-static inline bool io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- if (!files)
- return true;
- if ((req->flags & REQ_F_WORK_INITIALIZED) &&
- (req->work.flags & IO_WQ_WORK_FILES))
- return req->work.identity->files == files;
- return false;
-}
-
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
struct task_struct *tsk,
@@ -1594,27 +1717,15 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
struct io_kiocb *req, *tmp;
struct io_uring_cqe *cqe;
unsigned long flags;
+ bool all_flushed;
LIST_HEAD(list);
- if (!force) {
- if (list_empty_careful(&ctx->cq_overflow_list))
- return true;
- if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
- rings->cq_ring_entries))
- return false;
- }
+ if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
+ return false;
spin_lock_irqsave(&ctx->completion_lock, flags);
-
- /* if force is set, the ring is going away. always drop after that */
- if (force)
- ctx->cq_overflow_flushed = 1;
-
- cqe = NULL;
list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
- if (tsk && req->task != tsk)
- continue;
- if (!io_match_files(req, files))
+ if (!io_match_task(req, tsk, files))
continue;
cqe = io_get_cqring(ctx);
@@ -1633,9 +1744,14 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
}
}
- io_commit_cqring(ctx);
- io_cqring_mark_overflow(ctx);
+ all_flushed = list_empty(&ctx->cq_overflow_list);
+ if (all_flushed) {
+ clear_bit(0, &ctx->sq_check_overflow);
+ clear_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+ }
+ io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
@@ -1645,7 +1761,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
io_put_req(req);
}
- return cqe != NULL;
+ return all_flushed;
}
static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
@@ -1665,7 +1781,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) {
+ } else if (ctx->cq_overflow_flushed ||
+ atomic_read(&req->task->io_uring->in_idle)) {
/*
* If we're in ring overflow flush mode, or in task cancel mode,
* then we cannot store the request for later flushing, we need
@@ -1809,9 +1926,7 @@ fallback:
static inline void io_put_file(struct io_kiocb *req, struct file *file,
bool fixed)
{
- if (fixed)
- percpu_ref_put(req->fixed_file_refs);
- else
+ if (!fixed)
fput(file);
}
@@ -1823,7 +1938,8 @@ static void io_dismantle_req(struct io_kiocb *req)
kfree(req->async_data);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
-
+ if (req->fixed_file_refs)
+ percpu_ref_put(req->fixed_file_refs);
io_req_clean_work(req);
}
@@ -1835,7 +1951,7 @@ static void __io_free_req(struct io_kiocb *req)
io_dismantle_req(req);
percpu_counter_dec(&tctx->inflight);
- if (tctx->in_idle)
+ if (atomic_read(&tctx->in_idle))
wake_up(&tctx->wait);
put_task_struct(req->task);
@@ -1846,96 +1962,66 @@ static void __io_free_req(struct io_kiocb *req)
percpu_ref_put(&ctx->refs);
}
-static bool io_link_cancel_timeout(struct io_kiocb *req)
+static inline void io_remove_next_linked(struct io_kiocb *req)
{
- struct io_timeout_data *io = req->async_data;
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
+ struct io_kiocb *nxt = req->link;
- ret = hrtimer_try_to_cancel(&io->timer);
- if (ret != -1) {
- io_cqring_fill_event(req, -ECANCELED);
- io_commit_cqring(ctx);
- req->flags &= ~REQ_F_LINK_HEAD;
- io_put_req_deferred(req, 1);
- return true;
- }
-
- return false;
+ req->link = nxt->link;
+ nxt->link = NULL;
}
-static bool __io_kill_linked_timeout(struct io_kiocb *req)
+static void io_kill_linked_timeout(struct io_kiocb *req)
{
+ struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link;
- bool wake_ev;
+ bool cancelled = false;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ link = req->link;
- if (list_empty(&req->link_list))
- return false;
- link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- if (link->opcode != IORING_OP_LINK_TIMEOUT)
- return false;
/*
* Can happen if a linked timeout fired and link had been like
* req -> link t-out -> link t-out [-> ...]
*/
- if (!(link->flags & REQ_F_LTIMEOUT_ACTIVE))
- return false;
+ if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
+ struct io_timeout_data *io = link->async_data;
+ int ret;
- list_del_init(&link->link_list);
- wake_ev = io_link_cancel_timeout(link);
+ io_remove_next_linked(req);
+ link->timeout.head = NULL;
+ ret = hrtimer_try_to_cancel(&io->timer);
+ if (ret != -1) {
+ io_cqring_fill_event(link, -ECANCELED);
+ io_commit_cqring(ctx);
+ cancelled = true;
+ }
+ }
req->flags &= ~REQ_F_LINK_TIMEOUT;
- return wake_ev;
-}
-
-static void io_kill_linked_timeout(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
- bool wake_ev;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- wake_ev = __io_kill_linked_timeout(req);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- if (wake_ev)
+ if (cancelled) {
io_cqring_ev_posted(ctx);
+ io_put_req(link);
+ }
}
-static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt;
-
- /*
- * The list should never be empty when we are called here. But could
- * potentially happen if the chain is messed up, check to be on the
- * safe side.
- */
- if (unlikely(list_empty(&req->link_list)))
- return NULL;
-
- nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- list_del_init(&req->link_list);
- if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK_HEAD;
- return nxt;
-}
-/*
- * Called if REQ_F_LINK_HEAD is set, and we fail the head request
- */
static void io_fail_links(struct io_kiocb *req)
{
+ struct io_kiocb *link, *nxt;
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- while (!list_empty(&req->link_list)) {
- struct io_kiocb *link = list_first_entry(&req->link_list,
- struct io_kiocb, link_list);
+ link = req->link;
+ req->link = NULL;
- list_del_init(&link->link_list);
- trace_io_uring_fail_link(req, link);
+ while (link) {
+ nxt = link->link;
+ link->link = NULL;
+ trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
/*
@@ -1947,8 +2033,8 @@ static void io_fail_links(struct io_kiocb *req)
io_put_req_deferred(link, 2);
else
io_double_put_req(link);
+ link = nxt;
}
-
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -1957,7 +2043,6 @@ static void io_fail_links(struct io_kiocb *req)
static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
{
- req->flags &= ~REQ_F_LINK_HEAD;
if (req->flags & REQ_F_LINK_TIMEOUT)
io_kill_linked_timeout(req);
@@ -1967,20 +2052,24 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (likely(!(req->flags & REQ_F_FAIL_LINK)))
- return io_req_link_next(req);
+ if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
+ struct io_kiocb *nxt = req->link;
+
+ req->link = NULL;
+ return nxt;
+ }
io_fail_links(req);
return NULL;
}
-static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
- if (likely(!(req->flags & REQ_F_LINK_HEAD)))
+ if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT)))
return NULL;
return __io_req_find_next(req);
}
-static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
+static int io_req_task_work_add(struct io_kiocb *req)
{
struct task_struct *tsk = req->task;
struct io_ring_ctx *ctx = req->ctx;
@@ -1997,7 +2086,7 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
* will do the job.
*/
notify = TWA_NONE;
- if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
+ if (!(ctx->flags & IORING_SETUP_SQPOLL))
notify = TWA_SIGNAL;
ret = task_work_add(tsk, &req->task_work, notify);
@@ -2034,7 +2123,8 @@ static void __io_req_task_submit(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- if (!__io_sq_thread_acquire_mm(ctx)) {
+ if (!__io_sq_thread_acquire_mm(ctx) &&
+ !__io_sq_thread_acquire_files(ctx)) {
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(req, NULL);
mutex_unlock(&ctx->uring_lock);
@@ -2059,7 +2149,7 @@ static void io_req_task_queue(struct io_kiocb *req)
init_task_work(&req->task_work, io_req_task_submit);
percpu_ref_get(&req->ctx->refs);
- ret = io_req_task_work_add(req, true);
+ ret = io_req_task_work_add(req);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -2070,7 +2160,7 @@ static void io_req_task_queue(struct io_kiocb *req)
}
}
-static void io_queue_next(struct io_kiocb *req)
+static inline void io_queue_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = io_req_find_next(req);
@@ -2127,8 +2217,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
io_free_req(req);
return;
}
- if (req->flags & REQ_F_LINK_HEAD)
- io_queue_next(req);
+ io_queue_next(req);
if (req->task != rb->task) {
if (rb->task) {
@@ -2181,7 +2270,7 @@ static void io_free_req_deferred(struct io_kiocb *req)
int ret;
init_task_work(&req->task_work, io_put_req_deferred_cb);
- ret = io_req_task_work_add(req, true);
+ ret = io_req_task_work_add(req);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -2222,15 +2311,13 @@ static void io_double_put_req(struct io_kiocb *req)
static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
{
- struct io_rings *rings = ctx->rings;
-
if (test_bit(0, &ctx->cq_check_overflow)) {
/*
* noflush == true is from the waitqueue handler, just ensure
* we wake up the task, and the next invocation will flush the
* entries. We cannot safely do it from here.
*/
- if (noflush && !list_empty(&ctx->cq_overflow_list))
+ if (noflush)
return -1U;
io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@ -2238,7 +2325,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
/* See comment at the top of this file */
smp_rmb();
- return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
+ return __io_cqring_events(ctx);
}
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
@@ -2562,7 +2649,6 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
}
end_req:
req_set_fail_links(req);
- io_req_complete(req, ret);
return false;
}
#endif
@@ -2578,7 +2664,7 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
return false;
- ret = io_sq_thread_acquire_mm(req->ctx, req);
+ ret = io_sq_thread_acquire_mm_files(req->ctx, req);
if (io_resubmit_prep(req, ret)) {
refcount_inc(&req->refs);
@@ -2626,7 +2712,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_iopoll_getevents() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2655,21 +2741,25 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
else
list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
- if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+ /*
+ * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
+ * task context or in io worker task context. If current task context is
+ * sq thread, we don't need to check whether we should wake up the sq thread.
+ */
+ if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
wq_has_sleeper(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
}
-static void __io_state_file_put(struct io_submit_state *state)
+static inline void __io_state_file_put(struct io_submit_state *state)
{
- if (state->has_refs)
- fput_many(state->file, state->has_refs);
- state->file = NULL;
+ fput_many(state->file, state->file_refs);
+ state->file_refs = 0;
}
static inline void io_state_file_put(struct io_submit_state *state)
{
- if (state->file)
+ if (state->file_refs)
__io_state_file_put(state);
}
@@ -2683,29 +2773,25 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
if (!state)
return fget(fd);
- if (state->file) {
+ if (state->file_refs) {
if (state->fd == fd) {
- state->has_refs--;
+ state->file_refs--;
return state->file;
}
__io_state_file_put(state);
}
state->file = fget_many(fd, state->ios_left);
- if (!state->file)
+ if (unlikely(!state->file))
return NULL;
state->fd = fd;
- state->has_refs = state->ios_left - 1;
+ state->file_refs = state->ios_left - 1;
return state->file;
}
static bool io_bdev_nowait(struct block_device *bdev)
{
-#ifdef CONFIG_BLOCK
return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
-#else
- return true;
-#endif
}
/*
@@ -2718,14 +2804,16 @@ static bool io_file_supports_async(struct file *file, int rw)
umode_t mode = file_inode(file)->i_mode;
if (S_ISBLK(mode)) {
- if (io_bdev_nowait(file->f_inode->i_bdev))
+ if (IS_ENABLED(CONFIG_BLOCK) &&
+ io_bdev_nowait(I_BDEV(file->f_mapping->host)))
return true;
return false;
}
if (S_ISCHR(mode) || S_ISSOCK(mode))
return true;
if (S_ISREG(mode)) {
- if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
+ if (IS_ENABLED(CONFIG_BLOCK) &&
+ io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
file->f_op != &io_uring_fops)
return true;
return false;
@@ -3037,9 +3125,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
iov[0].iov_len = kbuf->len;
return 0;
}
- if (!req->rw.len)
- return 0;
- else if (req->rw.len > 1)
+ if (req->rw.len != 1)
return -EINVAL;
#ifdef CONFIG_COMPAT
@@ -3050,7 +3136,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, needs_lock);
}
-static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock)
{
@@ -3079,7 +3165,7 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
- return ret < 0 ? ret : sqe_len;
+ return ret;
}
if (req->flags & REQ_F_BUFFER_SELECT) {
@@ -3096,18 +3182,6 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
req->ctx->compat);
}
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct iov_iter *iter,
- bool needs_lock)
-{
- struct io_async_rw *iorw = req->async_data;
-
- if (!iorw)
- return __io_import_iovec(rw, req, iovec, iter, needs_lock);
- *iovec = NULL;
- return iov_iter_count(&iorw->iter);
-}
-
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
@@ -3177,7 +3251,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
rw->free_iovec = iovec;
rw->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */
- if (iter->type == ITER_BVEC)
+ if (iov_iter_is_bvec(iter))
return;
if (!iovec) {
unsigned iov_off = 0;
@@ -3231,7 +3305,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
struct iovec *iov = iorw->fast_iov;
ssize_t ret;
- ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
+ ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
if (unlikely(ret < 0))
return ret;
@@ -3290,7 +3364,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
/* submit ref gets dropped, acquire a new one */
refcount_inc(&req->refs);
- ret = io_req_task_work_add(req, true);
+ ret = io_req_task_work_add(req);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -3364,17 +3438,17 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
ssize_t io_size, ret, ret2;
- size_t iov_count;
bool no_async;
- if (rw)
+ if (rw) {
iter = &rw->iter;
-
- ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- iov_count = iov_iter_count(iter);
- io_size = ret;
+ iovec = NULL;
+ } else {
+ ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
+ if (ret < 0)
+ return ret;
+ }
+ io_size = iov_iter_count(iter);
req->result = io_size;
ret = 0;
@@ -3390,7 +3464,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
if (no_async)
goto copy_iov;
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
+ ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
if (unlikely(ret))
goto out_free;
@@ -3409,7 +3483,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
if (req->file->f_flags & O_NONBLOCK)
goto done;
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
goto copy_iov;
} else if (ret < 0) {
@@ -3492,17 +3566,17 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
- size_t iov_count;
ssize_t ret, ret2, io_size;
- if (rw)
+ if (rw) {
iter = &rw->iter;
-
- ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- iov_count = iov_iter_count(iter);
- io_size = ret;
+ iovec = NULL;
+ } else {
+ ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
+ if (ret < 0)
+ return ret;
+ }
+ io_size = iov_iter_count(iter);
req->result = io_size;
/* Ensure we clear previously set non-block flag */
@@ -3520,7 +3594,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
(req->flags & REQ_F_ISREG))
goto copy_iov;
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
+ ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
if (unlikely(ret))
goto out_free;
@@ -3532,8 +3606,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
* we return to userspace.
*/
if (req->flags & REQ_F_ISREG) {
- __sb_start_write(file_inode(req->file)->i_sb,
- SB_FREEZE_WRITE, true);
+ sb_start_write(file_inode(req->file)->i_sb);
__sb_writers_release(file_inode(req->file)->i_sb,
SB_FREEZE_WRITE);
}
@@ -3564,7 +3637,7 @@ done:
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
if (!ret)
return -EAGAIN;
@@ -3576,6 +3649,137 @@ out_free:
return ret;
}
+static int io_renameat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_rename *ren = &req->rename;
+ const char __user *oldf, *newf;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ ren->old_dfd = READ_ONCE(sqe->fd);
+ oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ ren->new_dfd = READ_ONCE(sqe->len);
+ ren->flags = READ_ONCE(sqe->rename_flags);
+
+ ren->oldpath = getname(oldf);
+ if (IS_ERR(ren->oldpath))
+ return PTR_ERR(ren->oldpath);
+
+ ren->newpath = getname(newf);
+ if (IS_ERR(ren->newpath)) {
+ putname(ren->oldpath);
+ return PTR_ERR(ren->newpath);
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_renameat(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_rename *ren = &req->rename;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
+ ren->newpath, ren->flags);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_unlinkat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_unlink *un = &req->unlink;
+ const char __user *fname;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ un->dfd = READ_ONCE(sqe->fd);
+
+ un->flags = READ_ONCE(sqe->unlink_flags);
+ if (un->flags & ~AT_REMOVEDIR)
+ return -EINVAL;
+
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ un->filename = getname(fname);
+ if (IS_ERR(un->filename))
+ return PTR_ERR(un->filename);
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_unlinkat(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_unlink *un = &req->unlink;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ if (un->flags & AT_REMOVEDIR)
+ ret = do_rmdir(un->dfd, un->filename);
+ else
+ ret = do_unlinkat(un->dfd, un->filename);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_shutdown_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+#if defined(CONFIG_NET)
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+ sqe->buf_index)
+ return -EINVAL;
+
+ req->shutdown.how = READ_ONCE(sqe->len);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int io_shutdown(struct io_kiocb *req, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ struct socket *sock;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ ret = __sys_shutdown_sock(sock, req->shutdown.how);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
static int __io_splice_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
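The three new opcodes are driven purely by SQE fields that the prep helpers above
read: for IORING_OP_UNLINKAT, fd carries the directory fd, addr the pathname, and
unlink_flags the AT_* flags (only AT_REMOVEDIR is accepted); IORING_OP_RENAMEAT
and IORING_OP_SHUTDOWN follow the same pattern with the fields their prep helpers
read. A rough userspace sketch for unlinkat using raw SQE setup
(prep_unlinkat_sqe() is a made-up helper, not a kernel or liburing API):

#include <string.h>
#include <fcntl.h>		/* AT_REMOVEDIR */
#include <linux/io_uring.h>

static void prep_unlinkat_sqe(struct io_uring_sqe *sqe, int dfd,
			      const char *path, int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_UNLINKAT;
	sqe->fd = dfd;				/* read as un->dfd */
	sqe->addr = (unsigned long) path;	/* un->filename via getname() */
	sqe->unlink_flags = flags;		/* 0 or AT_REMOVEDIR */
}

Recent liburing versions provide io_uring_prep_unlinkat()/io_uring_prep_renameat()
wrappers that perform the same field mapping.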
@@ -3781,6 +3985,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return ret;
}
req->open.nofile = rlimit(RLIMIT_NOFILE);
+ req->open.ignore_nonblock = false;
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -3789,7 +3994,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
u64 flags, mode;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
mode = READ_ONCE(sqe->len);
flags = READ_ONCE(sqe->open_flags);
@@ -3803,7 +4008,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
size_t len;
int ret;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
@@ -3824,7 +4029,7 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)
struct file *file;
int ret;
- if (force_nonblock)
+ if (force_nonblock && !req->open.ignore_nonblock)
return -EAGAIN;
ret = build_open_flags(&req->open.how, &op);
@@ -3839,6 +4044,21 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)
if (IS_ERR(file)) {
put_unused_fd(ret);
ret = PTR_ERR(file);
+ /*
+ * A work-around to ensure that /proc/self works the way
+ * that it should - if we get -EOPNOTSUPP back, then assume
+ * that proc_self_get_link() failed us because we're in async
+ * context. We should be safe to retry this from the task
+ * itself with force_nonblock == false set, as it should not
+ * block on lookup. Would be nice to know this upfront and
+ * avoid the async dance, but doesn't seem feasible.
+ */
+ if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) {
+ req->open.ignore_nonblock = true;
+ refcount_inc(&req->refs);
+ io_req_task_queue(req);
+ return 0;
+ }
} else {
fsnotify_open(file);
fd_install(ret, file);
@@ -3918,11 +4138,17 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
head = idr_find(&ctx->io_buffer_idr, p->bgid);
if (head)
ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
-
- io_ring_submit_lock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+
+ /* need to hold the lock to complete IOPOLL requests */
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ __io_req_complete(req, ret, 0, cs);
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ } else {
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ __io_req_complete(req, ret, 0, cs);
+ }
return 0;
}
@@ -4007,10 +4233,17 @@ static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
}
}
out:
- io_ring_submit_unlock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+
+ /* need to hold the lock to complete IOPOLL requests */
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ __io_req_complete(req, ret, 0, cs);
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ } else {
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ __io_req_complete(req, ret, 0, cs);
+ }
return 0;
}
@@ -4182,7 +4415,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_NO_CANCEL;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
@@ -4206,7 +4439,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock,
/* might be already done during nonblock submission */
if (!close->put_file) {
- ret = __close_fd_get_file(close->fd, &close->put_file);
+ ret = close_fd_get_file(close->fd, &close->put_file);
if (ret < 0)
return (ret == -ENOENT) ? -EBADF : ret;
}
@@ -4326,9 +4559,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
unsigned flags;
int ret;
- sock = sock_from_file(req->file, &ret);
+ sock = sock_from_file(req->file);
if (unlikely(!sock))
- return ret;
+ return -ENOTSOCK;
if (req->async_data) {
kmsg = req->async_data;
@@ -4375,9 +4608,9 @@ static int io_send(struct io_kiocb *req, bool force_nonblock,
unsigned flags;
int ret;
- sock = sock_from_file(req->file, &ret);
+ sock = sock_from_file(req->file);
if (unlikely(!sock))
- return ret;
+ return -ENOTSOCK;
ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
if (unlikely(ret))
@@ -4469,7 +4702,8 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
return -EFAULT;
if (clen < 0)
return -EINVAL;
- sr->len = iomsg->iov[0].iov_len;
+ sr->len = clen;
+ iomsg->iov[0].iov_len = clen;
iomsg->iov = NULL;
} else {
ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
@@ -4554,9 +4788,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
unsigned flags;
int ret, cflags = 0;
- sock = sock_from_file(req->file, &ret);
+ sock = sock_from_file(req->file);
if (unlikely(!sock))
- return ret;
+ return -ENOTSOCK;
if (req->async_data) {
kmsg = req->async_data;
@@ -4617,9 +4851,9 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
unsigned flags;
int ret, cflags = 0;
- sock = sock_from_file(req->file, &ret);
+ sock = sock_from_file(req->file);
if (unlikely(!sock))
- return ret;
+ return -ENOTSOCK;
if (req->flags & REQ_F_BUFFER_SELECT) {
kbuf = io_recv_buffer_select(req, !force_nonblock);
@@ -4663,7 +4897,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_accept *accept = &req->accept;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->len || sqe->buf_index)
return -EINVAL;
@@ -4704,7 +4938,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_connect *conn = &req->connect;
struct io_async_connect *io = req->async_data;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
return -EINVAL;
@@ -4828,7 +5062,6 @@ struct io_poll_table {
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
- bool twa_signal_ok;
int ret;
/* for instances that support it check for an event match first: */
@@ -4844,20 +5077,12 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
percpu_ref_get(&req->ctx->refs);
/*
- * If we using the signalfd wait_queue_head for this wakeup, then
- * it's not safe to use TWA_SIGNAL as we could be recursing on the
- * tsk->sighand->siglock on doing the wakeup. Should not be needed
- * either, as the normal wakeup will suffice.
- */
- twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
-
- /*
* If this fails, then the task is exiting. When a task exits, the
* work gets canceled, so just cancel this request as well instead
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- ret = io_req_task_work_add(req, twa_signal_ok);
+ ret = io_req_task_work_add(req);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -4977,8 +5202,10 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
/* make sure double remove sees this as being gone */
wait->private = NULL;
spin_unlock(&poll->head->lock);
- if (!done)
- __io_async_wake(req, poll, mask, io_poll_task_func);
+ if (!done) {
+ /* use wait func handler, so it matches the rq type */
+ poll->wait.func(&poll->wait, mode, sync, key);
+ }
}
refcount_dec(&req->refs);
return 1;
@@ -5246,7 +5473,8 @@ static bool io_poll_remove_one(struct io_kiocb *req)
/*
* Returns true if we found and killed one or more poll requests
*/
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
+ struct files_struct *files)
{
struct hlist_node *tmp;
struct io_kiocb *req;
@@ -5258,7 +5486,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node) {
- if (io_task_match(req, tsk))
+ if (io_match_task(req, tsk, files))
posted += io_poll_remove_one(req);
}
}
@@ -5296,7 +5524,7 @@ static int io_poll_remove_prep(struct io_kiocb *req,
sqe->poll_events)
return -EINVAL;
- req->poll.addr = READ_ONCE(sqe->addr);
+ req->poll_remove.addr = READ_ONCE(sqe->addr);
return 0;
}
@@ -5307,12 +5535,10 @@ static int io_poll_remove_prep(struct io_kiocb *req,
static int io_poll_remove(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- u64 addr;
int ret;
- addr = req->poll.addr;
spin_lock_irq(&ctx->completion_lock);
- ret = io_poll_cancel(ctx, addr);
+ ret = io_poll_cancel(ctx, req->poll_remove.addr);
spin_unlock_irq(&ctx->completion_lock);
if (ret < 0)
@@ -5405,15 +5631,37 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static int __io_timeout_cancel(struct io_kiocb *req)
+static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
+ __u64 user_data)
{
- struct io_timeout_data *io = req->async_data;
- int ret;
+ struct io_timeout_data *io;
+ struct io_kiocb *req;
+ int ret = -ENOENT;
+
+ list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
+ if (user_data == req->user_data) {
+ ret = 0;
+ break;
+ }
+ }
+ if (ret == -ENOENT)
+ return ERR_PTR(ret);
+
+ io = req->async_data;
ret = hrtimer_try_to_cancel(&io->timer);
if (ret == -1)
- return -EALREADY;
+ return ERR_PTR(-EALREADY);
list_del_init(&req->timeout.list);
+ return req;
+}
+
+static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+{
+ struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
req_set_fail_links(req);
io_cqring_fill_event(req, -ECANCELED);
@@ -5421,35 +5669,48 @@ static int __io_timeout_cancel(struct io_kiocb *req)
return 0;
}
-static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+ struct timespec64 *ts, enum hrtimer_mode mode)
{
- struct io_kiocb *req;
- int ret = -ENOENT;
-
- list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
- if (user_data == req->user_data) {
- ret = 0;
- break;
- }
- }
+ struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+ struct io_timeout_data *data;
- if (ret == -ENOENT)
- return ret;
+ if (IS_ERR(req))
+ return PTR_ERR(req);
- return __io_timeout_cancel(req);
+ req->timeout.off = 0; /* noseq */
+ data = req->async_data;
+ list_add_tail(&req->timeout.list, &ctx->timeout_list);
+ hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
+ data->timer.function = io_timeout_fn;
+ hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
+ return 0;
}
static int io_timeout_remove_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
+ struct io_timeout_rem *tr = &req->timeout_rem;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
+ if (sqe->ioprio || sqe->buf_index || sqe->len)
+ return -EINVAL;
+
+ tr->addr = READ_ONCE(sqe->addr);
+ tr->flags = READ_ONCE(sqe->timeout_flags);
+ if (tr->flags & IORING_TIMEOUT_UPDATE) {
+ if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
+ return -EINVAL;
+ if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
+ return -EFAULT;
+ } else if (tr->flags) {
+ /* timeout removal doesn't support flags */
return -EINVAL;
+ }
- req->timeout_rem.addr = READ_ONCE(sqe->addr);
return 0;
}
@@ -5458,11 +5719,19 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
*/
static int io_timeout_remove(struct io_kiocb *req)
{
+ struct io_timeout_rem *tr = &req->timeout_rem;
struct io_ring_ctx *ctx = req->ctx;
int ret;
spin_lock_irq(&ctx->completion_lock);
- ret = io_timeout_cancel(ctx, req->timeout_rem.addr);
+ if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) {
+ enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS)
+ ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
+
+ ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
+ } else {
+ ret = io_timeout_cancel(ctx, tr->addr);
+ }
io_cqring_fill_event(req, ret);
io_commit_cqring(ctx);
@@ -5742,6 +6011,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_remove_buffers_prep(req, sqe);
case IORING_OP_TEE:
return io_tee_prep(req, sqe);
+ case IORING_OP_SHUTDOWN:
+ return io_shutdown_prep(req, sqe);
+ case IORING_OP_RENAMEAT:
+ return io_renameat_prep(req, sqe);
+ case IORING_OP_UNLINKAT:
+ return io_unlinkat_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -5763,11 +6038,10 @@ static u32 io_get_sequence(struct io_kiocb *req)
{
struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
- u32 total_submitted, nr_reqs = 1;
+ u32 total_submitted, nr_reqs = 0;
- if (req->flags & REQ_F_LINK_HEAD)
- list_for_each_entry(pos, &req->link_list, link_list)
- nr_reqs++;
+ io_for_each_link(pos, req)
+ nr_reqs++;
total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
return total_submitted - nr_reqs;
@@ -5819,17 +6093,18 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static void io_req_drop_files(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_task *tctx = req->task->io_uring;
unsigned long flags;
+ put_files_struct(req->work.identity->files);
+ put_nsproxy(req->work.identity->nsproxy);
spin_lock_irqsave(&ctx->inflight_lock, flags);
list_del(&req->inflight_entry);
- if (waitqueue_active(&ctx->inflight_wait))
- wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
req->flags &= ~REQ_F_INFLIGHT;
- put_files_struct(req->work.identity->files);
- put_nsproxy(req->work.identity->nsproxy);
req->work.flags &= ~IO_WQ_WORK_FILES;
+ if (atomic_read(&tctx->in_idle))
+ wake_up(&tctx->wait);
}
static void __io_clean_op(struct io_kiocb *req)
@@ -5879,6 +6154,13 @@ static void __io_clean_op(struct io_kiocb *req)
if (req->open.filename)
putname(req->open.filename);
break;
+ case IORING_OP_RENAMEAT:
+ putname(req->rename.oldpath);
+ putname(req->rename.newpath);
+ break;
+ case IORING_OP_UNLINKAT:
+ putname(req->unlink.filename);
+ break;
}
req->flags &= ~REQ_F_NEED_CLEANUP;
}
@@ -5985,6 +6267,15 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
case IORING_OP_TEE:
ret = io_tee(req, force_nonblock);
break;
+ case IORING_OP_SHUTDOWN:
+ ret = io_shutdown(req, force_nonblock);
+ break;
+ case IORING_OP_RENAMEAT:
+ ret = io_renameat(req, force_nonblock);
+ break;
+ case IORING_OP_UNLINKAT:
+ ret = io_unlinkat(req, force_nonblock);
+ break;
default:
ret = -EINVAL;
break;
@@ -6001,7 +6292,7 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
if (in_async)
mutex_lock(&ctx->uring_lock);
- io_iopoll_req_issued(req);
+ io_iopoll_req_issued(req, in_async);
if (in_async)
mutex_unlock(&ctx->uring_lock);
@@ -6041,8 +6332,28 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
}
if (ret) {
+ struct io_ring_ctx *lock_ctx = NULL;
+
+ if (req->ctx->flags & IORING_SETUP_IOPOLL)
+ lock_ctx = req->ctx;
+
+ /*
+ * io_iopoll_complete() does not hold completion_lock to
+ * complete polled io, so for polled io we cannot call
+ * io_req_complete() directly; otherwise there may be concurrent
+ * access to the cqring, defer_list, etc., which is not safe. Since
+ * io_iopoll_complete() is always called under uring_lock, here we
+ * also take uring_lock to complete polled io.
+ */
+ if (lock_ctx)
+ mutex_lock(&lock_ctx->uring_lock);
+
req_set_fail_links(req);
io_req_complete(req, ret);
+
+ if (lock_ctx)
+ mutex_unlock(&lock_ctx->uring_lock);
}
return io_steal_work(req);
@@ -6068,10 +6379,7 @@ static struct file *io_file_get(struct io_submit_state *state,
return NULL;
fd = array_index_nospec(fd, ctx->nr_user_files);
file = io_file_from_index(ctx, fd);
- if (file) {
- req->fixed_file_refs = &ctx->file_data->node->refs;
- percpu_ref_get(req->fixed_file_refs);
- }
+ io_set_resource_node(req);
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
@@ -6080,45 +6388,26 @@ static struct file *io_file_get(struct io_submit_state *state,
return file;
}
-static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
- int fd)
-{
- bool fixed;
-
- fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
- if (unlikely(!fixed && io_async_submit(req->ctx)))
- return -EBADF;
-
- req->file = io_file_get(state, req, fd, fixed);
- if (req->file || io_op_defs[req->opcode].needs_file_no_error)
- return 0;
- return -EBADF;
-}
-
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
struct io_timeout_data, timer);
- struct io_kiocb *req = data->req;
+ struct io_kiocb *prev, *req = data->req;
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *prev = NULL;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
+ prev = req->timeout.head;
+ req->timeout.head = NULL;
/*
* We don't expect the list to be empty, that will only happen if we
* race with the completion of the linked work.
*/
- if (!list_empty(&req->link_list)) {
- prev = list_entry(req->link_list.prev, struct io_kiocb,
- link_list);
- if (refcount_inc_not_zero(&prev->refs))
- list_del_init(&req->link_list);
- else
- prev = NULL;
- }
-
+ if (prev && refcount_inc_not_zero(&prev->refs))
+ io_remove_next_linked(prev);
+ else
+ prev = NULL;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (prev) {
@@ -6134,10 +6423,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
static void __io_queue_linked_timeout(struct io_kiocb *req)
{
/*
- * If the list is now empty, then our linked request finished before
- * we got a chance to setup the timer
+ * If the back reference is NULL, then our linked request finished
+ * before we got a chance to setup the timer
*/
- if (!list_empty(&req->link_list)) {
+ if (req->timeout.head) {
struct io_timeout_data *data = req->async_data;
data->timer.function = io_link_timeout_fn;
@@ -6160,18 +6449,13 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
- struct io_kiocb *nxt;
+ struct io_kiocb *nxt = req->link;
- if (!(req->flags & REQ_F_LINK_HEAD))
- return NULL;
- if (req->flags & REQ_F_LINK_TIMEOUT)
- return NULL;
-
- nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
- link_list);
- if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
+ if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
+ nxt->opcode != IORING_OP_LINK_TIMEOUT)
return NULL;
+ nxt->timeout.head = req;
nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
req->flags |= REQ_F_LINK_TIMEOUT;
return nxt;
@@ -6180,7 +6464,6 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
{
struct io_kiocb *linked_timeout;
- struct io_kiocb *nxt;
const struct cred *old_creds = NULL;
int ret;
@@ -6206,7 +6489,6 @@ again:
*/
if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
if (!io_arm_poll_handler(req)) {
-punt:
/*
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
@@ -6216,33 +6498,25 @@ punt:
if (linked_timeout)
io_queue_linked_timeout(linked_timeout);
- goto exit;
- }
+ } else if (likely(!ret)) {
+ /* drop submission reference */
+ req = io_put_req_find_next(req);
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
- if (unlikely(ret)) {
+ if (req) {
+ if (!(req->flags & REQ_F_FORCE_ASYNC))
+ goto again;
+ io_queue_async_work(req);
+ }
+ } else {
/* un-prep timeout, so it'll be killed as any other linked */
req->flags &= ~REQ_F_LINK_TIMEOUT;
req_set_fail_links(req);
io_put_req(req);
io_req_complete(req, ret);
- goto exit;
}
- /* drop submission reference */
- nxt = io_put_req_find_next(req);
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
-
- if (nxt) {
- req = nxt;
-
- if (req->flags & REQ_F_FORCE_ASYNC) {
- linked_timeout = NULL;
- goto punt;
- }
- goto again;
- }
-exit:
if (old_creds)
revert_creds(old_creds);
}
@@ -6266,13 +6540,6 @@ fail_req:
if (unlikely(ret))
goto fail_req;
}
-
- /*
- * Never try inline submit of IOSQE_ASYNC is set, go straight
- * to async execution.
- */
- io_req_init_async(req);
- req->work.flags |= IO_WQ_WORK_CONCURRENT;
io_queue_async_work(req);
} else {
if (sqe) {
@@ -6294,8 +6561,13 @@ static inline void io_queue_link_head(struct io_kiocb *req,
io_queue_sqe(req, NULL, cs);
}
+struct io_submit_link {
+ struct io_kiocb *head;
+ struct io_kiocb *last;
+};
+
static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **link, struct io_comp_state *cs)
+ struct io_submit_link *link, struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
@@ -6307,8 +6579,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
* submitted sync once the chain is complete. If none of those
* conditions are true (normal request), then just queue it.
*/
- if (*link) {
- struct io_kiocb *head = *link;
+ if (link->head) {
+ struct io_kiocb *head = link->head;
/*
* Taking sequential execution of a link, draining both sides
@@ -6328,12 +6600,13 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return ret;
}
trace_io_uring_link(ctx, req, head);
- list_add_tail(&req->link_list, &head->link_list);
+ link->last->link = req;
+ link->last = req;
/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
io_queue_link_head(head, cs);
- *link = NULL;
+ link->head = NULL;
}
} else {
if (unlikely(ctx->drain_next)) {
@@ -6341,13 +6614,11 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ctx->drain_next = 0;
}
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- req->flags |= REQ_F_LINK_HEAD;
- INIT_LIST_HEAD(&req->link_list);
-
ret = io_req_defer_prep(req, sqe);
if (unlikely(ret))
req->flags |= REQ_F_FAIL_LINK;
- *link = req;
+ link->head = req;
+ link->last = req;
} else {
io_queue_sqe(req, sqe, cs);
}
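
The new struct io_submit_link tracks a chain with plain head/last pointers instead of a list_head: link->last->link = req appends in O(1) while head stays fixed until the chain is queued. A minimal standalone illustration of that bookkeeping, plain C rather than the kernel code:

/* Hedged illustration only; plain C, not the kernel implementation. */
#include <stddef.h>

struct req {
	struct req *link;
};

struct submit_link {
	struct req *head;
	struct req *last;
};

static void link_append(struct submit_link *l, struct req *r)
{
	r->link = NULL;
	if (!l->head)
		l->head = r;		/* first request opens the chain */
	else
		l->last->link = r;	/* append at the tail in O(1) */
	l->last = r;
}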
@@ -6363,7 +6634,8 @@ static void io_submit_state_end(struct io_submit_state *state)
{
if (!list_empty(&state->comp.list))
io_submit_flush_completions(&state->comp);
- blk_finish_plug(&state->plug);
+ if (state->plug_started)
+ blk_finish_plug(&state->plug);
io_state_file_put(state);
if (state->free_reqs)
kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
@@ -6375,12 +6647,12 @@ static void io_submit_state_end(struct io_submit_state *state)
static void io_submit_state_start(struct io_submit_state *state,
struct io_ring_ctx *ctx, unsigned int max_ios)
{
- blk_start_plug(&state->plug);
+ state->plug_started = false;
state->comp.nr = 0;
INIT_LIST_HEAD(&state->comp.list);
state->comp.ctx = ctx;
state->free_reqs = 0;
- state->file = NULL;
+ state->file_refs = 0;
state->ios_left = max_ios;
}
@@ -6475,6 +6747,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
+ req->link = NULL;
+ req->fixed_file_refs = NULL;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
req->task = current;
@@ -6483,7 +6757,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
- if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
+ if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
return -EFAULT;
sqe_flags = READ_ONCE(sqe->flags);
@@ -6516,10 +6790,26 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags |= sqe_flags;
- if (!io_op_defs[req->opcode].needs_file)
- return 0;
+ /*
+ * Plug now if we have more than 1 IO left after this, and the target
+ * is potentially a read/write to block-based storage.
+ */
+ if (!state->plug_started && state->ios_left > 1 &&
+ io_op_defs[req->opcode].plug) {
+ blk_start_plug(&state->plug);
+ state->plug_started = true;
+ }
+
+ ret = 0;
+ if (io_op_defs[req->opcode].needs_file) {
+ bool fixed = req->flags & REQ_F_FIXED_FILE;
+
+ req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
+ if (unlikely(!req->file &&
+ !io_op_defs[req->opcode].needs_file_no_error))
+ ret = -EBADF;
+ }
- ret = io_req_set_file(state, req, READ_ONCE(sqe->fd));
state->ios_left--;
return ret;
}
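
With io_req_set_file() folded in, io_init_req() now resolves the file itself: for REQ_F_FIXED_FILE requests, sqe->fd is an index into the registered file table rather than a process fd. A hedged userspace sketch of that convention, assuming liburing; error handling trimmed to the essentials and not part of the patch:

/*
 * Hedged sketch: read through a registered (fixed) file slot.
 * Assumes liburing and an initialized ring.
 */
#include <liburing.h>
#include <fcntl.h>
#include <unistd.h>

static int read_fixed(struct io_uring *ring, const char *path, void *buf, unsigned len)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fd, ret = -1;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	/* register a one-entry fixed file table; slot 0 now refers to fd */
	if (io_uring_register_files(ring, &fd, 1) < 0) {
		close(fd);
		return -1;
	}

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read(sqe, 0 /* slot index, not an fd */, buf, len, 0);
	sqe->flags |= IOSQE_FIXED_FILE;

	io_uring_submit(ring);
	if (io_uring_wait_cqe(ring, &cqe) == 0) {
		ret = cqe->res;
		io_uring_cqe_seen(ring, cqe);
	}

	io_uring_unregister_files(ring);
	close(fd);
	return ret;
}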
@@ -6527,13 +6817,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
struct io_submit_state state;
- struct io_kiocb *link = NULL;
+ struct io_submit_link link;
int i, submitted = 0;
/* if we have a backlog and couldn't flush it all, return BUSY */
if (test_bit(0, &ctx->sq_check_overflow)) {
- if (!list_empty(&ctx->cq_overflow_list) &&
- !io_cqring_overflow_flush(ctx, false, NULL, NULL))
+ if (!io_cqring_overflow_flush(ctx, false, NULL, NULL))
return -EBUSY;
}
@@ -6547,6 +6836,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
refcount_add(nr, &current->usage);
io_submit_state_start(&state, ctx, nr);
+ link.head = NULL;
for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe;
@@ -6592,8 +6882,8 @@ fail_req:
percpu_counter_sub(&tctx->inflight, unused);
put_task_struct_many(current, unused);
}
- if (link)
- io_queue_link_head(link, &state.comp);
+ if (link.head)
+ io_queue_link_head(link.head, &state.comp);
io_submit_state_end(&state);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
@@ -6617,111 +6907,45 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->completion_lock);
}
-static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode,
- int sync, void *key)
+static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
- struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry);
- int ret;
-
- ret = autoremove_wake_function(wqe, mode, sync, key);
- if (ret) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- }
- return ret;
-}
-
-enum sq_ret {
- SQT_IDLE = 1,
- SQT_SPIN = 2,
- SQT_DID_WORK = 4,
-};
-
-static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
- unsigned long start_jiffies, bool cap_entries)
-{
- unsigned long timeout = start_jiffies + ctx->sq_thread_idle;
- struct io_sq_data *sqd = ctx->sq_data;
unsigned int to_submit;
int ret = 0;
-again:
- if (!list_empty(&ctx->iopoll_list)) {
+ to_submit = io_sqring_entries(ctx);
+ /* if we're handling multiple rings, cap submit size for fairness */
+ if (cap_entries && to_submit > 8)
+ to_submit = 8;
+
+ if (!list_empty(&ctx->iopoll_list) || to_submit) {
unsigned nr_events = 0;
mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->iopoll_list) && !need_resched())
+ if (!list_empty(&ctx->iopoll_list))
io_do_iopoll(ctx, &nr_events, 0);
+
+ if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
+ ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
}
- to_submit = io_sqring_entries(ctx);
-
- /*
- * If submit got -EBUSY, flag us as needing the application
- * to enter the kernel to reap and flush events.
- */
- if (!to_submit || ret == -EBUSY || need_resched()) {
- /*
- * Drop cur_mm before scheduling, we can't hold it for
- * long periods (or over schedule()). Do this before
- * adding ourselves to the waitqueue, as the unuse/drop
- * may sleep.
- */
- io_sq_thread_drop_mm();
-
- /*
- * We're polling. If we're within the defined idle
- * period, then let us spin without work before going
- * to sleep. The exception is if we got EBUSY doing
- * more IO, we should wait for the application to
- * reap events and wake us up.
- */
- if (!list_empty(&ctx->iopoll_list) || need_resched() ||
- (!time_after(jiffies, timeout) && ret != -EBUSY &&
- !percpu_ref_is_dying(&ctx->refs)))
- return SQT_SPIN;
+ if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
+ wake_up(&ctx->sqo_sq_wait);
- prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry,
- TASK_INTERRUPTIBLE);
+ return ret;
+}
- /*
- * While doing polled IO, before going to sleep, we need
- * to check if there are new reqs added to iopoll_list,
- * it is because reqs may have been punted to io worker
- * and will be added to iopoll_list later, hence check
- * the iopoll_list again.
- */
- if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
- finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
- goto again;
- }
+static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+{
+ struct io_ring_ctx *ctx;
+ unsigned sq_thread_idle = 0;
- to_submit = io_sqring_entries(ctx);
- if (!to_submit || ret == -EBUSY)
- return SQT_IDLE;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if (sq_thread_idle < ctx->sq_thread_idle)
+ sq_thread_idle = ctx->sq_thread_idle;
}
- finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
- io_ring_clear_wakeup_flag(ctx);
-
- /* if we're handling multiple rings, cap submit size for fairness */
- if (cap_entries && to_submit > 8)
- to_submit = 8;
-
- mutex_lock(&ctx->uring_lock);
- if (likely(!percpu_ref_is_dying(&ctx->refs)))
- ret = io_submit_sqes(ctx, to_submit);
- mutex_unlock(&ctx->uring_lock);
-
- if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
- wake_up(&ctx->sqo_sq_wait);
-
- return SQT_DID_WORK;
+ sqd->sq_thread_idle = sq_thread_idle;
}
static void io_sqd_init_new(struct io_sq_data *sqd)
@@ -6730,39 +6954,56 @@ static void io_sqd_init_new(struct io_sq_data *sqd)
while (!list_empty(&sqd->ctx_new_list)) {
ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
- init_wait(&ctx->sqo_wait_entry);
- ctx->sqo_wait_entry.func = io_sq_wake_function;
list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
complete(&ctx->sq_thread_comp);
}
+
+ io_sqd_update_thread_idle(sqd);
}
static int io_sq_thread(void *data)
{
struct cgroup_subsys_state *cur_css = NULL;
+ struct files_struct *old_files = current->files;
+ struct nsproxy *old_nsproxy = current->nsproxy;
const struct cred *old_cred = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
- unsigned long start_jiffies;
+ unsigned long timeout = 0;
+ DEFINE_WAIT(wait);
+
+ task_lock(current);
+ current->files = NULL;
+ current->nsproxy = NULL;
+ task_unlock(current);
- start_jiffies = jiffies;
while (!kthread_should_stop()) {
- enum sq_ret ret = 0;
- bool cap_entries;
+ int ret;
+ bool cap_entries, sqt_spin, needs_sched;
/*
* Any changes to the sqd lists are synchronized through the
* kthread parking. This synchronizes the thread vs users,
* the users are synchronized on the sqd->ctx_lock.
*/
- if (kthread_should_park())
+ if (kthread_should_park()) {
kthread_parkme();
+ /*
+ * When the sq thread is unparked, the previous park may have come
+ * from io_put_sq_data(), which means the thread is about to be
+ * stopped, so check for that here.
+ */
+ if (kthread_should_stop())
+ break;
+ }
- if (unlikely(!list_empty(&sqd->ctx_new_list)))
+ if (unlikely(!list_empty(&sqd->ctx_new_list))) {
io_sqd_init_new(sqd);
+ timeout = jiffies + sqd->sq_thread_idle;
+ }
+ sqt_spin = false;
cap_entries = !list_is_singular(&sqd->ctx_list);
-
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
if (current->cred != ctx->creds) {
if (old_cred)
@@ -6775,24 +7016,49 @@ static int io_sq_thread(void *data)
current->sessionid = ctx->sessionid;
#endif
- ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);
+ ret = __io_sq_thread(ctx, cap_entries);
+ if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+ sqt_spin = true;
- io_sq_thread_drop_mm();
+ io_sq_thread_drop_mm_files();
}
- if (ret & SQT_SPIN) {
+ if (sqt_spin || !time_after(jiffies, timeout)) {
io_run_task_work();
cond_resched();
- } else if (ret == SQT_IDLE) {
- if (kthread_should_park())
- continue;
+ if (sqt_spin)
+ timeout = jiffies + sqd->sq_thread_idle;
+ continue;
+ }
+
+ if (kthread_should_park())
+ continue;
+
+ needs_sched = true;
+ prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+ !list_empty_careful(&ctx->iopoll_list)) {
+ needs_sched = false;
+ break;
+ }
+ if (io_sqring_entries(ctx)) {
+ needs_sched = false;
+ break;
+ }
+ }
+
+ if (needs_sched) {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
+
schedule();
- start_jiffies = jiffies;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_clear_wakeup_flag(ctx);
}
+
+ finish_wait(&sqd->wait, &wait);
+ timeout = jiffies + sqd->sq_thread_idle;
}
io_run_task_work();
@@ -6802,6 +7068,11 @@ static int io_sq_thread(void *data)
if (old_cred)
revert_creds(old_cred);
+ task_lock(current);
+ current->files = old_files;
+ current->nsproxy = old_nsproxy;
+ task_unlock(current);
+
kthread_parkme();
return 0;
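
The reworked io_sq_thread() sleeps once every attached ring has been idle for sqd->sq_thread_idle (the maximum of the per-ring values, see io_sqd_update_thread_idle()) and sets IORING_SQ_NEED_WAKEUP before scheduling out. A hedged sketch of the userspace side of that handshake; liburing's io_uring_submit() performs the same check internally, it is spelled out here only for illustration and is not part of the patch:

/*
 * Hedged sketch of the userspace side of SQPOLL. Assumes liburing;
 * sq_thread_idle is given in milliseconds.
 */
#include <liburing.h>
#include <sys/syscall.h>
#include <unistd.h>

static int setup_sqpoll_ring(struct io_uring *ring)
{
	struct io_uring_params p = { 0 };

	p.flags = IORING_SETUP_SQPOLL;
	p.sq_thread_idle = 2000;	/* sq thread dozes off after ~2s of no work */
	return io_uring_queue_init_params(256, ring, &p);
}

static void kick_sq_thread_if_needed(struct io_uring *ring)
{
	/*
	 * io_uring_submit() does this check itself; shown explicitly here.
	 * A real implementation should use an acquire load on kflags.
	 */
	if (*ring->sq.kflags & IORING_SQ_NEED_WAKEUP)
		syscall(__NR_io_uring_enter, ring->ring_fd, 0, 0,
			IORING_ENTER_SQ_WAKEUP, NULL, 0);
}

Because the shared idle value is the maximum over all attached rings, a ring that shares an sq thread never sees it go to sleep earlier than its own configured idle time.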
@@ -6846,13 +7117,8 @@ static int io_run_task_work_sig(void)
return 1;
if (!signal_pending(current))
return 0;
- if (current->jobctl & JOBCTL_TASK_WORK) {
- spin_lock_irq(&current->sighand->siglock);
- current->jobctl &= ~JOBCTL_TASK_WORK;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 1;
- }
+ if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
+ return -ERESTARTSYS;
return -EINTR;
}
@@ -6861,7 +7127,8 @@ static int io_run_task_work_sig(void)
* application must reap them itself, as they reside on the shared cq ring.
*/
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
- const sigset_t __user *sig, size_t sigsz)
+ const sigset_t __user *sig, size_t sigsz,
+ struct __kernel_timespec __user *uts)
{
struct io_wait_queue iowq = {
.wq = {
@@ -6873,6 +7140,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
.to_wait = min_events,
};
struct io_rings *rings = ctx->rings;
+ struct timespec64 ts;
+ signed long timeout = 0;
int ret = 0;
do {
@@ -6895,6 +7164,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
+ if (uts) {
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+ timeout = timespec64_to_jiffies(&ts);
+ }
+
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
trace_io_uring_cqring_wait(ctx, min_events);
do {
@@ -6908,7 +7183,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
break;
if (io_should_wake(&iowq, false))
break;
- schedule();
+ if (uts) {
+ timeout = schedule_timeout(timeout);
+ if (timeout == 0) {
+ ret = -ETIME;
+ break;
+ }
+ } else {
+ schedule();
+ }
} while (1);
finish_wait(&ctx->wait, &iowq.wq);
@@ -6957,11 +7240,9 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
if (!data)
return -ENXIO;
- spin_lock(&data->lock);
- if (!list_empty(&data->ref_list))
- ref_node = list_first_entry(&data->ref_list,
- struct fixed_file_ref_node, node);
- spin_unlock(&data->lock);
+ spin_lock_bh(&data->lock);
+ ref_node = data->node;
+ spin_unlock_bh(&data->lock);
if (ref_node)
percpu_ref_kill(&ref_node->refs);
@@ -7084,12 +7365,11 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
mutex_lock(&sqd->ctx_lock);
list_del(&ctx->sqd_list);
+ io_sqd_update_thread_idle(sqd);
mutex_unlock(&sqd->ctx_lock);
- if (sqd->thread) {
- finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
+ if (sqd->thread)
io_sq_thread_unpark(sqd);
- }
io_put_sq_data(sqd);
ctx->sq_data = NULL;
@@ -7309,10 +7589,6 @@ static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
kfree(pfile);
}
- spin_lock(&file_data->lock);
- list_del(&ref_node->node);
- spin_unlock(&file_data->lock);
-
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
percpu_ref_put(&file_data->refs);
@@ -7339,17 +7615,32 @@ static void io_file_put_work(struct work_struct *work)
static void io_file_data_ref_zero(struct percpu_ref *ref)
{
struct fixed_file_ref_node *ref_node;
+ struct fixed_file_data *data;
struct io_ring_ctx *ctx;
- bool first_add;
+ bool first_add = false;
int delay = HZ;
ref_node = container_of(ref, struct fixed_file_ref_node, refs);
- ctx = ref_node->file_data->ctx;
+ data = ref_node->file_data;
+ ctx = data->ctx;
+
+ spin_lock_bh(&data->lock);
+ ref_node->done = true;
+
+ while (!list_empty(&data->ref_list)) {
+ ref_node = list_first_entry(&data->ref_list,
+ struct fixed_file_ref_node, node);
+ /* recycle ref nodes in order */
+ if (!ref_node->done)
+ break;
+ list_del(&ref_node->node);
+ first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
+ }
+ spin_unlock_bh(&data->lock);
- if (percpu_ref_is_dying(&ctx->file_data->refs))
+ if (percpu_ref_is_dying(&data->refs))
delay = 0;
- first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);
if (!delay)
mod_delayed_work(system_wq, &ctx->file_put_work, 0);
else if (first_add)
@@ -7373,6 +7664,7 @@ static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
INIT_LIST_HEAD(&ref_node->node);
INIT_LIST_HEAD(&ref_node->file_list);
ref_node->file_data = ctx->file_data;
+ ref_node->done = false;
return ref_node;
}
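
The new done flag lets io_file_data_ref_zero() retire ref nodes strictly in the order they were switched in, even when their refcounts hit zero out of order. A hedged standalone illustration of that pattern in plain C, with a simplified singly linked list instead of the kernel list/llist machinery:

/* Hedged illustration of in-order ref node retirement; not the kernel code. */
#include <stdbool.h>

struct ref_node {
	struct ref_node *next;
	bool done;	/* refcount hit zero, but older nodes may still be live */
};

static void ref_node_zero(struct ref_node **head, struct ref_node *node,
			  void (*retire)(struct ref_node *))
{
	node->done = true;
	/*
	 * Only retire from the front of the list, and only while the front
	 * node is done: nodes are recycled in the order they were installed,
	 * never ahead of an older node that is still referenced.
	 */
	while (*head && (*head)->done) {
		struct ref_node *first = *head;

		*head = first->next;
		retire(first);
	}
}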
@@ -7467,9 +7759,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
}
file_data->node = ref_node;
- spin_lock(&file_data->lock);
- list_add(&ref_node->node, &file_data->ref_list);
- spin_unlock(&file_data->lock);
+ spin_lock_bh(&file_data->lock);
+ list_add_tail(&ref_node->node, &file_data->ref_list);
+ spin_unlock_bh(&file_data->lock);
percpu_ref_get(&file_data->refs);
return ret;
out_fput:
@@ -7626,10 +7918,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (needs_switch) {
percpu_ref_kill(&data->node->refs);
- spin_lock(&data->lock);
- list_add(&ref_node->node, &data->ref_list);
+ spin_lock_bh(&data->lock);
+ list_add_tail(&ref_node->node, &data->ref_list);
data->node = ref_node;
- spin_unlock(&data->lock);
+ spin_unlock_bh(&data->lock);
percpu_ref_get(&ctx->file_data->refs);
} else
destroy_fixed_file_ref_node(ref_node);
@@ -7727,7 +8019,8 @@ static int io_uring_alloc_task_context(struct task_struct *task)
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
tctx->last = NULL;
- tctx->in_idle = 0;
+ atomic_set(&tctx->in_idle, 0);
+ tctx->sqpoll = false;
io_init_identity(&tctx->__identity);
tctx->identity = &tctx->__identity;
task->io_uring = tctx;
@@ -7756,7 +8049,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_sq_data *sqd;
ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
goto err;
sqd = io_get_sq_data(p);
@@ -7859,10 +8152,13 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
__io_unaccount_mem(ctx->user, nr_pages);
if (ctx->mm_account) {
- if (acct == ACCT_LOCKED)
+ if (acct == ACCT_LOCKED) {
+ mmap_write_lock(ctx->mm_account);
ctx->mm_account->locked_vm -= nr_pages;
- else if (acct == ACCT_PINNED)
+ mmap_write_unlock(ctx->mm_account);
+ } else if (acct == ACCT_PINNED) {
atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
+ }
}
}
@@ -7878,10 +8174,13 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
}
if (ctx->mm_account) {
- if (acct == ACCT_LOCKED)
+ if (acct == ACCT_LOCKED) {
+ mmap_write_lock(ctx->mm_account);
ctx->mm_account->locked_vm += nr_pages;
- else if (acct == ACCT_PINNED)
+ mmap_write_unlock(ctx->mm_account);
+ } else if (acct == ACCT_PINNED) {
atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
+ }
}
return 0;
@@ -8342,28 +8641,35 @@ static void io_ring_exit_work(struct work_struct *work)
* as nobody else will be looking for them.
*/
do {
- if (ctx->rings)
- io_cqring_overflow_flush(ctx, true, NULL, NULL);
io_iopoll_try_reap_events(ctx);
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
io_ring_ctx_free(ctx);
}
+static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+ return req->ctx == data;
+}
+
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
percpu_ref_kill(&ctx->refs);
+ /* if force is set, the ring is going away. always drop after that */
+ ctx->cq_overflow_flushed = 1;
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true, NULL, NULL);
mutex_unlock(&ctx->uring_lock);
- io_kill_timeouts(ctx, NULL);
- io_poll_remove_all(ctx, NULL);
+ io_kill_timeouts(ctx, NULL, NULL);
+ io_poll_remove_all(ctx, NULL, NULL);
if (ctx->io_wq)
- io_wq_cancel_all(ctx->io_wq);
+ io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true);
/* if we failed setting up the ctx, we might not have any rings */
- if (ctx->rings)
- io_cqring_overflow_flush(ctx, true, NULL, NULL);
io_iopoll_try_reap_events(ctx);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
@@ -8394,123 +8700,33 @@ static int io_uring_release(struct inode *inode, struct file *file)
return 0;
}
-static bool io_wq_files_match(struct io_wq_work *work, void *data)
-{
- struct files_struct *files = data;
-
- return !files || ((work->flags & IO_WQ_WORK_FILES) &&
- work->identity->files == files);
-}
-
-/*
- * Returns true if 'preq' is the link parent of 'req'
- */
-static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
-{
- struct io_kiocb *link;
-
- if (!(preq->flags & REQ_F_LINK_HEAD))
- return false;
-
- list_for_each_entry(link, &preq->link_list, link_list) {
- if (link == req)
- return true;
- }
-
- return false;
-}
-
-static bool io_match_link_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- struct io_kiocb *link;
-
- if (io_match_files(req, files))
- return true;
- if (req->flags & REQ_F_LINK_HEAD) {
- list_for_each_entry(link, &req->link_list, link_list) {
- if (io_match_files(link, files))
- return true;
- }
- }
- return false;
-}
+struct io_task_cancel {
+ struct task_struct *task;
+ struct files_struct *files;
+};
-/*
- * We're looking to cancel 'req' because it's holding on to our files, but
- * 'req' could be a link to another request. See if it is, and cancel that
- * parent request if so.
- */
-static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
+static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{
- struct hlist_node *tmp;
- struct io_kiocb *preq;
- bool found = false;
- int i;
-
- spin_lock_irq(&ctx->completion_lock);
- for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[i];
- hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
- found = io_match_link(preq, req);
- if (found) {
- io_poll_remove_one(preq);
- break;
- }
- }
- }
- spin_unlock_irq(&ctx->completion_lock);
- return found;
-}
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_task_cancel *cancel = data;
+ bool ret;
-static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- struct io_kiocb *preq;
- bool found = false;
+ if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
+ unsigned long flags;
+ struct io_ring_ctx *ctx = req->ctx;
- spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry(preq, &ctx->timeout_list, timeout.list) {
- found = io_match_link(preq, req);
- if (found) {
- __io_timeout_cancel(preq);
- break;
- }
+ /* protect against races with linked timeouts */
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ ret = io_match_task(req, cancel->task, cancel->files);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ ret = io_match_task(req, cancel->task, cancel->files);
}
- spin_unlock_irq(&ctx->completion_lock);
- return found;
-}
-
-static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
-{
- return io_match_link(container_of(work, struct io_kiocb, work), data);
-}
-
-static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
-{
- enum io_wq_cancel cret;
-
- /* cancel this particular work, if it's running */
- cret = io_wq_cancel_work(ctx->io_wq, &req->work);
- if (cret != IO_WQ_CANCEL_NOTFOUND)
- return;
-
- /* find links that hold this pending, cancel those */
- cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
- if (cret != IO_WQ_CANCEL_NOTFOUND)
- return;
-
- /* if we have a poll link holding this pending, cancel that */
- if (io_poll_remove_link(ctx, req))
- return;
-
- /* final option, timeout link is holding this req pending */
- io_timeout_remove_link(ctx, req);
+ return ret;
}
static void io_cancel_defer_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
struct files_struct *files)
{
struct io_defer_entry *de = NULL;
@@ -8518,7 +8734,7 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
- if (io_match_link_files(de->req, files)) {
+ if (io_match_task(de->req, task, files)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
@@ -8535,73 +8751,52 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
}
}
-/*
- * Returns true if we found and killed one or more files pinning requests
- */
-static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
+static void io_uring_cancel_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
struct files_struct *files)
{
- if (list_empty_careful(&ctx->inflight_list))
- return false;
-
- io_cancel_defer_files(ctx, files);
- /* cancel all at once, should be faster than doing it one by one*/
- io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
-
while (!list_empty_careful(&ctx->inflight_list)) {
- struct io_kiocb *cancel_req = NULL, *req;
+ struct io_task_cancel cancel = { .task = task, .files = files };
+ struct io_kiocb *req;
DEFINE_WAIT(wait);
+ bool found = false;
spin_lock_irq(&ctx->inflight_lock);
list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
- if (files && (req->work.flags & IO_WQ_WORK_FILES) &&
+ if (req->task != task ||
req->work.identity->files != files)
continue;
- /* req is being completed, ignore */
- if (!refcount_inc_not_zero(&req->refs))
- continue;
- cancel_req = req;
+ found = true;
break;
}
- if (cancel_req)
- prepare_to_wait(&ctx->inflight_wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ if (found)
+ prepare_to_wait(&task->io_uring->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&ctx->inflight_lock);
/* We need to keep going until we don't find a matching req */
- if (!cancel_req)
+ if (!found)
break;
- /* cancel this request, or head link requests */
- io_attempt_cancel(ctx, cancel_req);
- io_put_req(cancel_req);
+
+ io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
+ io_poll_remove_all(ctx, task, files);
+ io_kill_timeouts(ctx, task, files);
/* cancellations _may_ trigger task work */
io_run_task_work();
schedule();
- finish_wait(&ctx->inflight_wait, &wait);
+ finish_wait(&task->io_uring->wait, &wait);
}
-
- return true;
-}
-
-static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct task_struct *task = data;
-
- return io_task_match(req, task);
}
-static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- struct files_struct *files)
+static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task)
{
- bool ret;
-
- ret = io_uring_cancel_files(ctx, files);
- if (!files) {
+ while (1) {
+ struct io_task_cancel cancel = { .task = task, .files = NULL, };
enum io_wq_cancel cret;
+ bool ret = false;
- cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
+ cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
if (cret != IO_WQ_CANCEL_NOTFOUND)
ret = true;
@@ -8613,11 +8808,13 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
}
}
- ret |= io_poll_remove_all(ctx, task);
- ret |= io_kill_timeouts(ctx, task);
+ ret |= io_poll_remove_all(ctx, task, NULL);
+ ret |= io_kill_timeouts(ctx, task, NULL);
+ ret |= io_run_task_work();
+ if (!ret)
+ break;
+ cond_resched();
}
-
- return ret;
}
/*
@@ -8630,27 +8827,43 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
{
struct task_struct *task = current;
- if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data)
+ if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
task = ctx->sq_data->thread;
+ atomic_inc(&task->io_uring->in_idle);
+ io_sq_thread_park(ctx->sq_data);
+ }
+ io_cancel_defer_files(ctx, task, files);
+ io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
io_cqring_overflow_flush(ctx, true, task, files);
+ io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
- while (__io_uring_cancel_task_requests(ctx, task, files)) {
- io_run_task_work();
- cond_resched();
+ if (!files)
+ __io_uring_cancel_task_requests(ctx, task);
+ else
+ io_uring_cancel_files(ctx, task, files);
+
+ if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
+ atomic_dec(&task->io_uring->in_idle);
+ /*
+ * If the files that are going away are the ones in the thread
+ * identity, clear them out.
+ */
+ if (task->io_uring->identity->files == files)
+ task->io_uring->identity->files = NULL;
+ io_sq_thread_unpark(ctx->sq_data);
}
}
/*
* Note that this task has used io_uring. We use it for cancelation purposes.
*/
-static int io_uring_add_task_file(struct file *file)
+static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
{
struct io_uring_task *tctx = current->io_uring;
+ int ret;
if (unlikely(!tctx)) {
- int ret;
-
ret = io_uring_alloc_task_context(current);
if (unlikely(ret))
return ret;
@@ -8661,11 +8874,24 @@ static int io_uring_add_task_file(struct file *file)
if (!old) {
get_file(file);
- xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL);
+ ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
+ file, GFP_KERNEL));
+ if (ret) {
+ fput(file);
+ return ret;
+ }
}
tctx->last = file;
}
+ /*
+ * This is race safe in that the task itself is doing this, hence it
+ * cannot be going through the exit/cancel paths at the same time.
+ * This cannot be modified while exit/cancel is running.
+ */
+ if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
+ tctx->sqpoll = true;
+
return 0;
}
@@ -8707,7 +8933,7 @@ void __io_uring_files_cancel(struct files_struct *files)
unsigned long index;
/* make sure overflow events are dropped */
- tctx->in_idle = true;
+ atomic_inc(&tctx->in_idle);
xa_for_each(&tctx->xa, index, file) {
struct io_ring_ctx *ctx = file->private_data;
@@ -8716,6 +8942,35 @@ void __io_uring_files_cancel(struct files_struct *files)
if (files)
io_uring_del_task_file(file);
}
+
+ atomic_dec(&tctx->in_idle);
+}
+
+static s64 tctx_inflight(struct io_uring_task *tctx)
+{
+ unsigned long index;
+ struct file *file;
+ s64 inflight;
+
+ inflight = percpu_counter_sum(&tctx->inflight);
+ if (!tctx->sqpoll)
+ return inflight;
+
+ /*
+ * If we have SQPOLL rings, then we need to iterate and find them, and
+ * add the pending count for those.
+ */
+ xa_for_each(&tctx->xa, index, file) {
+ struct io_ring_ctx *ctx = file->private_data;
+
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ struct io_uring_task *__tctx = ctx->sqo_task->io_uring;
+
+ inflight += percpu_counter_sum(&__tctx->inflight);
+ }
+ }
+
+ return inflight;
}
/*
@@ -8729,11 +8984,11 @@ void __io_uring_task_cancel(void)
s64 inflight;
/* make sure overflow events are dropped */
- tctx->in_idle = true;
+ atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
- inflight = percpu_counter_sum(&tctx->inflight);
+ inflight = tctx_inflight(tctx);
if (!inflight)
break;
__io_uring_files_cancel(NULL);
@@ -8744,13 +8999,13 @@ void __io_uring_task_cancel(void)
* If we've seen completions, retry. This avoids a race where
* a completion comes in before we did prepare_to_wait().
*/
- if (inflight != percpu_counter_sum(&tctx->inflight))
+ if (inflight != tctx_inflight(tctx))
continue;
schedule();
+ finish_wait(&tctx->wait, &wait);
} while (1);
- finish_wait(&tctx->wait, &wait);
- tctx->in_idle = false;
+ atomic_dec(&tctx->in_idle);
}
static int io_uring_flush(struct file *file, void *data)
@@ -8848,9 +9103,39 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
finish_wait(&ctx->sqo_sq_wait, &wait);
}
+static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
+ struct __kernel_timespec __user **ts,
+ const sigset_t __user **sig)
+{
+ struct io_uring_getevents_arg arg;
+
+ /*
+ * If EXT_ARG isn't set, then we have no timespec and the argp pointer
+ * is just a pointer to the sigset_t.
+ */
+ if (!(flags & IORING_ENTER_EXT_ARG)) {
+ *sig = (const sigset_t __user *) argp;
+ *ts = NULL;
+ return 0;
+ }
+
+ /*
+ * EXT_ARG is set - ensure we agree on the size of it and copy in our
+ * timespec and sigset_t pointers if it checks out.
+ */
+ if (*argsz != sizeof(arg))
+ return -EINVAL;
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+ *sig = u64_to_user_ptr(arg.sigmask);
+ *argsz = arg.sigmask_sz;
+ *ts = u64_to_user_ptr(arg.ts);
+ return 0;
+}
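
io_get_ext_arg() is the kernel side of IORING_ENTER_EXT_ARG: instead of a bare sigset_t pointer, userspace hands in a struct io_uring_getevents_arg carrying the sigmask, its size, and an optional timeout, with argsz set to sizeof(arg). A hedged raw-syscall sketch of that calling convention, assuming the uapi struct and flag added alongside this change; not part of the patch:

/*
 * Hedged sketch of the IORING_ENTER_EXT_ARG calling convention parsed by
 * io_get_ext_arg() above.
 */
#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>

static int wait_cqes_timeout(int ring_fd, unsigned min_complete,
			     struct __kernel_timespec *ts)
{
	struct io_uring_getevents_arg arg;

	memset(&arg, 0, sizeof(arg));
	arg.sigmask = 0;	/* no signal mask for this wait */
	arg.sigmask_sz = 0;	/* only checked when sigmask is non-NULL */
	arg.ts = (unsigned long long)(uintptr_t)ts;

	/* argp points at arg, and argsz must be sizeof(arg), not a sigset size */
	return syscall(__NR_io_uring_enter, ring_fd, 0, min_complete,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}

With this convention the sigmask and timeout travel together, which is why io_cqring_wait() above switches to schedule_timeout() and returns -ETIME when the wait expires.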
+
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
- u32, min_complete, u32, flags, const sigset_t __user *, sig,
- size_t, sigsz)
+ u32, min_complete, u32, flags, const void __user *, argp,
+ size_t, argsz)
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
@@ -8860,7 +9145,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_run_task_work();
if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT))
+ IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
return -EINVAL;
f = fdget(fd);
@@ -8887,15 +9172,20 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
*/
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (!list_empty_careful(&ctx->cq_overflow_list))
+ if (!list_empty_careful(&ctx->cq_overflow_list)) {
+ bool needs_lock = ctx->flags & IORING_SETUP_IOPOLL;
+
+ io_ring_submit_lock(ctx, needs_lock);
io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ io_ring_submit_unlock(ctx, needs_lock);
+ }
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sq_data->wait);
if (flags & IORING_ENTER_SQ_WAIT)
io_sqpoll_wait_sq(ctx);
submitted = to_submit;
} else if (to_submit) {
- ret = io_uring_add_task_file(f.file);
+ ret = io_uring_add_task_file(ctx, f.file);
if (unlikely(ret))
goto out;
mutex_lock(&ctx->uring_lock);
@@ -8906,6 +9196,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {
+ const sigset_t __user *sig;
+ struct __kernel_timespec __user *ts;
+
+ ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
+ if (unlikely(ret))
+ goto out;
+
min_complete = min(min_complete, ctx->cq_entries);
/*
@@ -8918,7 +9215,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
!(ctx->flags & IORING_SETUP_SQPOLL)) {
ret = io_iopoll_check(ctx, min_complete);
} else {
- ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+ ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
}
}
@@ -8932,7 +9229,8 @@ out_fput:
#ifdef CONFIG_PROC_FS
static int io_uring_show_cred(int id, void *p, void *data)
{
- const struct cred *cred = p;
+ struct io_identity *iod = p;
+ const struct cred *cred = iod->creds;
struct seq_file *m = data;
struct user_namespace *uns = seq_user_ns(m);
struct group_info *gi;
@@ -9090,52 +9388,52 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return 0;
}
+static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
+{
+ int ret, fd;
+
+ fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ ret = io_uring_add_task_file(ctx, file);
+ if (ret) {
+ put_unused_fd(fd);
+ return ret;
+ }
+ fd_install(fd, file);
+ return fd;
+}
+
/*
* Allocate an anonymous fd, this is what constitutes the application
* visible backing of an io_uring instance. The application mmaps this
* fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
* we have to tie this fd to a socket for file garbage collection purposes.
*/
-static int io_uring_get_fd(struct io_ring_ctx *ctx)
+static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
struct file *file;
+#if defined(CONFIG_UNIX)
int ret;
-#if defined(CONFIG_UNIX)
ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
&ctx->ring_sock);
if (ret)
- return ret;
+ return ERR_PTR(ret);
#endif
- ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
- if (ret < 0)
- goto err;
-
file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC);
- if (IS_ERR(file)) {
-err_fd:
- put_unused_fd(ret);
- ret = PTR_ERR(file);
- goto err;
- }
-
#if defined(CONFIG_UNIX)
- ctx->ring_sock->file = file;
-#endif
- if (unlikely(io_uring_add_task_file(file))) {
- file = ERR_PTR(-ENOMEM);
- goto err_fd;
+ if (IS_ERR(file)) {
+ sock_release(ctx->ring_sock);
+ ctx->ring_sock = NULL;
+ } else {
+ ctx->ring_sock->file = file;
}
- fd_install(ret, file);
- return ret;
-err:
-#if defined(CONFIG_UNIX)
- sock_release(ctx->ring_sock);
- ctx->ring_sock = NULL;
#endif
- return ret;
+ return file;
}
static int io_uring_create(unsigned entries, struct io_uring_params *p,
@@ -9143,6 +9441,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
{
struct user_struct *user = NULL;
struct io_ring_ctx *ctx;
+ struct file *file;
bool limit_mem;
int ret;
@@ -9169,7 +9468,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
* to a power-of-two, if it isn't already. We do NOT impose
* any cq vs sq ring sizing.
*/
- if (p->cq_entries < p->sq_entries)
+ if (!p->cq_entries)
return -EINVAL;
if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
if (!(p->flags & IORING_SETUP_CLAMP))
@@ -9177,6 +9476,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
p->cq_entries = IORING_MAX_CQ_ENTRIES;
}
p->cq_entries = roundup_pow_of_two(p->cq_entries);
+ if (p->cq_entries < p->sq_entries)
+ return -EINVAL;
} else {
p->cq_entries = 2 * p->sq_entries;
}
@@ -9280,20 +9581,30 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
- IORING_FEAT_POLL_32BITS;
+ IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
+ IORING_FEAT_EXT_ARG;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
goto err;
}
+ file = io_uring_get_file(ctx);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto err;
+ }
+
/*
* Install ring fd as the very last thing, so we don't risk someone
* having closed it before we finish setup
*/
- ret = io_uring_get_fd(ctx);
- if (ret < 0)
- goto err;
+ ret = io_uring_install_fd(ctx, file);
+ if (ret < 0) {
+ /* fput will clean it up */
+ fput(file);
+ return ret;
+ }
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;