diff options
Diffstat (limited to 'fs/io_uring.c')
| -rw-r--r-- | fs/io_uring.c | 984 |
1 files changed, 721 insertions, 263 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c index cfb48bd088e1..a30c4f622cb3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -75,7 +75,7 @@ #include "internal.h" -#define IORING_MAX_ENTRIES 4096 +#define IORING_MAX_ENTRIES 32768 #define IORING_MAX_FIXED_FILES 1024 struct io_uring { @@ -84,27 +84,29 @@ struct io_uring { }; /* - * This data is shared with the application through the mmap at offset - * IORING_OFF_SQ_RING. + * This data is shared with the application through the mmap at offsets + * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. * * The offsets to the member fields are published through struct * io_sqring_offsets when calling io_uring_setup. */ -struct io_sq_ring { +struct io_rings { /* * Head and tail offsets into the ring; the offsets need to be * masked to get valid indices. * - * The kernel controls head and the application controls tail. + * The kernel controls head of the sq ring and the tail of the cq ring, + * and the application controls tail of the sq ring and the head of the + * cq ring. */ - struct io_uring r; + struct io_uring sq, cq; /* - * Bitmask to apply to head and tail offsets (constant, equals + * Bitmasks to apply to head and tail offsets (constant, equals * ring_entries - 1) */ - u32 ring_mask; - /* Ring size (constant, power of 2) */ - u32 ring_entries; + u32 sq_ring_mask, cq_ring_mask; + /* Ring sizes (constant, power of 2) */ + u32 sq_ring_entries, cq_ring_entries; /* * Number of invalid entries dropped by the kernel due to * invalid index stored in array @@ -117,7 +119,7 @@ struct io_sq_ring { * counter includes all submissions that were dropped reaching * the new SQ head (and possibly more). */ - u32 dropped; + u32 sq_dropped; /* * Runtime flags * @@ -127,43 +129,7 @@ struct io_sq_ring { * The application needs a full memory barrier before checking * for IORING_SQ_NEED_WAKEUP after updating the sq tail. */ - u32 flags; - /* - * Ring buffer of indices into array of io_uring_sqe, which is - * mmapped by the application using the IORING_OFF_SQES offset. - * - * This indirection could e.g. be used to assign fixed - * io_uring_sqe entries to operations and only submit them to - * the queue when needed. - * - * The kernel modifies neither the indices array nor the entries - * array. - */ - u32 array[]; -}; - -/* - * This data is shared with the application through the mmap at offset - * IORING_OFF_CQ_RING. - * - * The offsets to the member fields are published through struct - * io_cqring_offsets when calling io_uring_setup. - */ -struct io_cq_ring { - /* - * Head and tail offsets into the ring; the offsets need to be - * masked to get valid indices. - * - * The application controls head and the kernel tail. - */ - struct io_uring r; - /* - * Bitmask to apply to head and tail offsets (constant, equals - * ring_entries - 1) - */ - u32 ring_mask; - /* Ring size (constant, power of 2) */ - u32 ring_entries; + u32 sq_flags; /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure @@ -177,7 +143,7 @@ struct io_cq_ring { * As completion events come in out of order this counter is not * ordered with any other data. */ - u32 overflow; + u32 cq_overflow; /* * Ring buffer of completion events. * @@ -185,7 +151,7 @@ struct io_cq_ring { * produced, so the application is allowed to modify pending * entries. */ - struct io_uring_cqe cqes[]; + struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; struct io_mapped_ubuf { @@ -201,7 +167,7 @@ struct async_list { struct list_head list; struct file *file; - off_t io_end; + off_t io_start; size_t io_len; }; @@ -215,35 +181,49 @@ struct io_ring_ctx { bool compat; bool account_mem; - /* SQ ring */ - struct io_sq_ring *sq_ring; + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. + * + * The kernel modifies neither the indices array nor the entries + * array. + */ + u32 *sq_array; unsigned cached_sq_head; unsigned sq_entries; unsigned sq_mask; unsigned sq_thread_idle; + unsigned cached_sq_dropped; struct io_uring_sqe *sq_sqes; struct list_head defer_list; + struct list_head timeout_list; } ____cacheline_aligned_in_smp; /* IO offload */ - struct workqueue_struct *sqo_wq; + struct workqueue_struct *sqo_wq[2]; struct task_struct *sqo_thread; /* if using sq thread polling */ struct mm_struct *sqo_mm; wait_queue_head_t sqo_wait; struct completion sqo_thread_started; struct { - /* CQ ring */ - struct io_cq_ring *cq_ring; unsigned cached_cq_tail; + atomic_t cached_cq_overflow; unsigned cq_entries; unsigned cq_mask; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; struct eventfd_ctx *cq_ev_fd; + atomic_t cq_timeouts; } ____cacheline_aligned_in_smp; + struct io_rings *rings; + /* * If used, fixed file set. Writers must ensure that ->refs is dead, * readers must ensure that ->refs is alive as long as the file* is @@ -288,6 +268,7 @@ struct io_ring_ctx { struct sqe_submit { const struct io_uring_sqe *sqe; unsigned short index; + u32 sequence; bool has_user; bool needs_lock; bool needs_fixed_file; @@ -306,6 +287,11 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_timeout { + struct file *file; + struct hrtimer timer; +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -317,6 +303,7 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; + struct io_timeout timeout; }; struct sqe_submit submit; @@ -335,6 +322,10 @@ struct io_kiocb { #define REQ_F_LINK 64 /* linked sqes */ #define REQ_F_LINK_DONE 128 /* linked sqes done */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ +#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ +#define REQ_F_TIMEOUT 1024 /* timeout request */ +#define REQ_F_ISREG 2048 /* regular file */ +#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ u64 user_data; u32 result; u32 sequence; @@ -366,6 +357,9 @@ struct io_submit_state { }; static void io_sq_wq_submit_work(struct work_struct *work); +static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res); +static void __io_free_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -421,27 +415,45 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->cancel_list); INIT_LIST_HEAD(&ctx->defer_list); + INIT_LIST_HEAD(&ctx->timeout_list); return ctx; } +static inline bool __io_sequence_defer(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped + + atomic_read(&ctx->cached_cq_overflow); +} + static inline bool io_sequence_defer(struct io_ring_ctx *ctx, struct io_kiocb *req) { if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; - return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped; + return __io_sequence_defer(ctx, req); } static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) { struct io_kiocb *req; - if (list_empty(&ctx->defer_list)) - return NULL; + req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list); + if (req && !io_sequence_defer(ctx, req)) { + list_del_init(&req->list); + return req; + } + + return NULL; +} + +static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; - req = list_first_entry(&ctx->defer_list, struct io_kiocb, list); - if (!io_sequence_defer(ctx, req)) { + req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list); + if (req && !__io_sequence_defer(ctx, req)) { list_del_init(&req->list); return req; } @@ -451,11 +463,11 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) static void __io_commit_cqring(struct io_ring_ctx *ctx) { - struct io_cq_ring *ring = ctx->cq_ring; + struct io_rings *rings = ctx->rings; - if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) { + if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) { /* order cqe stores with ring update */ - smp_store_release(&ring->r.tail, ctx->cached_cq_tail); + smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); if (wq_has_sleeper(&ctx->cq_wait)) { wake_up_interruptible(&ctx->cq_wait); @@ -464,21 +476,69 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } +static inline void io_queue_async_work(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + int rw = 0; + + if (req->submit.sqe) { + switch (req->submit.sqe->opcode) { + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + rw = !(req->rw.ki_flags & IOCB_DIRECT); + break; + } + } + + queue_work(ctx->sqo_wq[rw], &req->work); +} + +static void io_kill_timeout(struct io_kiocb *req) +{ + int ret; + + ret = hrtimer_try_to_cancel(&req->timeout.timer); + if (ret != -1) { + atomic_inc(&req->ctx->cq_timeouts); + list_del(&req->list); + io_cqring_fill_event(req->ctx, req->user_data, 0); + __io_free_req(req); + } +} + +static void io_kill_timeouts(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req, *tmp; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) + io_kill_timeout(req); + spin_unlock_irq(&ctx->completion_lock); +} + static void io_commit_cqring(struct io_ring_ctx *ctx) { struct io_kiocb *req; + while ((req = io_get_timeout_req(ctx)) != NULL) + io_kill_timeout(req); + __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { + if (req->flags & REQ_F_SHADOW_DRAIN) { + /* Just for drain, free it. */ + __io_free_req(req); + continue; + } req->flags |= REQ_F_IO_DRAINED; - queue_work(ctx->sqo_wq, &req->work); + io_queue_async_work(ctx, req); } } static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { - struct io_cq_ring *ring = ctx->cq_ring; + struct io_rings *rings = ctx->rings; unsigned tail; tail = ctx->cached_cq_tail; @@ -487,11 +547,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ - if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) + if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries) return NULL; ctx->cached_cq_tail++; - return &ring->cqes[tail & ctx->cq_mask]; + return &rings->cqes[tail & ctx->cq_mask]; } static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, @@ -510,9 +570,8 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, 0); } else { - unsigned overflow = READ_ONCE(ctx->cq_ring->overflow); - - WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1); + WRITE_ONCE(ctx->rings->cq_overflow, + atomic_inc_return(&ctx->cached_cq_overflow)); } } @@ -539,14 +598,6 @@ static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, io_cqring_ev_posted(ctx); } -static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) -{ - percpu_ref_put_many(&ctx->refs, refs); - - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); -} - static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, struct io_submit_state *state) { @@ -594,7 +645,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req->result = 0; return req; out: - io_ring_drop_ctx_refs(ctx, 1); + percpu_ref_put(&ctx->refs); return NULL; } @@ -602,7 +653,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) { if (*nr) { kmem_cache_free_bulk(req_cachep, *nr, reqs); - io_ring_drop_ctx_refs(ctx, *nr); + percpu_ref_put_many(&ctx->refs, *nr); *nr = 0; } } @@ -611,7 +662,7 @@ static void __io_free_req(struct io_kiocb *req) { if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); - io_ring_drop_ctx_refs(req->ctx, 1); + percpu_ref_put(&req->ctx->refs); kmem_cache_free(req_cachep, req); } @@ -635,7 +686,7 @@ static void io_req_link_next(struct io_kiocb *req) nxt->flags |= REQ_F_LINK_DONE; INIT_WORK(&nxt->work, io_sq_wq_submit_work); - queue_work(req->ctx->sqo_wq, &nxt->work); + io_queue_async_work(req->ctx, nxt); } } @@ -679,11 +730,19 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } -static unsigned io_cqring_events(struct io_cq_ring *ring) +static unsigned io_cqring_events(struct io_rings *rings) { /* See comment at the top of this file */ smp_rmb(); - return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head); + return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); +} + +static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = ctx->rings; + + /* make sure SQ entry isn't read before tail */ + return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } /* @@ -815,19 +874,11 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, - long min) +static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) { - int iters, ret = 0; - - /* - * We disallow the app entering submit/complete with polling, but we - * still need to lock the ring to prevent racing with polled issue - * that got punted to a workqueue. - */ - mutex_lock(&ctx->uring_lock); + int iters = 0, ret = 0; - iters = 0; do { int tmin = 0; @@ -836,7 +887,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (io_cqring_events(ctx->cq_ring)) + if (io_cqring_events(ctx->rings)) break; /* @@ -863,30 +914,45 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, ret = 0; } while (min && !*nr_events && !need_resched()); + return ret; +} + +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) +{ + int ret; + + /* + * We disallow the app entering submit/complete with polling, but we + * still need to lock the ring to prevent racing with polled issue + * that got punted to a workqueue. + */ + mutex_lock(&ctx->uring_lock); + ret = __io_iopoll_check(ctx, nr_events, min); mutex_unlock(&ctx->uring_lock); return ret; } -static void kiocb_end_write(struct kiocb *kiocb) +static void kiocb_end_write(struct io_kiocb *req) { - if (kiocb->ki_flags & IOCB_WRITE) { - struct inode *inode = file_inode(kiocb->ki_filp); + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (req->flags & REQ_F_ISREG) { + struct inode *inode = file_inode(req->file); - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ - if (S_ISREG(inode->i_mode)) - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - file_end_write(kiocb->ki_filp); + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); } + file_end_write(req->file); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); - kiocb_end_write(kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; @@ -898,7 +964,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); - kiocb_end_write(kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; @@ -1012,8 +1079,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, if (!req->file) return -EBADF; - if (force_nonblock && !io_file_supports_async(req->file)) - force_nonblock = false; + if (S_ISREG(file_inode(req->file)->i_mode)) + req->flags |= REQ_F_ISREG; + + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(req->file)) { + req->flags |= REQ_F_MUST_PUNT; + return -EAGAIN; + } kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); @@ -1034,7 +1110,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, return ret; /* don't allow async punt if RWF_NOWAIT was requested */ - if (kiocb->ki_flags & IOCB_NOWAIT) + if ((kiocb->ki_flags & IOCB_NOWAIT) || + (req->file->f_flags & O_NONBLOCK)) req->flags |= REQ_F_NOWAIT; if (force_nonblock) @@ -1187,6 +1264,28 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); } +static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb) +{ + if (al->file == kiocb->ki_filp) { + off_t start, end; + + /* + * Allow merging if we're anywhere in the range of the same + * page. Generally this happens for sub-page reads or writes, + * and it's beneficial to allow the first worker to bring the + * page in and the piggy backed work can then work on the + * cached page. + */ + start = al->io_start & PAGE_MASK; + end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK; + if (kiocb->ki_pos >= start && kiocb->ki_pos <= end) + return true; + } + + al->file = NULL; + return false; +} + /* * Make a note of the last file/offset/direction we punted to async * context. We'll use this information to see if we can piggy back a @@ -1198,9 +1297,8 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) struct async_list *async_list = &req->ctx->pending_async[rw]; struct kiocb *kiocb = &req->rw; struct file *filp = kiocb->ki_filp; - off_t io_end = kiocb->ki_pos + len; - if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) { + if (io_should_merge(async_list, kiocb)) { unsigned long max_bytes; /* Use 8x RA size as a decent limiter for both reads/writes */ @@ -1213,17 +1311,61 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) req->flags |= REQ_F_SEQ_PREV; async_list->io_len += len; } else { - io_end = 0; - async_list->io_len = 0; + async_list->file = NULL; } } /* New file? Reset state. */ if (async_list->file != filp) { - async_list->io_len = 0; + async_list->io_start = kiocb->ki_pos; + async_list->io_len = len; async_list->file = filp; } - async_list->io_end = io_end; +} + +/* + * For files that don't have ->read_iter() and ->write_iter(), handle them + * by looping over ->read() or ->write() manually. + */ +static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, + struct iov_iter *iter) +{ + ssize_t ret = 0; + + /* + * Don't support polled IO through this interface, and we can't + * support non-blocking either. For the latter, this just causes + * the kiocb to be handled from an async context. + */ + if (kiocb->ki_flags & IOCB_HIPRI) + return -EOPNOTSUPP; + if (kiocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + while (iov_iter_count(iter)) { + struct iovec iovec = iov_iter_iovec(iter); + ssize_t nr; + + if (rw == READ) { + nr = file->f_op->read(file, iovec.iov_base, + iovec.iov_len, &kiocb->ki_pos); + } else { + nr = file->f_op->write(file, iovec.iov_base, + iovec.iov_len, &kiocb->ki_pos); + } + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (nr != iovec.iov_len) + break; + iov_iter_advance(iter, nr); + } + + return ret; } static int io_read(struct io_kiocb *req, const struct sqe_submit *s, @@ -1243,8 +1385,6 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; - if (unlikely(!file->f_op->read_iter)) - return -EINVAL; ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); if (ret < 0) @@ -1259,7 +1399,11 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, if (!ret) { ssize_t ret2; - ret2 = call_read_iter(file, kiocb, &iter); + if (file->f_op->read_iter) + ret2 = call_read_iter(file, kiocb, &iter); + else + ret2 = loop_rw_iter(READ, file, kiocb, &iter); + /* * In case of a short read, punt to async. This can happen * if we have data partially cached. Alternatively we can @@ -1268,7 +1412,9 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, * need async punt anyway, so it's more efficient to do it * here. */ - if (force_nonblock && ret2 > 0 && ret2 < read_size) + if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && + (req->flags & REQ_F_ISREG) && + ret2 > 0 && ret2 < read_size) ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { @@ -1304,8 +1450,6 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, file = kiocb->ki_filp; if (unlikely(!(file->f_mode & FMODE_WRITE))) return -EBADF; - if (unlikely(!file->f_op->write_iter)) - return -EINVAL; ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); if (ret < 0) @@ -1335,7 +1479,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, * released so that it doesn't complain about the held lock when * we return to userspace. */ - if (S_ISREG(file_inode(file)->i_mode)) { + if (req->flags & REQ_F_ISREG) { __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); __sb_writers_release(file_inode(file)->i_sb, @@ -1343,7 +1487,10 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, } kiocb->ki_flags |= IOCB_WRITE; - ret2 = call_write_iter(file, kiocb, &iter); + if (file->f_op->write_iter) + ret2 = call_write_iter(file, kiocb, &iter); + else + ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) { io_rw_done(kiocb, ret2); } else { @@ -1535,7 +1682,7 @@ static void io_poll_remove_one(struct io_kiocb *req) WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); - queue_work(req->ctx->sqo_wq, &req->work); + io_queue_async_work(req->ctx, req); } spin_unlock(&poll->head->lock); @@ -1649,7 +1796,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, io_cqring_ev_posted(ctx); io_put_req(req); } else { - queue_work(ctx->sqo_wq, &req->work); + io_queue_async_work(ctx, req); } return 1; @@ -1692,6 +1839,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!poll->file) return -EBADF; + req->submit.sqe = NULL; INIT_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; @@ -1743,6 +1891,114 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ipt.error; } +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) +{ + struct io_ring_ctx *ctx; + struct io_kiocb *req, *prev; + unsigned long flags; + + req = container_of(timer, struct io_kiocb, timeout.timer); + ctx = req->ctx; + atomic_inc(&ctx->cq_timeouts); + + spin_lock_irqsave(&ctx->completion_lock, flags); + /* + * Adjust the reqs sequence before the current one because it + * will consume a slot in the cq_ring and the the cq_tail pointer + * will be increased, otherwise other timeout reqs may return in + * advance without waiting for enough wait_nr. + */ + prev = req; + list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list) + prev->sequence++; + list_del(&req->list); + + io_cqring_fill_event(ctx, req->user_data, -ETIME); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + + io_put_req(req); + return HRTIMER_NORESTART; +} + +static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned count; + struct io_ring_ctx *ctx = req->ctx; + struct list_head *entry; + struct timespec64 ts; + unsigned span = 0; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags || + sqe->len != 1) + return -EINVAL; + + if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) + return -EFAULT; + + /* + * sqe->off holds how many events that need to occur for this + * timeout event to be satisfied. + */ + count = READ_ONCE(sqe->off); + if (!count) + count = 1; + + req->sequence = ctx->cached_sq_head + count - 1; + /* reuse it to store the count */ + req->submit.sequence = count; + req->flags |= REQ_F_TIMEOUT; + + /* + * Insertion sort, ensuring the first entry in the list is always + * the one we need first. + */ + spin_lock_irq(&ctx->completion_lock); + list_for_each_prev(entry, &ctx->timeout_list) { + struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); + unsigned nxt_sq_head; + long long tmp, tmp_nxt; + + /* + * Since cached_sq_head + count - 1 can overflow, use type long + * long to store it. + */ + tmp = (long long)ctx->cached_sq_head + count - 1; + nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1; + tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1; + + /* + * cached_sq_head may overflow, and it will never overflow twice + * once there is some timeout req still be valid. + */ + if (ctx->cached_sq_head < nxt_sq_head) + tmp += UINT_MAX; + + if (tmp > tmp_nxt) + break; + + /* + * Sequence of reqs after the insert one and itself should + * be adjusted because each timeout req consumes a slot. + */ + span++; + nxt->sequence++; + } + req->sequence -= span; + list_add(&req->list, entry); + spin_unlock_irq(&ctx->completion_lock); + + hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + req->timeout.timer.function = io_timeout_fn; + hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), + HRTIMER_MODE_REL); + return 0; +} + static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -1820,6 +2076,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_RECVMSG: ret = io_recvmsg(req, s->sqe, force_nonblock); break; + case IORING_OP_TIMEOUT: + ret = io_timeout(req, s->sqe); + break; default: ret = -EINVAL; break; @@ -1992,7 +2251,7 @@ out: */ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req) { - bool ret = false; + bool ret; if (!list) return false; @@ -2038,10 +2297,14 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, flags = READ_ONCE(s->sqe->flags); fd = READ_ONCE(s->sqe->fd); - if (flags & IOSQE_IO_DRAIN) { + if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - req->sequence = ctx->cached_sq_head - 1; - } + /* + * All io need record the previous position, if LINK vs DARIN, + * it can be used to mark the position of the first IO in the + * link list. + */ + req->sequence = s->sequence; if (!io_op_needs_file(s->sqe)) return 0; @@ -2063,38 +2326,33 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, return 0; } -static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, +static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, struct sqe_submit *s) { int ret; - ret = io_req_defer(ctx, req, s->sqe); - if (ret) { - if (ret != -EIOCBQUEUED) { - io_free_req(req); - io_cqring_add_event(ctx, s->sqe->user_data, ret); - } - return 0; - } - ret = __io_submit_sqe(ctx, req, s, true); - if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + + /* + * We async punt it if the file wasn't marked NOWAIT, or if the file + * doesn't support non-blocking read/write attempts + */ + if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || + (req->flags & REQ_F_MUST_PUNT))) { struct io_uring_sqe *sqe_copy; - sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); + sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); if (sqe_copy) { struct async_list *list; - memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy)); s->sqe = sqe_copy; - memcpy(&req->submit, s, sizeof(*s)); list = io_async_list_from_sqe(ctx, s->sqe); if (!io_add_to_prev_work(list, req)) { if (list) atomic_inc(&list->cnt); INIT_WORK(&req->work, io_sq_wq_submit_work); - queue_work(ctx->sqo_wq, &req->work); + io_queue_async_work(ctx, req); } /* @@ -2119,6 +2377,64 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return ret; } +static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s) +{ + int ret; + + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret != -EIOCBQUEUED) { + io_free_req(req); + io_cqring_add_event(ctx, s->sqe->user_data, ret); + } + return 0; + } + + return __io_queue_sqe(ctx, req, s); +} + +static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s, struct io_kiocb *shadow) +{ + int ret; + int need_submit = false; + + if (!shadow) + return io_queue_sqe(ctx, req, s); + + /* + * Mark the first IO in link list as DRAIN, let all the following + * IOs enter the defer list. all IO needs to be completed before link + * list. + */ + req->flags |= REQ_F_IO_DRAIN; + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret != -EIOCBQUEUED) { + io_free_req(req); + io_cqring_add_event(ctx, s->sqe->user_data, ret); + return 0; + } + } else { + /* + * If ret == 0 means that all IOs in front of link io are + * running done. let's queue link head. + */ + need_submit = true; + } + + /* Insert shadow req to defer_list, blocking next IOs */ + spin_lock_irq(&ctx->completion_lock); + list_add_tail(&shadow->list, &ctx->defer_list); + spin_unlock_irq(&ctx->completion_lock); + + if (need_submit) + return __io_queue_sqe(ctx, req, s); + + return 0; +} + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, @@ -2149,6 +2465,8 @@ err: return; } + req->user_data = s->sqe->user_data; + /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but @@ -2205,15 +2523,15 @@ static void io_submit_state_start(struct io_submit_state *state, static void io_commit_sqring(struct io_ring_ctx *ctx) { - struct io_sq_ring *ring = ctx->sq_ring; + struct io_rings *rings = ctx->rings; - if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) { + if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) { /* * Ensure any loads from the SQEs are done at this point, * since once we write the new head, the application could * write new data to them. */ - smp_store_release(&ring->r.head, ctx->cached_sq_head); + smp_store_release(&rings->sq.head, ctx->cached_sq_head); } } @@ -2227,7 +2545,8 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) */ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) { - struct io_sq_ring *ring = ctx->sq_ring; + struct io_rings *rings = ctx->rings; + u32 *sq_array = ctx->sq_array; unsigned head; /* @@ -2240,28 +2559,31 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) */ head = ctx->cached_sq_head; /* make sure SQ entry isn't read before tail */ - if (head == smp_load_acquire(&ring->r.tail)) + if (head == smp_load_acquire(&rings->sq.tail)) return false; - head = READ_ONCE(ring->array[head & ctx->sq_mask]); + head = READ_ONCE(sq_array[head & ctx->sq_mask]); if (head < ctx->sq_entries) { s->index = head; s->sqe = &ctx->sq_sqes[head]; + s->sequence = ctx->cached_sq_head; ctx->cached_sq_head++; return true; } /* drop invalid entries */ ctx->cached_sq_head++; - ring->dropped++; + ctx->cached_sq_dropped++; + WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped); return false; } -static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, - unsigned int nr, bool has_user, bool mm_fault) +static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, + bool has_user, bool mm_fault) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; + struct io_kiocb *shadow_req = NULL; bool prev_was_link = false; int i, submitted = 0; @@ -2271,30 +2593,48 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, } for (i = 0; i < nr; i++) { + struct sqe_submit s; + + if (!io_get_sqring(ctx, &s)) + break; + /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link. */ if (!prev_was_link && link) { - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req); link = NULL; + shadow_req = NULL; + } + prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; + + if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { + if (!shadow_req) { + shadow_req = io_get_req(ctx, NULL); + if (unlikely(!shadow_req)) + goto out; + shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); + refcount_dec(&shadow_req->refs); + } + shadow_req->sequence = s.sequence; } - prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; +out: if (unlikely(mm_fault)) { - io_cqring_add_event(ctx, sqes[i].sqe->user_data, + io_cqring_add_event(ctx, s.sqe->user_data, -EFAULT); } else { - sqes[i].has_user = has_user; - sqes[i].needs_lock = true; - sqes[i].needs_fixed_file = true; - io_submit_sqe(ctx, &sqes[i], statep, &link); + s.has_user = has_user; + s.needs_lock = true; + s.needs_fixed_file = true; + io_submit_sqe(ctx, &s, statep, &link); submitted++; } } if (link) - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req); if (statep) io_submit_state_end(&state); @@ -2303,7 +2643,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, static int io_sq_thread(void *data) { - struct sqe_submit sqes[IO_IOPOLL_BATCH]; struct io_ring_ctx *ctx = data; struct mm_struct *cur_mm = NULL; mm_segment_t old_fs; @@ -2318,14 +2657,27 @@ static int io_sq_thread(void *data) timeout = inflight = 0; while (!kthread_should_park()) { - bool all_fixed, mm_fault = false; - int i; + bool mm_fault = false; + unsigned int to_submit; if (inflight) { unsigned nr_events = 0; if (ctx->flags & IORING_SETUP_IOPOLL) { - io_iopoll_check(ctx, &nr_events, 0); + /* + * inflight is the count of the maximum possible + * entries we submitted, but it can be smaller + * if we dropped some of them. If we don't have + * poll entries available, then we know that we + * have nothing left to poll for. Reset the + * inflight count to zero in that case. + */ + mutex_lock(&ctx->uring_lock); + if (!list_empty(&ctx->poll_list)) + __io_iopoll_check(ctx, &nr_events, 0); + else + inflight = 0; + mutex_unlock(&ctx->uring_lock); } else { /* * Normal IO, just pretend everything completed. @@ -2339,14 +2691,15 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } - if (!io_get_sqring(ctx, &sqes[0])) { + to_submit = io_sqring_entries(ctx); + if (!to_submit) { /* * We're polling. If we're within the defined idle * period, then let us spin without work before going * to sleep. */ if (inflight || !time_after(jiffies, timeout)) { - cpu_relax(); + cond_resched(); continue; } @@ -2366,11 +2719,12 @@ static int io_sq_thread(void *data) TASK_INTERRUPTIBLE); /* Tell userspace we may need a wakeup call */ - ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; + ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; /* make sure to read SQ tail after writing flags */ smp_mb(); - if (!io_get_sqring(ctx, &sqes[0])) { + to_submit = io_sqring_entries(ctx); + if (!to_submit) { if (kthread_should_park()) { finish_wait(&ctx->sqo_wait, &wait); break; @@ -2380,27 +2734,16 @@ static int io_sq_thread(void *data) schedule(); finish_wait(&ctx->sqo_wait, &wait); - ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; continue; } finish_wait(&ctx->sqo_wait, &wait); - ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } - i = 0; - all_fixed = true; - do { - if (all_fixed && io_sqe_needs_user(sqes[i].sqe)) - all_fixed = false; - - i++; - if (i == ARRAY_SIZE(sqes)) - break; - } while (io_get_sqring(ctx, &sqes[i])); - /* Unless all new commands are FIXED regions, grab mm */ - if (!all_fixed && !cur_mm) { + if (!cur_mm) { mm_fault = !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -2408,8 +2751,9 @@ static int io_sq_thread(void *data) } } - inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL, - mm_fault); + to_submit = min(to_submit, ctx->sq_entries); + inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL, + mm_fault); /* Commit SQ ring head once we've consumed all SQEs */ io_commit_sqring(ctx); @@ -2430,6 +2774,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; + struct io_kiocb *shadow_req = NULL; bool prev_was_link = false; int i, submit = 0; @@ -2449,27 +2794,73 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) * that's the end of the chain. Submit the previous link. */ if (!prev_was_link && link) { - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req); link = NULL; + shadow_req = NULL; } prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; + if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { + if (!shadow_req) { + shadow_req = io_get_req(ctx, NULL); + if (unlikely(!shadow_req)) + goto out; + shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); + refcount_dec(&shadow_req->refs); + } + shadow_req->sequence = s.sequence; + } + +out: s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; submit++; io_submit_sqe(ctx, &s, statep, &link); } - io_commit_sqring(ctx); if (link) - io_queue_sqe(ctx, link, &link->submit); + io_queue_link_head(ctx, link, &link->submit, shadow_req); if (statep) io_submit_state_end(statep); + io_commit_sqring(ctx); + return submit; } +struct io_wait_queue { + struct wait_queue_entry wq; + struct io_ring_ctx *ctx; + unsigned to_wait; + unsigned nr_timeouts; +}; + +static inline bool io_should_wake(struct io_wait_queue *iowq) +{ + struct io_ring_ctx *ctx = iowq->ctx; + + /* + * Wake up if we have enough events, or if a timeout occured since we + * started waiting. For timeouts, we always want to return to userspace, + * regardless of event count. + */ + return io_cqring_events(ctx->rings) >= iowq->to_wait || + atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; +} + +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, + int wake_flags, void *key) +{ + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, + wq); + + if (!io_should_wake(iowq)) + return -1; + + return autoremove_wake_function(curr, mode, wake_flags, key); +} + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. @@ -2477,10 +2868,19 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { - struct io_cq_ring *ring = ctx->cq_ring; + struct io_wait_queue iowq = { + .wq = { + .private = current, + .func = io_wake_function, + .entry = LIST_HEAD_INIT(iowq.wq.entry), + }, + .ctx = ctx, + .to_wait = min_events, + }; + struct io_rings *rings = ctx->rings; int ret; - if (io_cqring_events(ring) >= min_events) + if (io_cqring_events(rings) >= min_events) return 0; if (sig) { @@ -2496,12 +2896,26 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events); + ret = 0; + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + do { + prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, + TASK_INTERRUPTIBLE); + if (io_should_wake(&iowq)) + break; + schedule(); + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + } while (1); + finish_wait(&ctx->wait, &iowq.wq); + restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; - return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0; + return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -2551,11 +2965,15 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) static void io_finish_async(struct io_ring_ctx *ctx) { + int i; + io_sq_thread_stop(ctx); - if (ctx->sqo_wq) { - destroy_workqueue(ctx->sqo_wq); - ctx->sqo_wq = NULL; + for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) { + if (ctx->sqo_wq[i]) { + destroy_workqueue(ctx->sqo_wq[i]); + ctx->sqo_wq[i] = NULL; + } } } @@ -2563,8 +2981,12 @@ static void io_finish_async(struct io_ring_ctx *ctx) static void io_destruct_skb(struct sk_buff *skb) { struct io_ring_ctx *ctx = skb->sk->sk_user_data; + int i; + + for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) + if (ctx->sqo_wq[i]) + flush_workqueue(ctx->sqo_wq[i]); - io_finish_async(ctx); unix_destruct_scm(skb); } @@ -2763,16 +3185,31 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, } /* Do QD, or 2 * CPUS, whatever is smallest */ - ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE, + ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq", + WQ_UNBOUND | WQ_FREEZABLE, min(ctx->sq_entries - 1, 2 * num_online_cpus())); - if (!ctx->sqo_wq) { + if (!ctx->sqo_wq[0]) { + ret = -ENOMEM; + goto err; + } + + /* + * This is for buffered writes, where we want to limit the parallelism + * due to file locking in file systems. As "normal" buffered writes + * should parellelize on writeout quite nicely, limit us to having 2 + * pending. This avoids massive contention on the inode when doing + * buffered async writes. + */ + ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq", + WQ_UNBOUND | WQ_FREEZABLE, 2); + if (!ctx->sqo_wq[1]) { ret = -ENOMEM; goto err; } return 0; err: - io_sq_thread_stop(ctx); + io_finish_async(ctx); mmdrop(ctx->sqo_mm); ctx->sqo_mm = NULL; return ret; @@ -2821,17 +3258,45 @@ static void *io_mem_alloc(size_t size) return (void *) __get_free_pages(gfp_flags, get_order(size)); } +static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, + size_t *sq_offset) +{ + struct io_rings *rings; + size_t off, sq_array_size; + + off = struct_size(rings, cqes, cq_entries); + if (off == SIZE_MAX) + return SIZE_MAX; + +#ifdef CONFIG_SMP + off = ALIGN(off, SMP_CACHE_BYTES); + if (off == 0) + return SIZE_MAX; +#endif + + sq_array_size = array_size(sizeof(u32), sq_entries); + if (sq_array_size == SIZE_MAX) + return SIZE_MAX; + + if (check_add_overflow(off, sq_array_size, &off)) + return SIZE_MAX; + + if (sq_offset) + *sq_offset = off; + + return off; +} + static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries) { - struct io_sq_ring *sq_ring; - struct io_cq_ring *cq_ring; - size_t bytes; + size_t pages; - bytes = struct_size(sq_ring, array, sq_entries); - bytes += array_size(sizeof(struct io_uring_sqe), sq_entries); - bytes += struct_size(cq_ring, cqes, cq_entries); + pages = (size_t)1 << get_order( + rings_size(sq_entries, cq_entries, NULL)); + pages += (size_t)1 << get_order( + array_size(sizeof(struct io_uring_sqe), sq_entries)); - return (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + return pages; } static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) @@ -2845,7 +3310,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; for (j = 0; j < imu->nr_bvecs; j++) - put_page(imu->bvec[j].bv_page); + put_user_page(imu->bvec[j].bv_page); if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); @@ -2989,10 +3454,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, * if we did partial map, or found file backed vmas, * release any pages we did get */ - if (pret > 0) { - for (j = 0; j < pret; j++) - put_page(pages[j]); - } + if (pret > 0) + put_user_pages(pages, pret); if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); kvfree(imu->bvec); @@ -3078,9 +3541,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) } #endif - io_mem_free(ctx->sq_ring); + io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); - io_mem_free(ctx->cq_ring); percpu_ref_exit(&ctx->refs); if (ctx->account_mem) @@ -3101,10 +3563,10 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * io_commit_cqring */ smp_rmb(); - if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != - ctx->sq_ring->ring_entries) + if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != + ctx->rings->sq_ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; - if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) + if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -3123,6 +3585,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); + io_kill_timeouts(ctx); io_poll_remove_all(ctx); io_iopoll_reap_events(ctx); wait_for_completion(&ctx->ctx_done); @@ -3149,20 +3612,18 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) switch (offset) { case IORING_OFF_SQ_RING: - ptr = ctx->sq_ring; + case IORING_OFF_CQ_RING: + ptr = ctx->rings; break; case IORING_OFF_SQES: ptr = ctx->sq_sqes; break; - case IORING_OFF_CQ_RING: - ptr = ctx->cq_ring; - break; default: return -EINVAL; } page = virt_to_head_page(ptr); - if (sz > (PAGE_SIZE << compound_order(page))) + if (sz > page_size(page)) return -EINVAL; pfn = virt_to_phys(ptr) >> PAGE_SHIFT; @@ -3199,15 +3660,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, * Just return the requested submit count, and wake the thread if * we were asked to. */ + ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sqo_wait); submitted = to_submit; - goto out_ctx; - } - - ret = 0; - if (to_submit) { + } else if (to_submit) { to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); @@ -3226,8 +3684,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } } -out_ctx: - io_ring_drop_ctx_refs(ctx, 1); + percpu_ref_put(&ctx->refs); out_fput: fdput(f); return submitted ? submitted : ret; @@ -3243,19 +3700,27 @@ static const struct file_operations io_uring_fops = { static int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct io_sq_ring *sq_ring; - struct io_cq_ring *cq_ring; - size_t size; + struct io_rings *rings; + size_t size, sq_array_offset; - sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries)); - if (!sq_ring) + size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); + if (size == SIZE_MAX) + return -EOVERFLOW; + + rings = io_mem_alloc(size); + if (!rings) return -ENOMEM; - ctx->sq_ring = sq_ring; - sq_ring->ring_mask = p->sq_entries - 1; - sq_ring->ring_entries = p->sq_entries; - ctx->sq_mask = sq_ring->ring_mask; - ctx->sq_entries = sq_ring->ring_entries; + ctx->rings = rings; + ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); + rings->sq_ring_mask = p->sq_entries - 1; + rings->cq_ring_mask = p->cq_entries - 1; + rings->sq_ring_entries = p->sq_entries; + rings->cq_ring_entries = p->cq_entries; + ctx->sq_mask = rings->sq_ring_mask; + ctx->cq_mask = rings->cq_ring_mask; + ctx->sq_entries = rings->sq_ring_entries; + ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) @@ -3265,15 +3730,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (!ctx->sq_sqes) return -ENOMEM; - cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); - if (!cq_ring) - return -ENOMEM; - - ctx->cq_ring = cq_ring; - cq_ring->ring_mask = p->cq_entries - 1; - cq_ring->ring_entries = p->cq_entries; - ctx->cq_mask = cq_ring->ring_mask; - ctx->cq_entries = cq_ring->ring_entries; return 0; } @@ -3377,21 +3833,23 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; memset(&p->sq_off, 0, sizeof(p->sq_off)); - p->sq_off.head = offsetof(struct io_sq_ring, r.head); - p->sq_off.tail = offsetof(struct io_sq_ring, r.tail); - p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask); - p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries); - p->sq_off.flags = offsetof(struct io_sq_ring, flags); - p->sq_off.dropped = offsetof(struct io_sq_ring, dropped); - p->sq_off.array = offsetof(struct io_sq_ring, array); + p->sq_off.head = offsetof(struct io_rings, sq.head); + p->sq_off.tail = offsetof(struct io_rings, sq.tail); + p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); + p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); + p->sq_off.flags = offsetof(struct io_rings, sq_flags); + p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); + p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; memset(&p->cq_off, 0, sizeof(p->cq_off)); - p->cq_off.head = offsetof(struct io_cq_ring, r.head); - p->cq_off.tail = offsetof(struct io_cq_ring, r.tail); - p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask); - p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries); - p->cq_off.overflow = offsetof(struct io_cq_ring, overflow); - p->cq_off.cqes = offsetof(struct io_cq_ring, cqes); + p->cq_off.head = offsetof(struct io_rings, cq.head); + p->cq_off.tail = offsetof(struct io_rings, cq.tail); + p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); + p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); + p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); + p->cq_off.cqes = offsetof(struct io_rings, cqes); + + p->features = IORING_FEAT_SINGLE_MMAP; return ret; err: io_ring_ctx_wait_and_kill(ctx); |