aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block/bfq-iosched.c9
-rw-r--r--block/blk-cgroup.c9
-rw-r--r--block/blk-core.c1
-rw-r--r--block/blk-flush.c110
-rw-r--r--block/blk-mq-debugfs.c8
-rw-r--r--block/blk-mq-sched.h8
-rw-r--r--block/blk-mq.c121
-rw-r--r--block/blk-mq.h11
-rw-r--r--block/blk-rq-qos.c20
-rw-r--r--block/blk-wbt.c2
-rw-r--r--block/blk-zoned.c8
-rw-r--r--block/blk.h2
-rw-r--r--block/fops.c3
-rw-r--r--block/mq-deadline.c125
-rw-r--r--drivers/block/brd.c93
-rw-r--r--drivers/block/ublk_drv.c458
-rw-r--r--fs/Makefile10
-rw-r--r--fs/inode.c3
-rw-r--r--fs/no-block.c19
-rw-r--r--include/linux/blk-mq.h64
-rw-r--r--include/linux/blkdev.h15
-rw-r--r--include/trace/events/block.h26
-rw-r--r--include/uapi/linux/ublk_cmd.h25
23 files changed, 707 insertions, 443 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3164e3177965..09bbbcf9e049 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5403,6 +5403,10 @@ void bfq_put_queue(struct bfq_queue *bfqq)
if (bfqq->bfqd->last_completed_rq_bfqq == bfqq)
bfqq->bfqd->last_completed_rq_bfqq = NULL;
+ WARN_ON_ONCE(!list_empty(&bfqq->fifo));
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&bfqq->sort_list));
+ WARN_ON_ONCE(bfqq->dispatched);
+
kmem_cache_free(bfq_pool, bfqq);
bfqg_and_blkg_put(bfqg);
}
@@ -7135,6 +7139,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
{
struct bfq_data *bfqd = e->elevator_data;
struct bfq_queue *bfqq, *n;
+ unsigned int actuator;
hrtimer_cancel(&bfqd->idle_slice_timer);
@@ -7143,6 +7148,10 @@ static void bfq_exit_queue(struct elevator_queue *e)
bfq_deactivate_bfqq(bfqd, bfqq, false, false);
spin_unlock_irq(&bfqd->lock);
+ for (actuator = 0; actuator < bfqd->num_actuators; actuator++)
+ WARN_ON_ONCE(bfqd->rq_in_driver[actuator]);
+ WARN_ON_ONCE(bfqd->tot_rq_in_driver);
+
hrtimer_cancel(&bfqd->idle_slice_timer);
/* release oom-queue reference to root group */
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ce64dd73cfe..1c6716f51fff 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -748,6 +748,13 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
return -ENODEV;
}
+ mutex_lock(&bdev->bd_queue->rq_qos_mutex);
+ if (!disk_live(bdev->bd_disk)) {
+ blkdev_put_no_open(bdev);
+ mutex_unlock(&bdev->bd_queue->rq_qos_mutex);
+ return -ENODEV;
+ }
+
ctx->body = input;
ctx->bdev = bdev;
return 0;
@@ -892,6 +899,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
*/
void blkg_conf_exit(struct blkg_conf_ctx *ctx)
__releases(&ctx->bdev->bd_queue->queue_lock)
+ __releases(&ctx->bdev->bd_queue->rq_qos_mutex)
{
if (ctx->blkg) {
spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
@@ -899,6 +907,7 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx)
}
if (ctx->bdev) {
+ mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
blkdev_put_no_open(ctx->bdev);
ctx->body = NULL;
ctx->bdev = NULL;
diff --git a/block/blk-core.c b/block/blk-core.c
index 00c74330fa92..2ae22bebeb3e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -420,6 +420,7 @@ struct request_queue *blk_alloc_queue(int node_id)
mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock);
+ mutex_init(&q->rq_qos_mutex);
spin_lock_init(&q->queue_lock);
init_waitqueue_head(&q->mq_freeze_wq);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 04698ed9bcd4..dba392cf22be 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -188,7 +188,9 @@ static void blk_flush_complete_seq(struct request *rq,
case REQ_FSEQ_DATA:
list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
- blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD);
+ spin_lock(&q->requeue_lock);
+ list_add_tail(&rq->queuelist, &q->flush_list);
+ spin_unlock(&q->requeue_lock);
blk_mq_kick_requeue_list(q);
break;
@@ -346,7 +348,10 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
smp_wmb();
req_ref_set(flush_rq, 1);
- blk_mq_add_to_requeue_list(flush_rq, 0);
+ spin_lock(&q->requeue_lock);
+ list_add_tail(&flush_rq->queuelist, &q->flush_list);
+ spin_unlock(&q->requeue_lock);
+
blk_mq_kick_requeue_list(q);
}
@@ -376,22 +381,29 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
return RQ_END_IO_NONE;
}
-/**
- * blk_insert_flush - insert a new PREFLUSH/FUA request
- * @rq: request to insert
- *
- * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
- * or __blk_mq_run_hw_queue() to dispatch request.
- * @rq is being submitted. Analyze what needs to be done and put it on the
- * right queue.
+static void blk_rq_init_flush(struct request *rq)
+{
+ rq->flush.seq = 0;
+ INIT_LIST_HEAD(&rq->flush.list);
+ rq->rq_flags |= RQF_FLUSH_SEQ;
+ rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
+ rq->end_io = mq_flush_data_end_io;
+}
+
+/*
+ * Insert a PREFLUSH/FUA request into the flush state machine.
+ * Returns true if the request has been consumed by the flush state machine,
+ * or false if the caller should continue to process it.
*/
-void blk_insert_flush(struct request *rq)
+bool blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
unsigned long fflags = q->queue_flags; /* may change, cache */
unsigned int policy = blk_flush_policy(fflags, rq);
struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
- struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+ /* FLUSH/FUA request must never be merged */
+ WARN_ON_ONCE(rq->bio != rq->biotail);
/*
* @policy now records what operations need to be done. Adjust
@@ -408,45 +420,45 @@ void blk_insert_flush(struct request *rq)
*/
rq->cmd_flags |= REQ_SYNC;
- /*
- * An empty flush handed down from a stacking driver may
- * translate into nothing if the underlying device does not
- * advertise a write-back cache. In this case, simply
- * complete the request.
- */
- if (!policy) {
+ switch (policy) {
+ case 0:
+ /*
+ * An empty flush handed down from a stacking driver may
+ * translate into nothing if the underlying device does not
+ * advertise a write-back cache. In this case, simply
+ * complete the request.
+ */
blk_mq_end_request(rq, 0);
- return;
- }
-
- BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
-
- /*
- * If there's data but flush is not necessary, the request can be
- * processed directly without going through flush machinery. Queue
- * for normal execution.
- */
- if ((policy & REQ_FSEQ_DATA) &&
- !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
- blk_mq_request_bypass_insert(rq, 0);
- blk_mq_run_hw_queue(hctx, false);
- return;
+ return true;
+ case REQ_FSEQ_DATA:
+ /*
+ * If there's data, but no flush is necessary, the request can
+ * be processed directly without going through flush machinery.
+ * Queue for normal execution.
+ */
+ return false;
+ case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH:
+ /*
+ * Initialize the flush fields and completion handler to trigger
+ * the post flush, and then just pass the command on.
+ */
+ blk_rq_init_flush(rq);
+ rq->flush.seq |= REQ_FSEQ_POSTFLUSH;
+ spin_lock_irq(&fq->mq_flush_lock);
+ list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
+ spin_unlock_irq(&fq->mq_flush_lock);
+ return false;
+ default:
+ /*
+ * Mark the request as part of a flush sequence and submit it
+ * for further processing to the flush state machine.
+ */
+ blk_rq_init_flush(rq);
+ spin_lock_irq(&fq->mq_flush_lock);
+ blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
+ spin_unlock_irq(&fq->mq_flush_lock);
+ return true;
}
-
- /*
- * @rq should go through flush machinery. Mark it part of flush
- * sequence and submit for further processing.
- */
- memset(&rq->flush, 0, sizeof(rq->flush));
- INIT_LIST_HEAD(&rq->flush.list);
- rq->rq_flags |= RQF_FLUSH_SEQ;
- rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
-
- rq->end_io = mq_flush_data_end_io;
-
- spin_lock_irq(&fq->mq_flush_lock);
- blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
- spin_unlock_irq(&fq->mq_flush_lock);
}
/**
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index d23a8554ec4a..68165a50951b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -88,6 +88,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(IO_STAT),
QUEUE_FLAG_NAME(NOXMERGES),
QUEUE_FLAG_NAME(ADD_RANDOM),
+ QUEUE_FLAG_NAME(SYNCHRONOUS),
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
QUEUE_FLAG_NAME(STABLE_WRITES),
@@ -103,6 +104,8 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
QUEUE_FLAG_NAME(NOWAIT),
+ QUEUE_FLAG_NAME(SQ_SCHED),
+ QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE),
};
#undef QUEUE_FLAG_NAME
@@ -241,14 +244,14 @@ static const char *const cmd_flag_name[] = {
#define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name
static const char *const rqf_name[] = {
RQF_NAME(STARTED),
- RQF_NAME(SOFTBARRIER),
RQF_NAME(FLUSH_SEQ),
RQF_NAME(MIXED_MERGE),
RQF_NAME(MQ_INFLIGHT),
RQF_NAME(DONTPREP),
+ RQF_NAME(SCHED_TAGS),
+ RQF_NAME(USE_SCHED),
RQF_NAME(FAILED),
RQF_NAME(QUIET),
- RQF_NAME(ELVPRIV),
RQF_NAME(IO_STAT),
RQF_NAME(PM),
RQF_NAME(HASHED),
@@ -256,7 +259,6 @@ static const char *const rqf_name[] = {
RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(TIMED_OUT),
- RQF_NAME(ELV),
RQF_NAME(RESV),
};
#undef RQF_NAME
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 7c3cbad17f30..1326526bb733 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -37,7 +37,7 @@ static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (rq->rq_flags & RQF_ELV) {
+ if (rq->rq_flags & RQF_USE_SCHED) {
struct elevator_queue *e = q->elevator;
if (e->type->ops.allow_merge)
@@ -48,7 +48,7 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{
- if (rq->rq_flags & RQF_ELV) {
+ if (rq->rq_flags & RQF_USE_SCHED) {
struct elevator_queue *e = rq->q->elevator;
if (e->type->ops.completed_request)
@@ -58,11 +58,11 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
- if (rq->rq_flags & RQF_ELV) {
+ if (rq->rq_flags & RQF_USE_SCHED) {
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
- if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request)
+ if (e->type->ops.requeue_request)
e->type->ops.requeue_request(rq);
}
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f6dad0886a2f..1749f5890606 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -45,6 +45,8 @@
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
+static void blk_mq_request_bypass_insert(struct request *rq,
+ blk_insert_t flags);
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list);
@@ -354,12 +356,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
data->rq_flags |= RQF_IO_STAT;
rq->rq_flags = data->rq_flags;
- if (!(data->rq_flags & RQF_ELV)) {
- rq->tag = tag;
- rq->internal_tag = BLK_MQ_NO_TAG;
- } else {
+ if (data->rq_flags & RQF_SCHED_TAGS) {
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
+ } else {
+ rq->tag = tag;
+ rq->internal_tag = BLK_MQ_NO_TAG;
}
rq->timeout = 0;
@@ -386,17 +388,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
WRITE_ONCE(rq->deadline, 0);
req_ref_set(rq, 1);
- if (rq->rq_flags & RQF_ELV) {
+ if (rq->rq_flags & RQF_USE_SCHED) {
struct elevator_queue *e = data->q->elevator;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
- if (!op_is_flush(data->cmd_flags) &&
- e->type->ops.prepare_request) {
+ if (e->type->ops.prepare_request)
e->type->ops.prepare_request(rq);
- rq->rq_flags |= RQF_ELVPRIV;
- }
}
return rq;
@@ -449,26 +448,32 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
data->flags |= BLK_MQ_REQ_NOWAIT;
if (q->elevator) {
- struct elevator_queue *e = q->elevator;
-
- data->rq_flags |= RQF_ELV;
+ /*
+ * All requests use scheduler tags when an I/O scheduler is
+ * enabled for the queue.
+ */
+ data->rq_flags |= RQF_SCHED_TAGS;
/*
* Flush/passthrough requests are special and go directly to the
- * dispatch list. Don't include reserved tags in the
- * limiting, as it isn't useful.
+ * dispatch list.
*/
- if (!op_is_flush(data->cmd_flags) &&
- !blk_op_is_passthrough(data->cmd_flags) &&
- e->type->ops.limit_depth &&
- !(data->flags & BLK_MQ_REQ_RESERVED))
- e->type->ops.limit_depth(data->cmd_flags, data);
+ if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
+ !blk_op_is_passthrough(data->cmd_flags)) {
+ struct elevator_mq_ops *ops = &q->elevator->type->ops;
+
+ WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);
+
+ data->rq_flags |= RQF_USE_SCHED;
+ if (ops->limit_depth)
+ ops->limit_depth(data->cmd_flags, data);
+ }
}
retry:
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
- if (!(data->rq_flags & RQF_ELV))
+ if (!(data->rq_flags & RQF_SCHED_TAGS))
blk_mq_tag_busy(data->hctx);
if (data->flags & BLK_MQ_REQ_RESERVED)
@@ -648,10 +653,10 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
goto out_queue_exit;
data.ctx = __blk_mq_get_ctx(q, cpu);
- if (!q->elevator)
- blk_mq_tag_busy(data.hctx);
+ if (q->elevator)
+ data.rq_flags |= RQF_SCHED_TAGS;
else
- data.rq_flags |= RQF_ELV;
+ blk_mq_tag_busy(data.hctx);
if (flags & BLK_MQ_REQ_RESERVED)
data.rq_flags |= RQF_RESV;
@@ -696,7 +701,7 @@ void blk_mq_free_request(struct request *rq)
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
- if ((rq->rq_flags & RQF_ELVPRIV) &&
+ if ((rq->rq_flags & RQF_USE_SCHED) &&
q->elevator->type->ops.finish_request)
q->elevator->type->ops.finish_request(rq);
@@ -957,6 +962,8 @@ EXPORT_SYMBOL_GPL(blk_update_request);
static inline void blk_account_io_done(struct request *req, u64 now)
{
+ trace_block_io_done(req);
+
/*
* Account IO completion. flush_rq isn't accounted as a
* normal IO on queueing nor completion. Accounting the
@@ -976,6 +983,8 @@ static inline void blk_account_io_done(struct request *req, u64 now)
static inline void blk_account_io_start(struct request *req)
{
+ trace_block_io_start(req);
+
if (blk_do_io_stat(req)) {
/*
* All non-passthrough requests are created from a bio with one
@@ -1270,7 +1279,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
if (!plug->multiple_queues && last && last->q != rq->q)
plug->multiple_queues = true;
- if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
+ if (!plug->has_elevator && (rq->rq_flags & RQF_USE_SCHED))
plug->has_elevator = true;
rq->rq_next = NULL;
rq_list_add(&plug->mq_list, rq);
@@ -1411,13 +1420,16 @@ static void __blk_mq_requeue_request(struct request *rq)
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
struct request_queue *q = rq->q;
+ unsigned long flags;
__blk_mq_requeue_request(rq);
/* this request will be re-inserted to io scheduler queue */
blk_mq_sched_requeue_request(rq);
- blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD);
+ spin_lock_irqsave(&q->requeue_lock, flags);
+ list_add_tail(&rq->queuelist, &q->requeue_list);
+ spin_unlock_irqrestore(&q->requeue_lock, flags);
if (kick_requeue_list)
blk_mq_kick_requeue_list(q);
@@ -1429,13 +1441,16 @@ static void blk_mq_requeue_work(struct work_struct *work)
struct request_queue *q =
container_of(work, struct request_queue, requeue_work.work);
LIST_HEAD(rq_list);
- struct request *rq, *next;
+ LIST_HEAD(flush_list);
+ struct request *rq;
spin_lock_irq(&q->requeue_lock);
list_splice_init(&q->requeue_list, &rq_list);
+ list_splice_init(&q->flush_list, &flush_list);
spin_unlock_irq(&q->requeue_lock);
- list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
+ while (!list_empty(&rq_list)) {
+ rq = list_entry(rq_list.next, struct request, queuelist);
/*
* If RQF_DONTPREP ist set, the request has been started by the
* driver already and might have driver-specific data allocated
@@ -1443,18 +1458,16 @@ static void blk_mq_requeue_work(struct work_struct *work)
* block layer merges for the request.
*/
if (rq->rq_flags & RQF_DONTPREP) {
- rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
blk_mq_request_bypass_insert(rq, 0);
- } else if (rq->rq_flags & RQF_SOFTBARRIER) {
- rq->rq_flags &= ~RQF_SOFTBARRIER;
+ } else {
list_del_init(&rq->queuelist);
blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
}
}
- while (!list_empty(&rq_list)) {
- rq = list_entry(rq_list.next, struct request, queuelist);
+ while (!list_empty(&flush_list)) {
+ rq = list_entry(flush_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
blk_mq_insert_request(rq, 0);
}
@@ -1462,27 +1475,6 @@ static void blk_mq_requeue_work(struct work_struct *work)
blk_mq_run_hw_queues(q, false);
}
-void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags)
-{
- struct request_queue *q = rq->q;
- unsigned long flags;
-
- /*
- * We abuse this flag that is otherwise used by the I/O scheduler to
- * request head insertion from the workqueue.
- */
- BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
-
- spin_lock_irqsave(&q->requeue_lock, flags);
- if (insert_flags & BLK_MQ_INSERT_AT_HEAD) {
- rq->rq_flags |= RQF_SOFTBARRIER;
- list_add(&rq->queuelist, &q->requeue_list);
- } else {
- list_add_tail(&rq->queuelist, &q->requeue_list);
- }
- spin_unlock_irqrestore(&q->requeue_lock, flags);
-}
-
void blk_mq_kick_requeue_list(struct request_queue *q)
{
kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
@@ -2427,7 +2419,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
* Should only be used carefully, when the caller knows we want to
* bypass a potential IO scheduler on the target device.
*/
-void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
+static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
@@ -2492,7 +2484,7 @@ static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
* dispatch it given we prioritize requests in hctx->dispatch.
*/
blk_mq_request_bypass_insert(rq, flags);
- } else if (rq->rq_flags & RQF_FLUSH_SEQ) {
+ } else if (req_op(rq) == REQ_OP_FLUSH) {
/*
* Firstly normal IO request is inserted to scheduler queue or
* sw queue, meantime we add flush request to dispatch queue(
@@ -2622,7 +2614,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
return;
}
- if ((rq->rq_flags & RQF_ELV) || !blk_mq_get_budget_and_tag(rq)) {
+ if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) {
blk_mq_insert_request(rq, 0);
blk_mq_run_hw_queue(hctx, false);
return;
@@ -2711,6 +2703,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
struct request *requeue_list = NULL;
struct request **requeue_lastp = &requeue_list;
unsigned int depth = 0;
+ bool is_passthrough = false;
LIST_HEAD(list);
do {
@@ -2719,7 +2712,9 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
if (!this_hctx) {
this_hctx = rq->mq_hctx;
this_ctx = rq->mq_ctx;
- } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
+ is_passthrough = blk_rq_is_passthrough(rq);
+ } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
+ is_passthrough != blk_rq_is_passthrough(rq)) {
rq_list_add_tail(&requeue_lastp, rq);
continue;
}
@@ -2731,7 +2726,8 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
trace_block_unplug(this_hctx->queue, depth, !from_sched);
percpu_ref_get(&this_hctx->queue->q_usage_counter);
- if (this_hctx->queue->elevator) {
+ /* passthrough requests should never be issued to the I/O scheduler */
+ if (this_hctx->queue->elevator && !is_passthrough) {
this_hctx->queue->elevator->type->ops.insert_requests(this_hctx,
&list, 0);
blk_mq_run_hw_queue(this_hctx, from_sched);
@@ -2970,10 +2966,8 @@ void blk_mq_submit_bio(struct bio *bio)
return;
}
- if (op_is_flush(bio->bi_opf)) {
- blk_insert_flush(rq);
+ if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
return;
- }
if (plug) {
blk_add_rq_to_plug(plug, rq);
@@ -2981,7 +2975,7 @@ void blk_mq_submit_bio(struct bio *bio)
}
hctx = rq->mq_hctx;
- if ((rq->rq_flags & RQF_ELV) ||
+ if ((rq->rq_flags & RQF_USE_SCHED) ||
(hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) {
blk_mq_insert_request(rq, 0);
blk_mq_run_hw_queue(hctx, true);
@@ -4232,6 +4226,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
blk_mq_update_poll_flag(q);
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
+ INIT_LIST_HEAD(&q->flush_list);
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e876584d3516..8c642e9f32f1 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -47,7 +47,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
unsigned int);
-void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
@@ -64,10 +63,6 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags,
unsigned int hctx_idx);
-/*
- * Internal helpers for request insertion into sw queues
- */
-void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags);
/*
* CPU -> queue mappings
@@ -226,9 +221,9 @@ static inline bool blk_mq_is_shared_tags(unsigned int flags)
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
- if (!(data->rq_flags & RQF_ELV))
- return data->hctx->tags;
- return data->hctx->sched_tags;
+ if (data->rq_flags & RQF_SCHED_TAGS)
+ return data->hctx->sched_tags;
+ return data->hctx->tags;
}
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index d8cc820a365e..167be74df4ee 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -288,11 +288,13 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
void rq_qos_exit(struct request_queue *q)
{
+ mutex_lock(&q->rq_qos_mutex);
while (q->rq_qos) {
struct rq_qos *rqos = q->rq_qos;
q->rq_qos = rqos->next;
rqos->ops->exit(rqos);
}
+ mutex_unlock(&q->rq_qos_mutex);
}
int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
@@ -300,6 +302,8 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
{
struct request_queue *q = disk->queue;
+ lockdep_assert_held(&q->rq_qos_mutex);
+
rqos->disk = disk;
rqos->id = id;
rqos->ops = ops;
@@ -307,18 +311,13 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
/*
* No IO can be in-flight when adding rqos, so freeze queue, which
* is fine since we only support rq_qos for blk-mq queue.
- *
- * Reuse ->queue_lock for protecting against other concurrent
- * rq_qos adding/deleting
*/
blk_mq_freeze_queue(q);
- spin_lock_irq(&q->queue_lock);
if (rq_qos_id(q, rqos->id))
goto ebusy;
rqos->next = q->rq_qos;
q->rq_qos = rqos;
- spin_unlock_irq(&q->queue_lock);
blk_mq_unfreeze_queue(q);
@@ -330,7 +329,6 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
return 0;
ebusy:
- spin_unlock_irq(&q->queue_lock);
blk_mq_unfreeze_queue(q);
return -EBUSY;
}
@@ -340,21 +338,15 @@ void rq_qos_del(struct rq_qos *rqos)
struct request_queue *q = rqos->disk->queue;
struct rq_qos **cur;
- /*
- * See comment in rq_qos_add() about freezing queue & using
- * ->queue_lock.
- */
- blk_mq_freeze_queue(q);
+ lockdep_assert_held(&q->rq_qos_mutex);
- spin_lock_irq(&q->queue_lock);
+ blk_mq_freeze_queue(q);
for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
if (*cur == rqos) {
*cur = rqos->next;
break;
}
}
- spin_unlock_irq(&q->queue_lock);
-
blk_mq_unfreeze_queue(q);
mutex_lock(&q->debugfs_mutex);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index e49a48684532..53bf5aa6f9ad 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -942,7 +942,9 @@ int wbt_init(struct gendisk *disk)
/*
* Assign rwb and add the stats callback.
*/
+ mutex_lock(&q->rq_qos_mutex);
ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
+ mutex_unlock(&q->rq_qos_mutex);
if (ret)
goto err_free;
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index fce9082384d6..096b6b47561f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -57,16 +57,10 @@ EXPORT_SYMBOL_GPL(blk_zone_cond_str);
*/
bool blk_req_needs_zone_write_lock(struct request *rq)
{
- if (blk_rq_is_passthrough(rq))
- return false;
-
if (!rq->q->disk->seq_zones_wlock)
return false;
- if (bdev_op_is_zoned_write(rq->q->disk->part0, req_op(rq)))
- return blk_rq_zone_is_seq(rq);
-
- return false;
+ return blk_rq_is_seq_zoned_write(rq);
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
diff --git a/block/blk.h b/block/blk.h
index 45547bcf1119..9f171b8f1e34 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -269,7 +269,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
*/
#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)
-void blk_insert_flush(struct request *rq);
+bool blk_insert_flush(struct request *rq);
int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
void elevator_disable(struct request_queue *q);
diff --git a/block/fops.c b/block/fops.c
index 6c9aa028af6e..b12c4b2a3a69 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -520,7 +520,6 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct block_device *bdev = iocb->ki_filp->private_data;
struct inode *bd_inode = bdev->bd_inode;
loff_t size = bdev_nr_bytes(bdev);
- struct blk_plug plug;
size_t shorted = 0;
ssize_t ret;
@@ -545,12 +544,10 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
iov_iter_truncate(from, size);
}
- blk_start_plug(&plug);
ret = __generic_file_write_iter(iocb, from);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
iov_iter_reexpand(from, iov_iter_count(from) + shorted);
- blk_finish_plug(&plug);
return ret;
}
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 5839a027e0f0..6aa5daf7ae32 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -74,8 +74,8 @@ struct dd_per_prio {
struct list_head dispatch;
struct rb_root sort_list[DD_DIR_COUNT];
struct list_head fifo_list[DD_DIR_COUNT];
- /* Next request in FIFO order. Read, write or both are NULL. */
- struct request *next_rq[DD_DIR_COUNT];
+ /* Position of the most recently dispatched request. */
+ sector_t latest_pos[DD_DIR_COUNT];
struct io_stats_per_prio stats;
};
@@ -156,6 +156,40 @@ deadline_latter_request(struct request *rq)
return NULL;
}
+/*
+ * Return the first request for which blk_rq_pos() >= @pos. For zoned devices,
+ * return the first request after the start of the zone containing @pos.
+ */
+static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir, sector_t pos)
+{
+ struct rb_node *node = per_prio->sort_list[data_dir].rb_node;
+ struct request *rq, *res = NULL;
+
+ if (!node)
+ return NULL;
+
+ rq = rb_entry_rq(node);
+ /*
+ * A zoned write may have been requeued with a starting position that
+ * is below that of the most recently dispatched request. Hence, for
+ * zoned writes, start searching from the start of a zone.
+ */
+ if (blk_rq_is_seq_zoned_write(rq))
+ pos -= round_down(pos, rq->q->limits.chunk_sectors);
+
+ while (node) {
+ rq = rb_entry_rq(node);
+ if (blk_rq_pos(rq) >= pos) {
+ res = rq;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+ return res;
+}
+
static void
deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
@@ -167,11 +201,6 @@ deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
static inline void
deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
- const enum dd_data_dir data_dir = rq_data_dir(rq);
-
- if (per_prio->next_rq[data_dir] == rq)
- per_prio->next_rq[data_dir] = deadline_latter_request(rq);
-
elv_rb_del(deadline_rb_root(per_prio, rq), rq);
}
@@ -251,10 +280,6 @@ static void
deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
struct request *rq)
{
- const enum dd_data_dir data_dir = rq_data_dir(rq);
-
- per_prio->next_rq[data_dir] = deadline_latter_request(rq);
-
/*
* take it off the sort and fifo list
*/
@@ -272,21 +297,15 @@ static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
}
/*
- * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
- * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
+ * deadline_check_fifo returns true if and only if there are expired requests
+ * in the FIFO list. Requires !list_empty(&dd->fifo_list[data_dir]).
*/
-static inline int deadline_check_fifo(struct dd_per_prio *per_prio,
- enum dd_data_dir data_dir)
+static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir)
{
struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
- /*
- * rq is expired!
- */
- if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
- return 1;
-
- return 0;
+ return time_is_before_eq_jiffies((unsigned long)rq->fifo_time);
}
/*
@@ -310,14 +329,11 @@ static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
struct request *rq)
{
sector_t pos = blk_rq_pos(rq);
- sector_t skipped_sectors = 0;
- while (rq) {
- if (blk_rq_pos(rq) != pos + skipped_sectors)
- break;
- skipped_sectors += blk_rq_sectors(rq);
+ do {
+ pos += blk_rq_sectors(rq);
rq = deadline_latter_request(rq);
- }
+ } while (rq && blk_rq_pos(rq) == pos);
return rq;
}
@@ -330,7 +346,7 @@ static struct request *
deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
- struct request *rq;
+ struct request *rq, *rb_rq, *next;
unsigned long flags;
if (list_empty(&per_prio->fifo_list[data_dir]))
@@ -348,7 +364,12 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
* zones and these zones are unlocked.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
- list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
+ list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE],
+ queuelist) {
+ /* Check whether a prior request exists for the same zone. */
+ rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq));
+ if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq))
+ rq = rb_rq;
if (blk_req_can_dispatch_to_zone(rq) &&
(blk_queue_nonrot(rq->q) ||
!deadline_is_seq_write(dd, rq)))
@@ -372,7 +393,8 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
struct request *rq;
unsigned long flags;
- rq = per_prio->next_rq[data_dir];
+ rq = deadline_from_pos(per_prio, data_dir,
+ per_prio->latest_pos[data_dir]);
if (!rq)
return NULL;
@@ -435,6 +457,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
if (started_after(dd, rq, latest_start))
return NULL;
list_del_init(&rq->queuelist);
+ data_dir = rq_data_dir(rq);
goto done;
}
@@ -442,9 +465,11 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
* batches are currently reads XOR writes
*/
rq = deadline_next_request(dd, per_prio, dd->last_dir);
- if (rq && dd->batching < dd->fifo_batch)
- /* we have a next request are still entitled to batch */
+ if (rq && dd->batching < dd->fifo_batch) {
+ /* we have a next request and are still entitled to batch */
+ data_dir = rq_data_dir(rq);
goto dispatch_request;
+ }
/*
* at this point we are not running a batch. select the appropriate
@@ -522,6 +547,7 @@ dispatch_request:
done:
ioprio_class = dd_rq_ioclass(rq);
prio = ioprio_class_to_prio[ioprio_class];
+ dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
dd->per_prio[prio].stats.dispatched++;
/*
* If the request needs its target zone locked, do it.
@@ -766,7 +792,7 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
* add rq to rbtree and fifo
*/
static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
- blk_insert_t flags)
+ blk_insert_t flags, struct list_head *free)
{
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
@@ -775,7 +801,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
struct dd_per_prio *per_prio;
enum dd_prio prio;
- LIST_HEAD(free);
lockdep_assert_held(&dd->lock);
@@ -792,10 +817,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
rq->elv.priv[0] = (void *)(uintptr_t)1;
}
- if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
- blk_mq_free_requests(&free);
+ if (blk_mq_sched_try_insert_merge(q, rq, free))
return;
- }
trace_block_rq_insert(rq);
@@ -803,6 +826,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
list_add(&rq->queuelist, &per_prio->dispatch);
rq->fifo_time = jiffies;
} else {
+ struct list_head *insert_before;
+
deadline_add_rq_rb(per_prio, rq);
if (rq_mergeable(rq)) {
@@ -815,7 +840,20 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
* set expire time and add to fifo list
*/
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
- list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
+ insert_before = &per_prio->fifo_list[data_dir];
+#ifdef CONFIG_BLK_DEV_ZONED
+ /*
+ * Insert zoned writes such that requests are sorted by
+ * position per zone.
+ */
+ if (blk_rq_is_seq_zoned_write(rq)) {
+ struct request *rq2 = deadline_latter_request(rq);
+
+ if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq))
+ insert_before = &rq2->queuelist;
+ }
+#endif
+ list_add_tail(&rq->queuelist, insert_before);
}
}
@@ -828,6 +866,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
{
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
+ LIST_HEAD(free);
spin_lock(&dd->lock);
while (!list_empty(list)) {
@@ -835,9 +874,11 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
- dd_insert_request(hctx, rq, flags);
+ dd_insert_request(hctx, rq, flags, &free);
}
spin_unlock(&dd->lock);
+
+ blk_mq_free_requests(&free);
}
/* Callback from inside blk_mq_rq_ctx_init(). */
@@ -1035,8 +1076,10 @@ static int deadline_##name##_next_rq_show(void *data, \
struct request_queue *q = data; \
struct deadline_data *dd = q->elevator->elevator_data; \
struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
- struct request *rq = per_prio->next_rq[data_dir]; \
+ struct request *rq; \
\
+ rq = deadline_from_pos(per_prio, data_dir, \
+ per_prio->latest_pos[data_dir]); \
if (rq) \
__blk_mq_debugfs_rq_show(m, rq); \
return 0; \
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index bcad9b926b0c..2f71376afc71 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -19,7 +19,7 @@
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/pagemap.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
@@ -28,7 +28,7 @@
#include <linux/uaccess.h>
/*
- * Each block ramdisk device has a radix_tree brd_pages of pages that stores
+ * Each block ramdisk device has a xarray brd_pages of pages that stores
* the pages containing the block device's contents. A brd page's ->index is
* its offset in PAGE_SIZE units. This is similar to, but in no way connected
* with, the kernel's pagecache or buffer cache (which sit above our block
@@ -40,11 +40,9 @@ struct brd_device {
struct list_head brd_list;
/*
- * Backing store of pages and lock to protect it. This is the contents
- * of the block device.
+ * Backing store of pages. This is the contents of the block device.
*/
- spinlock_t brd_lock;
- struct radix_tree_root brd_pages;
+ struct xarray brd_pages;
u64 brd_nr_pages;
};
@@ -56,21 +54,8 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
pgoff_t idx;
struct page *page;
- /*
- * The page lifetime is protected by the fact that we have opened the
- * device node -- brd pages will never be deleted under us, so we
- * don't need any further locking or refcounting.
- *
- * This is strictly true for the radix-tree nodes as well (ie. we
- * don't actually need the rcu_read_lock()), however that is not a
- * documented feature of the radix-tree API so it is better to be
- * safe here (we don't have total exclusion from radix tree updates
- * here, only deletes).
- */
- rcu_read_lock();
idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
- page = radix_tree_lookup(&brd->brd_pages, idx);
- rcu_read_unlock();
+ page = xa_load(&brd->brd_pages, idx);
BUG_ON(page && page->index != idx);
@@ -83,7 +68,7 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
{
pgoff_t idx;
- struct page *page;
+ struct page *page, *cur;
int ret = 0;
page = brd_lookup_page(brd, sector);
@@ -94,71 +79,42 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
if (!page)
return -ENOMEM;
- if (radix_tree_maybe_preload(gfp)) {
- __free_page(page);
- return -ENOMEM;
- }
+ xa_lock(&brd->brd_pages);
- spin_lock(&brd->brd_lock);
idx = sector >> PAGE_SECTORS_SHIFT;
page->index = idx;
- if (radix_tree_insert(&brd->brd_pages, idx, page)) {
+
+ cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp);
+
+ if (unlikely(cur)) {
__free_page(page);
- page = radix_tree_lookup(&brd->brd_pages, idx);
- if (!page)
- ret = -ENOMEM;
- else if (page->index != idx)
+ ret = xa_err(cur);
+ if (!ret && (cur->index != idx))
ret = -EIO;
} else {
brd->brd_nr_pages++;
}
- spin_unlock(&brd->brd_lock);
- radix_tree_preload_end();
+ xa_unlock(&brd->brd_pages);
+
return ret;
}
/*
- * Free all backing store pages and radix tree. This must only be called when
+ * Free all backing store pages and xarray. This must only be called when
* there are no other users of the device.
*/
-#define FREE_BATCH 16
static void brd_free_pages(struct brd_device *brd)
{
- unsigned long pos = 0;
- struct page *pages[FREE_BATCH];
- int nr_pages;
-
- do {
- int i;
-
- nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
- (void **)pages, pos, FREE_BATCH);
-
- for (i = 0; i < nr_pages; i++) {
- void *ret;
-
- BUG_ON(pages[i]->index < pos);
- pos = pages[i]->index;
- ret = radix_tree_delete(&brd->brd_pages, pos);
- BUG_ON(!ret || ret != pages[i]);
- __free_page(pages[i]);
- }
-
- pos++;
+ struct page *page;
+ pgoff_t idx;
- /*
- * It takes 3.4 seconds to remove 80GiB ramdisk.
- * So, we need cond_resched to avoid stalling the CPU.
- */
- cond_resched();
+ xa_for_each(&brd->brd_pages, idx, page) {
+ __free_page(page);
+ cond_resched_rcu();
+ }
- /*
- * This assumes radix_tree_gang_lookup always returns as
- * many pages as possible. If the radix-tree code changes,
- * so will this have to.
- */
- } while (nr_pages == FREE_BATCH);
+ xa_destroy(&brd->brd_pages);
}
/*
@@ -372,8 +328,7 @@ static int brd_alloc(int i)
brd->brd_number = i;
list_add_tail(&brd->brd_list, &brd_devices);
- spin_lock_init(&brd->brd_lock);
- INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
+ xa_init(&brd->brd_pages);
snprintf(buf, DISK_NAME_LEN, "ram%d", i);
if (!IS_ERR_OR_NULL(brd_debugfs_dir))
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c7ed5d69e9ee..539eada32861 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -43,6 +43,7 @@
#include <asm/page.h>
#include <linux/task_work.h>
#include <linux/namei.h>
+#include <linux/kref.h>
#include <uapi/linux/ublk_cmd.h>
#define UBLK_MINORS (1U << MINORBITS)
@@ -54,7 +55,8 @@
| UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
| UBLK_F_UNPRIVILEGED_DEV \
- | UBLK_F_CMD_IOCTL_ENCODE)
+ | UBLK_F_CMD_IOCTL_ENCODE \
+ | UBLK_F_USER_COPY)
/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
@@ -62,7 +64,8 @@
struct ublk_rq_data {
struct llist_node node;
- struct callback_head work;
+
+ struct kref ref;
};
struct ublk_uring_cmd_pdu {
@@ -182,6 +185,9 @@ struct ublk_params_header {
__u32 types;
};
+static inline void __ublk_complete_rq(struct request *req);
+static void ublk_complete_rq(struct kref *ref);
+
static dev_t ublk_chr_devt;
static struct class *ublk_chr_class;
@@ -202,6 +208,23 @@ static unsigned int ublks_added; /* protected by ublk_ctl_mutex */
static struct miscdevice ublk_misc;
+static inline unsigned ublk_pos_to_hwq(loff_t pos)
+{
+ return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
+ UBLK_QID_BITS_MASK;
+}
+
+static inline unsigned ublk_pos_to_buf_off(loff_t pos)
+{
+ return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
+}
+
+static inline unsigned ublk_pos_to_tag(loff_t pos)
+{
+ return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
+ UBLK_TAG_BITS_MASK;
+}
+
static void ublk_dev_param_basic_apply(struct ublk_device *ub)
{
struct request_queue *q = ub->ub_disk->queue;
@@ -290,12 +313,52 @@ static int ublk_apply_params(struct ublk_device *ub)
return 0;
}
-static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
+static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
- if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
- !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
- return true;
- return false;
+ return ubq->flags & UBLK_F_USER_COPY;
+}
+
+static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
+{
+ /*
+ * read()/write() is involved in user copy, so request reference
+ * has to be grabbed
+ */
+ return ublk_support_user_copy(ubq);
+}
+
+static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
+ struct request *req)
+{
+ if (ublk_need_req_ref(ubq)) {
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ kref_init(&data->ref);
+ }
+}
+
+static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
+ struct request *req)
+{
+ if (ublk_need_req_ref(ubq)) {
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ return kref_get_unless_zero(&data->ref);
+ }
+
+ return true;
+}
+
+static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
+ struct request *req)
+{
+ if (ublk_need_req_ref(ubq)) {
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ kref_put(&data->ref, ublk_complete_rq);
+ } else {
+ __ublk_complete_rq(req);
+ }
}
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
@@ -421,49 +484,39 @@ static const struct block_device_operations ub_fops = {
#define UBLK_MAX_PIN_PAGES 32
-struct ublk_map_data {
- const struct request *rq;
- unsigned long ubuf;
- unsigned int len;
-};
-
struct ublk_io_iter {
struct page *pages[UBLK_MAX_PIN_PAGES];
- unsigned pg_off; /* offset in the 1st page in pages */
- int nr_pages; /* how many page pointers in pages */
struct bio *bio;
struct bvec_iter iter;
};
-static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
- unsigned max_bytes, bool to_vm)
+/* return how many pages are copied */
+static void ublk_copy_io_pages(struct ublk_io_iter *data,
+ size_t total, size_t pg_off, int dir)
{
- const unsigned total = min_t(unsigned, max_bytes,
- PAGE_SIZE - data->pg_off +
- ((data->nr_pages - 1) << PAGE_SHIFT));
unsigned done = 0;
unsigned pg_idx = 0;
while (done < total) {
struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
- const unsigned int bytes = min3(bv.bv_len, total - done,
- (unsigned)(PAGE_SIZE - data->pg_off));
+ unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
+ (unsigned)(PAGE_SIZE - pg_off));
void *bv_buf = bvec_kmap_local(&bv);
void *pg_buf = kmap_local_page(data->pages[pg_idx]);
- if (to_vm)
- memcpy(pg_buf + data->pg_off, bv_buf, bytes);
+ if (dir == ITER_DEST)
+ memcpy(pg_buf + pg_off, bv_buf, bytes);
else
- memcpy(bv_buf, pg_buf + data->pg_off, bytes);
+ memcpy(bv_buf, pg_buf + pg_off, bytes);
kunmap_local(pg_buf);
kunmap_local(bv_buf);
/* advance page array */
- data->pg_off += bytes;
- if (data->pg_off == PAGE_SIZE) {
+ pg_off += bytes;
+ if (pg_off == PAGE_SIZE) {
pg_idx += 1;
- data->pg_off = 0;
+ pg_off = 0;
}
done += bytes;
@@ -477,41 +530,58 @@ static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
data->iter = data->bio->bi_iter;
}
}
+}
- return done;
+static bool ublk_advance_io_iter(const struct request *req,
+ struct ublk_io_iter *iter, unsigned int offset)
+{
+ struct bio *bio = req->bio;
+
+ for_each_bio(bio) {
+ if (bio->bi_iter.bi_size > offset) {
+ iter->bio = bio;
+ iter->iter = bio->bi_iter;
+ bio_advance_iter(iter->bio, &iter->iter, offset);
+ return true;
+ }
+ offset -= bio->bi_iter.bi_size;
+ }
+ return false;
}
-static int ublk_copy_user_pages(struct ublk_map_data *data, bool to_vm)
+/*
+ * Copy data between request pages and io_iter, and 'offset'
+ * is the start point of linear offset of request.
+ */
+static size_t ublk_copy_user_pages(const struct request *req,
+ unsigned offset, struct iov_iter *uiter, int dir)
{
- const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0;
- const unsigned long start_vm = data->ubuf;
- unsigned int done = 0;
- struct ublk_io_iter iter = {
- .pg_off = start_vm & (PAGE_SIZE - 1),
- .bio = data->rq->bio,
- .iter = data->rq->bio->bi_iter,
- };
- const unsigned int nr_pages = round_up(data->len +
- (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT;
-
- while (done < nr_pages) {
- const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES,
- nr_pages - done);
- unsigned i, len;
-
- iter.nr_pages = get_user_pages_fast(start_vm +
- (done << PAGE_SHIFT), to_pin, gup_flags,
- iter.pages);
- if (iter.nr_pages <= 0)
- return done == 0 ? iter.nr_pages : done;
- len = ublk_copy_io_pages(&iter, data->len, to_vm);
- for (i = 0; i < iter.nr_pages; i++) {
- if (to_vm)
+ struct ublk_io_iter iter;
+ size_t done = 0;
+
+ if (!ublk_advance_io_iter(req, &iter, offset))
+ return 0;
+
+ while (iov_iter_count(uiter) && iter.bio) {
+ unsigned nr_pages;
+ ssize_t len;
+ size_t off;
+ int i;
+
+ len = iov_iter_get_pages2(uiter, iter.pages,
+ iov_iter_count(uiter),
+ UBLK_MAX_PIN_PAGES, &off);
+ if (len <= 0)
+ return done;
+
+ ublk_copy_io_pages(&iter, len, off, dir);
+ nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
+ for (i = 0; i < nr_pages; i++) {
+ if (dir == ITER_DEST)
set_page_dirty(iter.pages[i]);
put_page(iter.pages[i]);
}
- data->len -= len;
- done += iter.nr_pages;
+ done += len;
}
return done;
@@ -532,21 +602,23 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
{
const unsigned int rq_bytes = blk_rq_bytes(req);
+ if (ublk_support_user_copy(ubq))
+ return rq_bytes;
+
/*
* no zero copy, we delay copy WRITE request data into ublksrv
* context and the big benefit is that pinning pages in current
* context is pretty fast, see ublk_pin_user_pages
*/
if (ublk_need_map_req(req)) {
- struct ublk_map_data data = {
- .rq = req,
- .ubuf = io->addr,
- .len = rq_bytes,
- };
+ struct iov_iter iter;
+ struct iovec iov;
+ const int dir = ITER_DEST;
- ublk_copy_user_pages(&data, true);
+ import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes,
+ &iov, &iter);
- return rq_bytes - data.len;
+ return ublk_copy_user_pages(req, 0, &iter, dir);
}
return rq_bytes;
}
@@ -557,18 +629,19 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
{
const unsigned int rq_bytes = blk_rq_bytes(req);
+ if (ublk_support_user_copy(ubq))
+ return rq_bytes;
+
if (ublk_need_unmap_req(req)) {
- struct ublk_map_data data = {
- .rq = req,
- .ubuf = io->addr,
- .len = io->res,
- };
+ struct iov_iter iter;
+ struct iovec iov;
+ const int dir = ITER_SOURCE;
WARN_ON_ONCE(io->res > rq_bytes);
- ublk_copy_user_pages(&data, false);
-
- return io->res - data.len;
+ import_single_range(dir, u64_to_user_ptr(io->addr), io->res,
+ &iov, &iter);
+ return ublk_copy_user_pages(req, 0, &iter, dir);
}
return rq_bytes;
}
@@ -648,13 +721,19 @@ static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
}
/* todo: handle partial completion */
-static void ublk_complete_rq(struct request *req)
+static inline void __ublk_complete_rq(struct request *req)
{
struct ublk_queue *ubq = req->mq_hctx->driver_data;
struct ublk_io *io = &ubq->ios[req->tag];
unsigned int unmapped_bytes;
blk_status_t res = BLK_STS_OK;
+ /* called from ublk_abort_queue() code path */
+ if (io->flags & UBLK_IO_FLAG_ABORTED) {
+ res = BLK_STS_IOERR;
+ goto exit;
+ }
+
/* failed read IO if nothing is read */
if (!io->res && req_op(req) == REQ_OP_READ)
io->res = -EIO;
@@ -694,6 +773,15 @@ exit:
blk_mq_end_request(req, res);
}
+static void ublk_complete_rq(struct kref *ref)
+{
+ struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
+ ref);
+ struct request *req = blk_mq_rq_from_pdu(data);
+
+ __ublk_complete_rq(req);
+}
+
/*
* Since __ublk_rq_task_work always fails requests immediately during
* exiting, __ublk_fail_req() is only called from abort context during
@@ -712,7 +800,7 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
if (ublk_queue_can_use_recovery_reissue(ubq))
blk_mq_requeue_request(req, false);
else
- blk_mq_end_request(req, BLK_STS_IOERR);
+ ublk_put_req_ref(ubq, req);
}
}
@@ -821,6 +909,7 @@ static inline void __ublk_rq_task_work(struct request *req,
mapped_bytes >> 9;
}
+ ublk_init_req_ref(ubq, req);
ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
}
@@ -852,17 +941,6 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
ublk_forward_io_cmds(ubq, issue_flags);
}
-static void ublk_rq_task_work_fn(struct callback_head *work)
-{
- struct ublk_rq_data *data = container_of(work,
- struct ublk_rq_data, work);
- struct request *req = blk_mq_rq_from_pdu(data);
- struct ublk_queue *ubq = req->mq_hctx->driver_data;
- unsigned issue_flags = IO_URING_F_UNLOCKED;
-
- ublk_forward_io_cmds(ubq, issue_flags);
-}
-
static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
{
struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
@@ -886,10 +964,6 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
*/
if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) {
ublk_abort_io_cmds(ubq);
- } else if (ublk_can_use_task_work(ubq)) {
- if (task_work_add(ubq->ubq_daemon, &data->work,
- TWA_SIGNAL_NO_IPI))
- ublk_abort_io_cmds(ubq);
} else {
struct io_uring_cmd *cmd = io->cmd;
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
@@ -961,19 +1035,9 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
return 0;
}
-static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
- unsigned int hctx_idx, unsigned int numa_node)
-{
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- init_task_work(&data->work, ublk_rq_task_work_fn);
- return 0;
-}
-
static const struct blk_mq_ops ublk_mq_ops = {
.queue_rq = ublk_queue_rq,
.init_hctx = ublk_init_hctx,
- .init_request = ublk_init_rq,
.timeout = ublk_timeout,
};
@@ -1050,7 +1114,7 @@ static void ublk_commit_completion(struct ublk_device *ub,
req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
if (req && likely(!blk_should_fake_timeout(req->q)))
- ublk_complete_rq(req);
+ ublk_put_req_ref(ubq, req);
}
/*
@@ -1290,6 +1354,14 @@ static inline int ublk_check_cmd_op(u32 cmd_op)
return 0;
}
+static inline void ublk_fill_io_cmd(struct ublk_io *io,
+ struct io_uring_cmd *cmd, unsigned long buf_addr)
+{
+ io->cmd = cmd;
+ io->flags |= UBLK_IO_FLAG_ACTIVE;
+ io->addr = buf_addr;
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -1335,6 +1407,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
goto out;
+ if (ublk_support_user_copy(ubq) && ub_cmd->addr) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = ublk_check_cmd_op(cmd_op);
if (ret)
goto out;
@@ -1353,36 +1430,41 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
*/
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
goto out;
- /* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */
- if (!ub_cmd->addr && !ublk_need_get_data(ubq))
- goto out;
- io->cmd = cmd;
- io->flags |= UBLK_IO_FLAG_ACTIVE;
- io->addr = ub_cmd->addr;
+ if (!ublk_support_user_copy(ubq)) {
+ /*
+ * FETCH_RQ has to provide IO buffer if NEED GET
+ * DATA is not enabled
+ */
+ if (!ub_cmd->addr && !ublk_need_get_data(ubq))
+ goto out;
+ }
+
+ ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_mark_io_ready(ub, ubq);
break;
case UBLK_IO_COMMIT_AND_FETCH_REQ:
req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
- /*
- * COMMIT_AND_FETCH_REQ has to provide IO buffer if NEED GET DATA is
- * not enabled or it is Read IO.
- */
- if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ))
- goto out;
+
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
- io->addr = ub_cmd->addr;
- io->flags |= UBLK_IO_FLAG_ACTIVE;
- io->cmd = cmd;
+
+ if (!ublk_support_user_copy(ubq)) {
+ /*
+ * COMMIT_AND_FETCH_REQ has to provide IO buffer if
+ * NEED GET DATA is not enabled or it is Read IO.
+ */
+ if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
+ req_op(req) == REQ_OP_READ))
+ goto out;
+ }
+ ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_commit_completion(ub, ub_cmd);
break;
case UBLK_IO_NEED_GET_DATA:
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
- io->addr = ub_cmd->addr;
- io->cmd = cmd;
- io->flags |= UBLK_IO_FLAG_ACTIVE;
+ ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
break;
default:
@@ -1397,6 +1479,36 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
return -EIOCBQUEUED;
}
+static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
+ struct ublk_queue *ubq, int tag, size_t offset)
+{
+ struct request *req;
+
+ if (!ublk_need_req_ref(ubq))
+ return NULL;
+
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+ if (!req)
+ return NULL;
+
+ if (!ublk_get_req_ref(ubq, req))
+ return NULL;
+
+ if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
+ goto fail_put;
+
+ if (!ublk_rq_has_data(req))
+ goto fail_put;
+
+ if (offset > blk_rq_bytes(req))
+ goto fail_put;
+
+ return req;
+fail_put:
+ ublk_put_req_ref(ubq, req);
+ return NULL;
+}
+
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
/*
@@ -1414,11 +1526,112 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
}
+static inline bool ublk_check_ubuf_dir(const struct request *req,
+ int ubuf_dir)
+{
+ /* copy ubuf to request pages */
+ if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
+ return true;
+
+ /* copy request pages to ubuf */
+ if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
+ return true;
+
+ return false;
+}
+
+static struct request *ublk_check_and_get_req(struct kiocb *iocb,
+ struct iov_iter *iter, size_t *off, int dir)
+{
+ struct ublk_device *ub = iocb->ki_filp->private_data;
+ struct ublk_queue *ubq;
+ struct request *req;
+ size_t buf_off;
+ u16 tag, q_id;
+
+ if (!ub)
+ return ERR_PTR(-EACCES);
+
+ if (!user_backed_iter(iter))
+ return ERR_PTR(-EACCES);
+
+ if (ub->dev_info.state == UBLK_S_DEV_DEAD)
+ return ERR_PTR(-EACCES);
+
+ tag = ublk_pos_to_tag(iocb->ki_pos);
+ q_id = ublk_pos_to_hwq(iocb->ki_pos);
+ buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
+
+ if (q_id >= ub->dev_info.nr_hw_queues)
+ return ERR_PTR(-EINVAL);
+
+ ubq = ublk_get_queue(ub, q_id);
+ if (!ubq)
+ return ERR_PTR(-EINVAL);
+
+ if (tag >= ubq->q_depth)
+ return ERR_PTR(-EINVAL);
+
+ req = __ublk_check_and_get_req(ub, ubq, tag, buf_off);
+ if (!req)
+ return ERR_PTR(-EINVAL);
+
+ if (!req->mq_hctx || !req->mq_hctx->driver_data)
+ goto fail;
+
+ if (!ublk_check_ubuf_dir(req, dir))
+ goto fail;
+
+ *off = buf_off;
+ return req;
+fail:
+ ublk_put_req_ref(ubq, req);
+ return ERR_PTR(-EACCES);
+}
+
+static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct ublk_queue *ubq;
+ struct request *req;
+ size_t buf_off;
+ size_t ret;
+
+ req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
+ ubq = req->mq_hctx->driver_data;
+ ublk_put_req_ref(ubq, req);
+
+ return ret;
+}
+
+static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct ublk_queue *ubq;
+ struct request *req;
+ size_t buf_off;
+ size_t ret;
+
+ req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
+ ubq = req->mq_hctx->driver_data;
+ ublk_put_req_ref(ubq, req);
+
+ return ret;
+}
+
static const struct file_operations ublk_ch_fops = {
.owner = THIS_MODULE,
.open = ublk_ch_open,
.release = ublk_ch_release,
.llseek = no_llseek,
+ .read_iter = ublk_ch_read_iter,
+ .write_iter = ublk_ch_write_iter,
.uring_cmd = ublk_ch_uring_cmd,
.mmap = ublk_ch_mmap,
};
@@ -1813,10 +2026,12 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
*/
ub->dev_info.flags &= UBLK_F_ALL;
- if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
- ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
+ ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
+ UBLK_F_URING_CMD_COMP_IN_TASK;
- ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE;
+ /* GET_DATA isn't needed any more with USER_COPY */
+ if (ub->dev_info.flags & UBLK_F_USER_COPY)
+ ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
/* We are not ready to support zero copy */
ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
@@ -2332,6 +2547,9 @@ static int __init ublk_init(void)
{
int ret;
+ BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
+ UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
+
init_waitqueue_head(&ublk_idr_wq);
ret = misc_register(&ublk_misc);
diff --git a/fs/Makefile b/fs/Makefile
index 834f1c3dba46..4709eba1303c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -17,14 +17,8 @@ obj-y := open.o read_write.o file_table.o super.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
kernel_read_file.o mnt_idmapping.o remap_range.o
-ifeq ($(CONFIG_BLOCK),y)
-obj-y += buffer.o mpage.o
-else
-obj-y += no-block.o
-endif
-
-obj-$(CONFIG_PROC_FS) += proc_namespace.o
-
+obj-$(CONFIG_BLOCK) += buffer.o mpage.o
+obj-$(CONFIG_PROC_FS) += proc_namespace.o
obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o
obj-y += notify/
obj-$(CONFIG_EPOLL) += eventpoll.o
diff --git a/fs/inode.c b/fs/inode.c
index 577799b7855f..4d6a1544e95b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2264,7 +2264,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
} else if (S_ISBLK(mode)) {
- inode->i_fop = &def_blk_fops;
+ if (IS_ENABLED(CONFIG_BLOCK))
+ inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
} else if (S_ISFIFO(mode))
inode->i_fop = &pipefifo_fops;
diff --git a/fs/no-block.c b/fs/no-block.c
deleted file mode 100644
index 481c0f0ab4bd..000000000000
--- a/fs/no-block.c
+++ /dev/null
@@ -1,19 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* no-block.c: implementation of routines required for non-BLOCK configuration
- *
- * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells ([email protected])
- */
-
-#include <linux/kernel.h>
-#include <linux/fs.h>
-
-static int no_blkdev_open(struct inode * inode, struct file * filp)
-{
- return -ENODEV;
-}
-
-const struct file_operations def_blk_fops = {
- .open = no_blkdev_open,
- .llseek = noop_llseek,
-};
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 06caacd77ed6..59b52ec155b1 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -28,8 +28,6 @@ typedef __u32 __bitwise req_flags_t;
/* drive already may have started this one */
#define RQF_STARTED ((__force req_flags_t)(1 << 1))
-/* may not be passed by ioscheduler */
-#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3))
/* request for flush sequence */
#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4))
/* merge of different types, fail separately */
@@ -38,12 +36,14 @@ typedef __u32 __bitwise req_flags_t;
#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6))
/* don't call prep for this one */
#define RQF_DONTPREP ((__force req_flags_t)(1 << 7))
+/* use hctx->sched_tags */
+#define RQF_SCHED_TAGS ((__force req_flags_t)(1 << 8))
+/* use an I/O scheduler for this request */
+#define RQF_USE_SCHED ((__force req_flags_t)(1 << 9))
/* vaguely specified driver internal error. Ignored by the block layer */
#define RQF_FAILED ((__force req_flags_t)(1 << 10))
/* don't warn about errors */
#define RQF_QUIET ((__force req_flags_t)(1 << 11))
-/* elevator private data attached */
-#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12))
/* account into disk and partition IO statistics */
#define RQF_IO_STAT ((__force req_flags_t)(1 << 13))
/* runtime pm request */
@@ -59,13 +59,11 @@ typedef __u32 __bitwise req_flags_t;
#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21))
-/* queue has elevator attached */
-#define RQF_ELV ((__force req_flags_t)(1 << 22))
-#define RQF_RESV ((__force req_flags_t)(1 << 23))
+#define RQF_RESV ((__force req_flags_t)(1 << 23))
/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
- (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
+ (RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
enum mq_rq_state {
MQ_RQ_IDLE = 0,
@@ -169,25 +167,20 @@ struct request {
void *completion_data;
};
-
/*
* Three pointers are available for the IO schedulers, if they need
- * more they have to dynamically allocate it. Flush requests are
- * never put on the IO scheduler. So let the flush fields share
- * space with the elevator data.
+ * more they have to dynamically allocate it.
*/
- union {
- struct {
- struct io_cq *icq;
- void *priv[2];
- } elv;
-
- struct {
- unsigned int seq;
- struct list_head list;
- rq_end_io_fn *saved_end_io;
- } flush;
- };
+ struct {
+ struct io_cq *icq;
+ void *priv[2];
+ } elv;
+
+ struct {
+ unsigned int seq;
+ struct list_head list;
+ rq_end_io_fn *saved_end_io;
+ } flush;
union {
struct __call_single_data csd;
@@ -208,7 +201,7 @@ static inline enum req_op req_op(const struct request *req)
static inline bool blk_rq_is_passthrough(struct request *rq)
{
- return blk_op_is_passthrough(req_op(rq));
+ return blk_op_is_passthrough(rq->cmd_flags);
}
static inline unsigned short req_get_ioprio(struct request *req)
@@ -844,7 +837,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *ib);
*/
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
- return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
+ return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
}
static inline bool blk_mq_is_reserved_rq(struct request *rq)
@@ -860,7 +853,7 @@ static inline bool blk_mq_add_to_batch(struct request *req,
struct io_comp_batch *iob, int ioerror,
void (*complete)(struct io_comp_batch *))
{
- if (!iob || (req->rq_flags & RQF_ELV) || ioerror ||
+ if (!iob || (req->rq_flags & RQF_USE_SCHED) || ioerror ||
(req->end_io && !blk_rq_is_passthrough(req)))
return false;
@@ -1164,6 +1157,18 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
}
+/**
+ * blk_rq_is_seq_zoned_write() - Check if @rq requires write serialization.
+ * @rq: Request to examine.
+ *
+ * Note: REQ_OP_ZONE_APPEND requests do not require serialization.
+ */
+static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
+{
+ return op_needs_zoned_write_locking(req_op(rq)) &&
+ blk_rq_zone_is_seq(rq);
+}
+
bool blk_req_needs_zone_write_lock(struct request *rq);
bool blk_req_zone_write_trylock(struct request *rq);
void __blk_req_zone_write_lock(struct request *rq);
@@ -1194,6 +1199,11 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
return !blk_req_zone_is_write_locked(rq);
}
#else /* CONFIG_BLK_DEV_ZONED */
+static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
+{
+ return false;
+}
+
static inline bool blk_req_needs_zone_write_lock(struct request *rq)
{
return false;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b441e633f4dd..b2ac587e3402 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -392,6 +392,7 @@ struct request_queue {
struct blk_queue_stats *stats;
struct rq_qos *rq_qos;
+ struct mutex rq_qos_mutex;
const struct blk_mq_ops *mq_ops;
@@ -487,6 +488,7 @@ struct request_queue {
* for flush operations
*/
struct blk_flush_queue *fq;
+ struct list_head flush_list;
struct list_head requeue_list;
spinlock_t requeue_lock;
@@ -1281,15 +1283,18 @@ static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec)
return disk_zone_no(bdev->bd_disk, sec);
}
-static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
- blk_opf_t op)
+/* Whether write serialization is required for @op on zoned devices. */
+static inline bool op_needs_zoned_write_locking(enum req_op op)
{
- if (!bdev_is_zoned(bdev))
- return false;
-
return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
}
+static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
+ enum req_op op)
+{
+ return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op);
+}
+
static inline sector_t bdev_zone_sectors(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 7f4dfbdf12a6..40e60c33cc6f 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -246,6 +246,32 @@ DEFINE_EVENT(block_rq, block_rq_merge,
);
/**
+ * block_io_start - insert a request for execution
+ * @rq: block IO operation request
+ *
+ * Called when block operation request @rq is queued for execution
+ */
+DEFINE_EVENT(block_rq, block_io_start,
+
+ TP_PROTO(struct request *rq),
+
+ TP_ARGS(rq)
+);
+
+/**
+ * block_io_done - block IO operation request completed
+ * @rq: block IO operation request
+ *
+ * Called when block operation request @rq is completed
+ */
+DEFINE_EVENT(block_rq, block_io_done,
+
+ TP_PROTO(struct request *rq),
+
+ TP_ARGS(rq)
+);
+
+/**
* block_bio_complete - completed all work on the block operation
* @q: queue holding the block operation
* @bio: block operation completed
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 640bf687b94a..54b5b0aeefca 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -93,9 +93,29 @@
#define UBLKSRV_CMD_BUF_OFFSET 0
#define UBLKSRV_IO_BUF_OFFSET 0x80000000
-/* tag bit is 12bit, so at most 4096 IOs for each queue */
+/* tag bit is 16bit, so far limit at most 4096 IOs for each queue */
#define UBLK_MAX_QUEUE_DEPTH 4096
+/* single IO buffer max size is 32MB */
+#define UBLK_IO_BUF_OFF 0
+#define UBLK_IO_BUF_BITS 25
+#define UBLK_IO_BUF_BITS_MASK ((1ULL << UBLK_IO_BUF_BITS) - 1)
+
+/* so at most 64K IOs for each queue */
+#define UBLK_TAG_OFF UBLK_IO_BUF_BITS
+#define UBLK_TAG_BITS 16
+#define UBLK_TAG_BITS_MASK ((1ULL << UBLK_TAG_BITS) - 1)
+
+/* max 4096 queues */
+#define UBLK_QID_OFF (UBLK_TAG_OFF + UBLK_TAG_BITS)
+#define UBLK_QID_BITS 12
+#define UBLK_QID_BITS_MASK ((1ULL << UBLK_QID_BITS) - 1)
+
+#define UBLK_MAX_NR_QUEUES (1U << UBLK_QID_BITS)
+
+#define UBLKSRV_IO_BUF_TOTAL_BITS (UBLK_QID_OFF + UBLK_QID_BITS)
+#define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS)
+
/*
* zero copy requires 4k block size, and can remap ublk driver's io
* request into ublksrv's vm space
@@ -145,6 +165,9 @@
/* use ioctl encoding for uring command */
#define UBLK_F_CMD_IOCTL_ENCODE (1UL << 6)
+/* Copy between request and user buffer by pread()/pwrite() */
+#define UBLK_F_USER_COPY (1UL << 7)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1