aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block/Kconfig5
-rw-r--r--block/Makefile1
-rw-r--r--block/bio.c2
-rw-r--r--block/blk-core.c11
-rw-r--r--block/blk-flush.c1
-rw-r--r--block/blk-merge.c23
-rw-r--r--block/blk-mq-debugfs-zoned.c22
-rw-r--r--block/blk-mq-debugfs.c3
-rw-r--r--block/blk-mq-debugfs.h6
-rw-r--r--block/blk-mq.c125
-rw-r--r--block/blk-mq.h31
-rw-r--r--block/blk-settings.c46
-rw-r--r--block/blk-sysfs.c2
-rw-r--r--block/blk-zoned.c1357
-rw-r--r--block/blk.h84
-rw-r--r--block/elevator.c46
-rw-r--r--block/elevator.h1
-rw-r--r--block/fops.c29
-rw-r--r--block/genhd.c3
-rw-r--r--block/mq-deadline.c204
-rw-r--r--drivers/block/null_blk/main.c40
-rw-r--r--drivers/block/null_blk/null_blk.h2
-rw-r--r--drivers/block/null_blk/zoned.c358
-rw-r--r--drivers/block/ublk_drv.c5
-rw-r--r--drivers/block/virtio_blk.c2
-rw-r--r--drivers/md/dm-core.h2
-rw-r--r--drivers/md/dm-zone.c476
-rw-r--r--drivers/md/dm.c72
-rw-r--r--drivers/md/dm.h2
-rw-r--r--drivers/nvme/host/core.c2
-rw-r--r--drivers/nvme/target/zns.c10
-rw-r--r--drivers/scsi/scsi_lib.c1
-rw-r--r--drivers/scsi/sd.c8
-rw-r--r--drivers/scsi/sd.h19
-rw-r--r--drivers/scsi/sd_zbc.c335
-rw-r--r--include/linux/blk-mq.h85
-rw-r--r--include/linux/blk_types.h30
-rw-r--r--include/linux/blkdev.h103
38 files changed, 1875 insertions, 1679 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 1de4682d48cc..d47398ae9824 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -100,7 +100,6 @@ config BLK_DEV_WRITE_MOUNTED
config BLK_DEV_ZONED
bool "Zoned block device support"
- select MQ_IOSCHED_DEADLINE
help
Block layer zoned block device support. This option enables
support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
@@ -198,10 +197,6 @@ config BLK_DEBUG_FS
Unless you are building a kernel for a tiny system, you should
say Y here.
-config BLK_DEBUG_FS_ZONED
- bool
- default BLK_DEBUG_FS && BLK_DEV_ZONED
-
config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
depends on KEYS
diff --git a/block/Makefile b/block/Makefile
index 46ada9dc8bbf..168150b9c510 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
-obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
diff --git a/block/bio.c b/block/bio.c
index d24420ed1c4c..38baedb39c6f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1576,6 +1576,8 @@ again:
if (!bio_integrity_endio(bio))
return;
+ blk_zone_bio_endio(bio);
+
rq_qos_done_bio(bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
diff --git a/block/blk-core.c b/block/blk-core.c
index a16b5abdbbf5..47400a4fe851 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -589,8 +589,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_NOTSUPP;
/* The bio sector must point to the start of a sequential zone */
- if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) ||
- !bio_zone_is_seq(bio))
+ if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector))
return BLK_STS_IOERR;
/*
@@ -602,7 +601,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_IOERR;
/* Make sure the BIO is small enough and will not get split */
- if (nr_sectors > q->limits.max_zone_append_sectors)
+ if (nr_sectors > queue_max_zone_append_sectors(q))
return BLK_STS_IOERR;
bio->bi_opf |= REQ_NOMERGE;
@@ -908,12 +907,6 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
- /*
- * As the requests that require a zone lock are not plugged in the
- * first place, directly accessing the plug instead of using
- * blk_mq_plug() should not have any consequences during flushing for
- * zoned devices.
- */
blk_flush_plug(current->plug, false);
/*
diff --git a/block/blk-flush.c b/block/blk-flush.c
index b0f314f4bc14..2f58ae018464 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,6 +130,7 @@ static void blk_flush_restore_request(struct request *rq)
* original @rq->bio. Restore it.
*/
rq->bio = rq->biotail;
+ rq->__sector = rq->bio->bi_iter.bi_sector;
/* make @rq a normal request */
rq->rq_flags &= ~RQF_FLUSH_SEQ;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4e3483a16b75..f64115d72f3d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -377,6 +377,7 @@ struct bio *__bio_split_to_limits(struct bio *bio,
blkcg_bio_issue_init(split);
bio_chain(split, bio);
trace_block_split(split, bio->bi_iter.bi_sector);
+ WARN_ON_ONCE(bio_zone_write_plugging(bio));
submit_bio_noacct(bio);
return split;
}
@@ -972,13 +973,7 @@ static void blk_account_io_merge_bio(struct request *req)
part_stat_unlock();
}
-enum bio_merge_status {
- BIO_MERGE_OK,
- BIO_MERGE_NONE,
- BIO_MERGE_FAILED,
-};
-
-static enum bio_merge_status bio_attempt_back_merge(struct request *req,
+enum bio_merge_status bio_attempt_back_merge(struct request *req,
struct bio *bio, unsigned int nr_segs)
{
const blk_opf_t ff = bio_failfast(bio);
@@ -994,6 +989,9 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req,
blk_update_mixed_merge(req, bio, false);
+ if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ blk_zone_write_plug_bio_merged(bio);
+
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
@@ -1009,6 +1007,14 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
{
const blk_opf_t ff = bio_failfast(bio);
+ /*
+ * A front merge for writes to sequential zones of a zoned block device
+ * can happen only if the user submitted writes out of order. Do not
+ * merge such write to let it fail.
+ */
+ if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ return BIO_MERGE_FAILED;
+
if (!ll_front_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
@@ -1107,10 +1113,9 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
- struct blk_plug *plug;
+ struct blk_plug *plug = current->plug;
struct request *rq;
- plug = blk_mq_plug(bio);
if (!plug || rq_list_empty(plug->mq_list))
return false;
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
deleted file mode 100644
index a77b099c34b7..000000000000
--- a/block/blk-mq-debugfs-zoned.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2017 Western Digital Corporation or its affiliates.
- */
-
-#include <linux/blkdev.h>
-#include "blk-mq-debugfs.h"
-
-int queue_zone_wlock_show(void *data, struct seq_file *m)
-{
- struct request_queue *q = data;
- unsigned int i;
-
- if (!q->disk->seq_zones_wlock)
- return 0;
-
- for (i = 0; i < q->disk->nr_zones; i++)
- if (test_bit(i, q->disk->seq_zones_wlock))
- seq_printf(m, "%u\n", i);
-
- return 0;
-}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 94668e72ab09..770c0c2b72fa 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -160,7 +160,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
{ "pm_only", 0600, queue_pm_only_show, NULL },
{ "state", 0600, queue_state_show, queue_state_write },
- { "zone_wlock", 0400, queue_zone_wlock_show, NULL },
+ { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL },
{ },
};
@@ -256,7 +256,6 @@ static const char *const rqf_name[] = {
RQF_NAME(HASHED),
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
- RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(TIMED_OUT),
RQF_NAME(RESV),
};
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 9c7d4b6117d4..c80e453e3014 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -83,10 +83,10 @@ static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
}
#endif
-#ifdef CONFIG_BLK_DEBUG_FS_ZONED
-int queue_zone_wlock_show(void *data, struct seq_file *m);
+#if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
+int queue_zone_wplugs_show(void *data, struct seq_file *m);
#else
-static inline int queue_zone_wlock_show(void *data, struct seq_file *m)
+static inline int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
return 0;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b8dbfed8b28b..434d45219e23 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -762,31 +762,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
}
EXPORT_SYMBOL(blk_dump_rq_flags);
-static void req_bio_endio(struct request *rq, struct bio *bio,
- unsigned int nbytes, blk_status_t error)
-{
- if (unlikely(error)) {
- bio->bi_status = error;
- } else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
- /*
- * Partial zone append completions cannot be supported as the
- * BIO fragments may end up not being written sequentially.
- */
- if (bio->bi_iter.bi_size != nbytes)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_iter.bi_sector = rq->__sector;
- }
-
- bio_advance(bio, nbytes);
-
- if (unlikely(rq->rq_flags & RQF_QUIET))
- bio_set_flag(bio, BIO_QUIET);
- /* don't actually finish bio if it's part of flush sequence */
- if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
- bio_endio(bio);
-}
-
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (req->part && blk_do_io_stat(req)) {
@@ -846,14 +821,15 @@ static void blk_complete_request(struct request *req)
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- if (req_op(req) == REQ_OP_ZONE_APPEND)
- bio->bi_iter.bi_sector = req->__sector;
+ blk_zone_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
bio = next;
} while (bio);
+ blk_zone_complete_request(req);
+
/*
* Reset counters so that the request stacking driver
* can find how many bytes remain in the request
@@ -890,6 +866,8 @@ static void blk_complete_request(struct request *req)
bool blk_update_request(struct request *req, blk_status_t error,
unsigned int nr_bytes)
{
+ bool is_flush = req->rq_flags & RQF_FLUSH_SEQ;
+ bool quiet = req->rq_flags & RQF_QUIET;
int total_bytes;
trace_block_rq_complete(req, error, nr_bytes);
@@ -910,9 +888,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
__blk_crypto_rq_put_keyslot(req);
- if (unlikely(error && !blk_rq_is_passthrough(req) &&
- !(req->rq_flags & RQF_QUIET)) &&
- !test_bit(GD_DEAD, &req->q->disk->state)) {
+ if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) &&
+ !test_bit(GD_DEAD, &req->q->disk->state)) {
blk_print_req_error(req, error);
trace_block_rq_error(req, error, nr_bytes);
}
@@ -924,12 +901,33 @@ bool blk_update_request(struct request *req, blk_status_t error,
struct bio *bio = req->bio;
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
- if (bio_bytes == bio->bi_iter.bi_size)
+ if (unlikely(error))
+ bio->bi_status = error;
+
+ if (bio_bytes == bio->bi_iter.bi_size) {
req->bio = bio->bi_next;
+ } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) {
+ /*
+ * Partial zone append completions cannot be supported
+ * as the BIO fragments may end up not being written
+ * sequentially.
+ */
+ bio->bi_status = BLK_STS_IOERR;
+ }
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- req_bio_endio(req, bio, bio_bytes, error);
+ if (unlikely(quiet))
+ bio_set_flag(bio, BIO_QUIET);
+
+ bio_advance(bio, bio_bytes);
+
+ /* Don't actually finish bio if it's part of flush sequence */
+ if (!bio->bi_iter.bi_size) {
+ blk_zone_update_request_bio(req, bio);
+ if (!is_flush)
+ bio_endio(bio);
+ }
total_bytes += bio_bytes;
nr_bytes -= bio_bytes;
@@ -942,6 +940,7 @@ bool blk_update_request(struct request *req, blk_status_t error,
* completely done
*/
if (!req->bio) {
+ blk_zone_complete_request(req);
/*
* Reset counters so that the request stacking driver
* can find how many bytes remain in the request
@@ -1331,11 +1330,6 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
blk_account_io_start(rq);
- /*
- * As plugging can be enabled for passthrough requests on a zoned
- * device, directly accessing the plug instead of using blk_mq_plug()
- * should not have any consequences.
- */
if (current->plug && !at_head) {
blk_add_rq_to_plug(current->plug, rq);
return;
@@ -1922,19 +1916,6 @@ static void blk_mq_handle_dev_resource(struct request *rq,
__blk_mq_requeue_request(rq);
}
-static void blk_mq_handle_zone_resource(struct request *rq,
- struct list_head *zone_list)
-{
- /*
- * If we end up here it is because we cannot dispatch a request to a
- * specific zone due to LLD level zone-write locking or other zone
- * related resource not being available. In this case, set the request
- * aside in zone_list for retrying it later.
- */
- list_add(&rq->queuelist, zone_list);
- __blk_mq_requeue_request(rq);
-}
-
enum prep_dispatch {
PREP_DISPATCH_OK,
PREP_DISPATCH_NO_TAG,
@@ -2020,7 +2001,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
struct request *rq;
int queued;
blk_status_t ret = BLK_STS_OK;
- LIST_HEAD(zone_list);
bool needs_resource = false;
if (list_empty(list))
@@ -2062,23 +2042,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
case BLK_STS_DEV_RESOURCE:
blk_mq_handle_dev_resource(rq, list);
goto out;
- case BLK_STS_ZONE_RESOURCE:
- /*
- * Move the request to zone_list and keep going through
- * the dispatch list to find more requests the drive can
- * accept.
- */
- blk_mq_handle_zone_resource(rq, &zone_list);
- needs_resource = true;
- break;
default:
blk_mq_end_request(rq, ret);
}
} while (!list_empty(list));
out:
- if (!list_empty(&zone_list))
- list_splice_tail_init(&zone_list, list);
-
/* If we didn't flush the entire list, we could have told the driver
* there was more coming, but that turned out to be a lie.
*/
@@ -2959,22 +2927,37 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
void blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- struct blk_plug *plug = blk_mq_plug(bio);
+ struct blk_plug *plug = current->plug;
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx;
unsigned int nr_segs = 1;
struct request *rq;
blk_status_t ret;
+ /*
+ * If the plug has a cached request for this queue, try to use it.
+ */
+ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
+
+ /*
+ * A BIO that was released from a zone write plug has already been
+ * through the preparation in this function, already holds a reference
+ * on the queue usage counter, and is the only write BIO in-flight for
+ * the target zone. Go straight to preparing a request for it.
+ */
+ if (bio_zone_write_plugging(bio)) {
+ nr_segs = bio->__bi_nr_segments;
+ if (rq)
+ blk_queue_exit(q);
+ goto new_request;
+ }
+
bio = blk_queue_bounce(bio, q);
/*
- * If the plug has a cached request for this queue, try use it.
- *
* The cached request already holds a q_usage_counter reference and we
* don't have to acquire a new one if we use it.
*/
- rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
if (!rq) {
if (unlikely(bio_queue_enter(bio)))
return;
@@ -2991,6 +2974,10 @@ void blk_mq_submit_bio(struct bio *bio)
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
+ if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
+ goto queue_exit;
+
+new_request:
if (!rq) {
rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
if (unlikely(!rq))
@@ -3009,6 +2996,7 @@ void blk_mq_submit_bio(struct bio *bio)
if (ret != BLK_STS_OK) {
bio->bi_status = ret;
bio_endio(bio);
+ blk_zone_complete_request(rq);
blk_mq_free_request(rq);
return;
}
@@ -3016,6 +3004,9 @@ void blk_mq_submit_bio(struct bio *bio)
if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
return;
+ if (bio_zone_write_plugging(bio))
+ blk_zone_write_plug_attempt_merge(rq);
+
if (plug) {
blk_add_rq_to_plug(plug, rq);
return;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f75a9ecfebde..260beea8e332 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -365,37 +365,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
qmap->mq_map[cpu] = 0;
}
-/*
- * blk_mq_plug() - Get caller context plug
- * @bio : the bio being submitted by the caller context
- *
- * Plugging, by design, may delay the insertion of BIOs into the elevator in
- * order to increase BIO merging opportunities. This however can cause BIO
- * insertion order to change from the order in which submit_bio() is being
- * executed in the case of multiple contexts concurrently issuing BIOs to a
- * device, even if these context are synchronized to tightly control BIO issuing
- * order. While this is not a problem with regular block devices, this ordering
- * change can cause write BIO failures with zoned block devices as these
- * require sequential write patterns to zones. Prevent this from happening by
- * ignoring the plug state of a BIO issuing context if it is for a zoned block
- * device and the BIO to plug is a write operation.
- *
- * Return current->plug if the bio can be plugged and NULL otherwise
- */
-static inline struct blk_plug *blk_mq_plug( struct bio *bio)
-{
- /* Zoned block device write operation case: do not plug the BIO */
- if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio)))
- return NULL;
-
- /*
- * For regular block devices or read operations, use the context plug
- * which may be NULL if blk_start_plug() was not executed.
- */
- return current->plug;
-}
-
/* Free all requests on the list */
static inline void blk_mq_free_requests(struct list_head *list)
{
diff --git a/block/blk-settings.c b/block/blk-settings.c
index cdbaef159c4b..715f4b6356c4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -412,24 +412,32 @@ EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
* blk_queue_max_zone_append_sectors - set max sectors for a single zone append
* @q: the request queue for the device
* @max_zone_append_sectors: maximum number of sectors to write per command
+ *
+ * Sets the maximum number of sectors allowed for zone append commands. If
+ * Specifying 0 for @max_zone_append_sectors indicates that the queue does
+ * not natively support zone append operations and that the block layer must
+ * emulate these operations using regular writes.
**/
void blk_queue_max_zone_append_sectors(struct request_queue *q,
unsigned int max_zone_append_sectors)
{
- unsigned int max_sectors;
+ unsigned int max_sectors = 0;
if (WARN_ON(!blk_queue_is_zoned(q)))
return;
- max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
- max_sectors = min(q->limits.chunk_sectors, max_sectors);
+ if (max_zone_append_sectors) {
+ max_sectors = min(q->limits.max_hw_sectors,
+ max_zone_append_sectors);
+ max_sectors = min(q->limits.chunk_sectors, max_sectors);
- /*
- * Signal eventual driver bugs resulting in the max_zone_append sectors limit
- * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
- * or the max_hw_sectors limit not set.
- */
- WARN_ON(!max_sectors);
+ /*
+ * Signal eventual driver bugs resulting in the max_zone_append
+ * sectors limit being 0 due to the chunk_sectors limit (zone
+ * size) not set or the max_hw_sectors limit not set.
+ */
+ WARN_ON_ONCE(!max_sectors);
+ }
q->limits.max_zone_append_sectors = max_sectors;
}
@@ -756,8 +764,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
- t->max_zone_append_sectors = min(t->max_zone_append_sectors,
- b->max_zone_append_sectors);
+ t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
+ queue_limits_max_zone_append_sectors(b));
t->bounce = max(t->bounce, b->bounce);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
@@ -1045,22 +1053,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
/**
- * blk_queue_required_elevator_features - Set a queue required elevator features
- * @q: the request queue for the target device
- * @features: Required elevator features OR'ed together
- *
- * Tell the block layer that for the device controlled through @q, only the
- * only elevators that can be used are those that implement at least the set of
- * features specified by @features.
- */
-void blk_queue_required_elevator_features(struct request_queue *q,
- unsigned int features)
-{
- q->required_elevator_features = features;
-}
-EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
-
-/**
* blk_queue_can_use_dma_map_merging - configure queue for merging segments.
* @q: the request queue for the device
* @dev: the device pointer for dma
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8c8f69d8ba48..e3ed5a921aff 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -224,7 +224,7 @@ static ssize_t queue_zone_write_granularity_show(struct request_queue *q,
static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
{
- unsigned long long max_sectors = q->limits.max_zone_append_sectors;
+ unsigned long long max_sectors = queue_max_zone_append_sectors(q);
return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT);
}
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index da0f4b2a8fa0..bad68277c0b2 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -7,6 +7,7 @@
*
* Copyright (c) 2016, Damien Le Moal
* Copyright (c) 2016, Western Digital
+ * Copyright (c) 2024, Western Digital Corporation or its affiliates.
*/
#include <linux/kernel.h>
@@ -16,8 +17,13 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/mempool.h>
#include "blk.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-debugfs.h"
#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
@@ -32,6 +38,64 @@ static const char *const zone_cond_name[] = {
};
#undef ZONE_COND_NAME
+/*
+ * Per-zone write plug.
+ * @node: hlist_node structure for managing the plug using a hash table.
+ * @link: To list the plug in the zone write plug error list of the disk.
+ * @ref: Zone write plug reference counter. A zone write plug reference is
+ * always at least 1 when the plug is hashed in the disk plug hash table.
+ * The reference is incremented whenever a new BIO needing plugging is
+ * submitted and when a function needs to manipulate a plug. The
+ * reference count is decremented whenever a plugged BIO completes and
+ * when a function that referenced the plug returns. The initial
+ * reference is dropped whenever the zone of the zone write plug is reset,
+ * finished and when the zone becomes full (last write BIO to the zone
+ * completes).
+ * @lock: Spinlock to atomically manipulate the plug.
+ * @flags: Flags indicating the plug state.
+ * @zone_no: The number of the zone the plug is managing.
+ * @wp_offset: The zone write pointer location relative to the start of the zone
+ * as a number of 512B sectors.
+ * @bio_list: The list of BIOs that are currently plugged.
+ * @bio_work: Work struct to handle issuing of plugged BIOs
+ * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
+ * @disk: The gendisk the plug belongs to.
+ */
+struct blk_zone_wplug {
+ struct hlist_node node;
+ struct list_head link;
+ atomic_t ref;
+ spinlock_t lock;
+ unsigned int flags;
+ unsigned int zone_no;
+ unsigned int wp_offset;
+ struct bio_list bio_list;
+ struct work_struct bio_work;
+ struct rcu_head rcu_head;
+ struct gendisk *disk;
+};
+
+/*
+ * Zone write plug flags bits:
+ * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
+ * that is, that write BIOs are being throttled due to a write BIO already
+ * being executed or the zone write plug bio list is not empty.
+ * - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be
+ * recovered with a report zone to update the zone write pointer offset.
+ * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
+ * from the disk hash table and that the initial reference to the zone
+ * write plug set when the plug was first added to the hash table has been
+ * dropped. This flag is set when a zone is reset, finished or become full,
+ * to prevent new references to the zone write plug to be taken for
+ * newly incoming BIOs. A zone write plug flagged with this flag will be
+ * freed once all remaining references from BIOs or functions are dropped.
+ */
+#define BLK_ZONE_WPLUG_PLUGGED (1U << 0)
+#define BLK_ZONE_WPLUG_ERROR (1U << 1)
+#define BLK_ZONE_WPLUG_UNHASHED (1U << 2)
+
+#define BLK_ZONE_WPLUG_BUSY (BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)
+
/**
* blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
* @zone_cond: BLK_ZONE_COND_XXX.
@@ -51,52 +115,6 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
-/*
- * Return true if a request is a write requests that needs zone write locking.
- */
-bool blk_req_needs_zone_write_lock(struct request *rq)
-{
- if (!rq->q->disk->seq_zones_wlock)
- return false;
-
- return blk_rq_is_seq_zoned_write(rq);
-}
-EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
-
-bool blk_req_zone_write_trylock(struct request *rq)
-{
- unsigned int zno = blk_rq_zone_no(rq);
-
- if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock))
- return false;
-
- WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
- rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
-
- return true;
-}
-EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);
-
-void __blk_req_zone_write_lock(struct request *rq)
-{
- if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
- rq->q->disk->seq_zones_wlock)))
- return;
-
- WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
- rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
-}
-EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
-
-void __blk_req_zone_write_unlock(struct request *rq)
-{
- rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
- if (rq->q->disk->seq_zones_wlock)
- WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
- rq->q->disk->seq_zones_wlock));
-}
-EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
-
/**
* bdev_nr_zones - Get number of zones
* @bdev: Target device
@@ -425,23 +443,1123 @@ fail:
return ret;
}
-void disk_free_zone_bitmaps(struct gendisk *disk)
+static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
+{
+ if (!disk->conv_zones_bitmap)
+ return false;
+ return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
+}
+
+static bool disk_insert_zone_wplug(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ struct blk_zone_wplug *zwplg;
+ unsigned long flags;
+ unsigned int idx =
+ hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);
+
+ /*
+ * Add the new zone write plug to the hash table, but carefully as we
+ * are racing with other submission context, so we may already have a
+ * zone write plug for the same zone.
+ */
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
+ if (zwplg->zone_no == zwplug->zone_no) {
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+ return false;
+ }
+ }
+ hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+ return true;
+}
+
+static void disk_remove_zone_wplug(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
+ atomic_dec(&zwplug->ref);
+ hlist_del_init_rcu(&zwplug->node);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+}
+
+static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ /* If the zone is still busy, the plug cannot be removed. */
+ if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
+ return false;
+
+ /* We can remove zone write plugs for zones that are empty or full. */
+ return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity;
+}
+
+static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
+ sector_t sector)
+{
+ unsigned int zno = disk_zone_no(disk, sector);
+ unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
+ struct blk_zone_wplug *zwplug;
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
+ if (zwplug->zone_no == zno &&
+ atomic_inc_not_zero(&zwplug->ref)) {
+ rcu_read_unlock();
+ return zwplug;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
+{
+ struct blk_zone_wplug *zwplug =
+ container_of(rcu_head, struct blk_zone_wplug, rcu_head);
+
+ mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
+}
+
+static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
+{
+ if (atomic_dec_and_test(&zwplug->ref)) {
+ WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
+ WARN_ON_ONCE(!list_empty(&zwplug->link));
+
+ call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
+ }
+}
+
+static void blk_zone_wplug_bio_work(struct work_struct *work);
+
+/*
+ * Get a reference on the write plug for the zone containing @sector.
+ * If the plug does not exist, it is allocated and hashed.
+ * Return a pointer to the zone write plug with the plug spinlock held.
+ */
+static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
+ sector_t sector, gfp_t gfp_mask,
+ unsigned long *flags)
+{
+ unsigned int zno = disk_zone_no(disk, sector);
+ struct blk_zone_wplug *zwplug;
+
+again:
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ /*
+ * Check that a BIO completion or a zone reset or finish
+ * operation has not already removed the zone write plug from
+ * the hash table and dropped its reference count. In such case,
+ * we need to get a new plug so start over from the beginning.
+ */
+ spin_lock_irqsave(&zwplug->lock, *flags);
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
+ spin_unlock_irqrestore(&zwplug->lock, *flags);
+ disk_put_zone_wplug(zwplug);
+ goto again;
+ }
+ return zwplug;
+ }
+
+ /*
+ * Allocate and initialize a zone write plug with an extra reference
+ * so that it is not freed when the zone write plug becomes idle without
+ * the zone being full.
+ */
+ zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
+ if (!zwplug)
+ return NULL;
+
+ INIT_HLIST_NODE(&zwplug->node);
+ INIT_LIST_HEAD(&zwplug->link);
+ atomic_set(&zwplug->ref, 2);
+ spin_lock_init(&zwplug->lock);
+ zwplug->flags = 0;
+ zwplug->zone_no = zno;
+ zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
+ bio_list_init(&zwplug->bio_list);
+ INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
+ zwplug->disk = disk;
+
+ spin_lock_irqsave(&zwplug->lock, *flags);
+
+ /*
+ * Insert the new zone write plug in the hash table. This can fail only
+ * if another context already inserted a plug. Retry from the beginning
+ * in such case.
+ */
+ if (!disk_insert_zone_wplug(disk, zwplug)) {
+ spin_unlock_irqrestore(&zwplug->lock, *flags);
+ mempool_free(zwplug, disk->zone_wplugs_pool);
+ goto again;
+ }
+
+ return zwplug;
+}
+
+static inline void blk_zone_wplug_bio_io_error(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+
+ bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+ bio_io_error(bio);
+ blk_queue_exit(q);
+}
+
+/*
+ * Abort (fail) all plugged BIOs of a zone write plug.
+ */
+static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
+{
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&zwplug->bio_list))) {
+ blk_zone_wplug_bio_io_error(bio);
+ disk_put_zone_wplug(zwplug);
+ }
+}
+
+/*
+ * Abort (fail) all plugged BIOs of a zone write plug that are not aligned
+ * with the assumed write pointer location of the zone when the BIO will
+ * be unplugged.
+ */
+static void disk_zone_wplug_abort_unaligned(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned int zone_capacity = disk->zone_capacity;
+ unsigned int wp_offset = zwplug->wp_offset;
+ struct bio_list bl = BIO_EMPTY_LIST;
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&zwplug->bio_list))) {
+ if (wp_offset >= zone_capacity ||
+ (bio_op(bio) != REQ_OP_ZONE_APPEND &&
+ bio_offset_from_zone_start(bio) != wp_offset)) {
+ blk_zone_wplug_bio_io_error(bio);
+ disk_put_zone_wplug(zwplug);
+ continue;
+ }
+
+ wp_offset += bio_sectors(bio);
+ bio_list_add(&bl, bio);
+ }
+
+ bio_list_merge(&zwplug->bio_list, &bl);
+}
+
+/*
+ * Set a zone write plug write pointer offset to either 0 (zone reset case)
+ * or to the zone size (zone finish case). This aborts all plugged BIOs, which
+ * is fine to do as doing a zone reset or zone finish while writes are in-flight
+ * is a mistake from the user which will most likely cause all plugged BIOs to
+ * fail anyway.
+ */
+static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug,
+ unsigned int wp_offset)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /*
+ * Make sure that a BIO completion or another zone reset or finish
+ * operation has not already removed the plug from the hash table.
+ */
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ return;
+ }
+
+ /* Update the zone write pointer and abort all plugged BIOs. */
+ zwplug->wp_offset = wp_offset;
+ disk_zone_wplug_abort(zwplug);
+
+ /*
+ * Updating the write pointer offset puts back the zone
+ * in a good state. So clear the error flag and decrement the
+ * error count if we were in error state.
+ */
+ if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
+ zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
+ spin_lock(&disk->zone_wplugs_lock);
+ list_del_init(&zwplug->link);
+ spin_unlock(&disk->zone_wplugs_lock);
+ }
+
+ /*
+ * The zone write plug now has no BIO plugged: remove it from the
+ * hash table so that it cannot be seen. The plug will be freed
+ * when the last reference is dropped.
+ */
+ if (disk_should_remove_zone_wplug(disk, zwplug))
+ disk_remove_zone_wplug(disk, zwplug);
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
+ unsigned int wp_offset)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ sector_t sector = bio->bi_iter.bi_sector;
+ struct blk_zone_wplug *zwplug;
+
+ /* Conventional zones cannot be reset nor finished. */
+ if (disk_zone_is_conv(disk, sector)) {
+ bio_io_error(bio);
+ return true;
+ }
+
+ /*
+ * If we have a zone write plug, set its write pointer offset to 0
+ * (reset case) or to the zone size (finish case). This will abort all
+ * BIOs plugged for the target zone. It is fine as resetting or
+ * finishing zones while writes are still in-flight will result in the
+ * writes failing anyway.
+ */
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
+ disk_put_zone_wplug(zwplug);
+ }
+
+ return false;
+}
+
+static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct blk_zone_wplug *zwplug;
+ sector_t sector;
+
+ /*
+ * Set the write pointer offset of all zone write plugs to 0. This will
+ * abort all plugged BIOs. It is fine as resetting zones while writes
+ * are still in-flight will result in the writes failing anyway.
+ */
+ for (sector = 0; sector < get_capacity(disk);
+ sector += disk->queue->limits.chunk_sectors) {
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
+ disk_put_zone_wplug(zwplug);
+ }
+ }
+
+ return false;
+}
+
+static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
+ struct bio *bio, unsigned int nr_segs)
+{
+ /*
+ * Grab an extra reference on the BIO request queue usage counter.
+ * This reference will be reused to submit a request for the BIO for
+ * blk-mq devices and dropped when the BIO is failed and after
+ * it is issued in the case of BIO-based devices.
+ */
+ percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);
+
+ /*
+ * The BIO is being plugged and thus will have to wait for the on-going
+ * write and for all other writes already plugged. So polling makes
+ * no sense.
+ */
+ bio_clear_polled(bio);
+
+ /*
+ * Reuse the poll cookie field to store the number of segments when
+ * split to the hardware limits.
+ */
+ bio->__bi_nr_segments = nr_segs;
+
+ /*
+ * We always receive BIOs after they are split and ready to be issued.
+ * The block layer passes the parts of a split BIO in order, and the
+ * user must also issue write sequentially. So simply add the new BIO
+ * at the tail of the list to preserve the sequential write order.
+ */
+ bio_list_add(&zwplug->bio_list, bio);
+}
+
+/*
+ * Called from bio_attempt_back_merge() when a BIO was merged with a request.
+ */
+void blk_zone_write_plug_bio_merged(struct bio *bio)
+{
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+
+ /*
+ * If the BIO was already plugged, then we were called through
+ * blk_zone_write_plug_attempt_merge() -> blk_attempt_bio_merge().
+ * For this case, blk_zone_write_plug_attempt_merge() will handle the
+ * zone write pointer offset update.
+ */
+ if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
+ return;
+
+ bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+ /*
+ * Increase the plug reference count and advance the zone write
+ * pointer offset.
+ */
+ zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
+ bio->bi_iter.bi_sector);
+ spin_lock_irqsave(&zwplug->lock, flags);
+ zwplug->wp_offset += bio_sectors(bio);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+/*
+ * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
+ * already went through zone write plugging (either a new BIO or one that was
+ * unplugged).
+ */
+void blk_zone_write_plug_attempt_merge(struct request *req)
+{
+ sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
+ struct request_queue *q = req->q;
+ struct gendisk *disk = q->disk;
+ unsigned int zone_capacity = disk->zone_capacity;
+ struct blk_zone_wplug *zwplug =
+ disk_get_zone_wplug(disk, blk_rq_pos(req));
+ unsigned long flags;
+ struct bio *bio;
+
+ /*
+ * Completion of this request needs to be handled with
+ * blk_zone_write_plug_complete_request().
+ */
+ req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;
+
+ if (blk_queue_nomerges(q))
+ return;
+
+ /*
+ * Walk through the list of plugged BIOs to check if they can be merged
+ * into the back of the request.
+ */
+ spin_lock_irqsave(&zwplug->lock, flags);
+ while (zwplug->wp_offset < zone_capacity) {
+ bio = bio_list_peek(&zwplug->bio_list);
+ if (!bio)
+ break;
+
+ if (bio->bi_iter.bi_sector != req_back_sector ||
+ !blk_rq_merge_ok(req, bio))
+ break;
+
+ WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
+ !bio->__bi_nr_segments);
+
+ bio_list_pop(&zwplug->bio_list);
+ if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
+ BIO_MERGE_OK) {
+ bio_list_add_head(&zwplug->bio_list, bio);
+ break;
+ }
+
+ /*
+ * Drop the extra reference on the queue usage we got when
+ * plugging the BIO and advance the write pointer offset.
+ */
+ blk_queue_exit(q);
+ zwplug->wp_offset += bio_sectors(bio);
+
+ req_back_sector += bio_sectors(bio);
+ }
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+static inline void disk_zone_wplug_set_error(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) {
+ unsigned long flags;
+
+ /*
+ * Increase the plug reference count. The reference will be
+ * dropped in disk_zone_wplugs_work() once the error state
+ * is handled.
+ */
+ zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
+ atomic_inc(&zwplug->ref);
+
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+ }
+}
+
+/*
+ * Check and prepare a BIO for submission by incrementing the write pointer
+ * offset of its zone write plug and changing zone append operations into
+ * regular write when zone append emulation is needed.
+ */
+static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
+ struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+
+ /*
+ * Check that the user is not attempting to write to a full zone.
+ * We know such BIO will fail, and that would potentially overflow our
+ * write pointer offset beyond the end of the zone.
+ */
+ if (zwplug->wp_offset >= disk->zone_capacity)
+ goto err;
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ /*
+ * Use a regular write starting at the current write pointer.
+ * Similarly to native zone append operations, do not allow
+ * merging.
+ */
+ bio->bi_opf &= ~REQ_OP_MASK;
+ bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
+ bio->bi_iter.bi_sector += zwplug->wp_offset;
+
+ /*
+ * Remember that this BIO is in fact a zone append operation
+ * so that we can restore its operation code on completion.
+ */
+ bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
+ } else {
+ /*
+ * Check for non-sequential writes early because we avoid a
+ * whole lot of error handling trouble if we don't send it off
+ * to the driver.
+ */
+ if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
+ goto err;
+ }
+
+ /* Advance the zone write pointer offset. */
+ zwplug->wp_offset += bio_sectors(bio);
+
+ return true;
+
+err:
+ /* We detected an invalid write BIO: schedule error recovery. */
+ disk_zone_wplug_set_error(disk, zwplug);
+ kblockd_schedule_work(&disk->zone_wplugs_work);
+ return false;
+}
+
+static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ sector_t sector = bio->bi_iter.bi_sector;
+ struct blk_zone_wplug *zwplug;
+ gfp_t gfp_mask = GFP_NOIO;
+ unsigned long flags;
+
+ /*
+ * BIOs must be fully contained within a zone so that we use the correct
+ * zone write plug for the entire BIO. For blk-mq devices, the block
+ * layer should already have done any splitting required to ensure this
+ * and this BIO should thus not be straddling zone boundaries. For
+ * BIO-based devices, it is the responsibility of the driver to split
+ * the bio before submitting it.
+ */
+ if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
+ bio_io_error(bio);
+ return true;
+ }
+
+ /* Conventional zones do not need write plugging. */
+ if (disk_zone_is_conv(disk, sector)) {
+ /* Zone append to conventional zones is not allowed. */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ bio_io_error(bio);
+ return true;
+ }
+ return false;
+ }
+
+ if (bio->bi_opf & REQ_NOWAIT)
+ gfp_mask = GFP_NOWAIT;
+
+ zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
+ if (!zwplug) {
+ bio_io_error(bio);
+ return true;
+ }
+
+ /* Indicate that this BIO is being handled using zone write plugging. */
+ bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+ /*
+ * If the zone is already plugged or has a pending error, add the BIO
+ * to the plug BIO list. Otherwise, plug and let the BIO execute.
+ */
+ if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
+ goto plug;
+
+ /*
+ * If an error is detected when preparing the BIO, add it to the BIO
+ * list so that error recovery can deal with it.
+ */
+ if (!blk_zone_wplug_prepare_bio(zwplug, bio))
+ goto plug;
+
+ zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ return false;
+
+plug:
+ zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
+ blk_zone_wplug_add_bio(zwplug, bio, nr_segs);
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ return true;
+}
+
+/**
+ * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
+ * @bio: The BIO being submitted
+ * @nr_segs: The number of physical segments of @bio
+ *
+ * Handle write, write zeroes and zone append operations requiring emulation
+ * using zone write plugging.
+ *
+ * Return true whenever @bio execution needs to be delayed through the zone
+ * write plug. Otherwise, return false to let the submission path process
+ * @bio normally.
+ */
+bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
+{
+ struct block_device *bdev = bio->bi_bdev;
+
+ if (!bdev->bd_disk->zone_wplugs_hash)
+ return false;
+
+ /*
+ * If the BIO already has the plugging flag set, then it was already
+ * handled through this path and this is a submission from the zone
+ * plug bio submit work.
+ */
+ if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
+ return false;
+
+ /*
+ * We do not need to do anything special for empty flush BIOs, e.g
+ * BIOs such as issued by blkdev_issue_flush(). The is because it is
+ * the responsibility of the user to first wait for the completion of
+ * write operations for flush to have any effect on the persistence of
+ * the written data.
+ */
+ if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+ return false;
+
+ /*
+ * Regular writes and write zeroes need to be handled through the target
+ * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
+ * which may need to go through the flush machinery depending on the
+ * target device capabilities. Plugging such writes is fine as the flush
+ * machinery operates at the request level, below the plug, and
+ * completion of the flush sequence will go through the regular BIO
+ * completion, which will handle zone write plugging.
+ * Zone append operations for devices that requested emulation must
+ * also be plugged so that these BIOs can be changed into regular
+ * write BIOs.
+ * Zone reset, reset all and finish commands need special treatment
+ * to correctly track the write pointer offset of zones. These commands
+ * are not plugged as we do not need serialization with write
+ * operations. It is the responsibility of the user to not issue reset
+ * and finish commands when write operations are in flight.
+ */
+ switch (bio_op(bio)) {
+ case REQ_OP_ZONE_APPEND:
+ if (!bdev_emulates_zone_append(bdev))
+ return false;
+ fallthrough;
+ case REQ_OP_WRITE:
+ case REQ_OP_WRITE_ZEROES:
+ return blk_zone_wplug_handle_write(bio, nr_segs);
+ case REQ_OP_ZONE_RESET:
+ return blk_zone_wplug_handle_reset_or_finish(bio, 0);
+ case REQ_OP_ZONE_FINISH:
+ return blk_zone_wplug_handle_reset_or_finish(bio,
+ bdev_zone_sectors(bdev));
+ case REQ_OP_ZONE_RESET_ALL:
+ return blk_zone_wplug_handle_reset_all(bio);
+ default:
+ return false;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
+
+static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /*
+ * If we had an error, schedule error recovery. The recovery work
+ * will restart submission of plugged BIOs.
+ */
+ if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ kblockd_schedule_work(&disk->zone_wplugs_work);
+ return;
+ }
+
+ /* Schedule submission of the next plugged BIO if we have one. */
+ if (!bio_list_empty(&zwplug->bio_list)) {
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
+ return;
+ }
+
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+
+ /*
+ * If the zone is full (it was fully written or finished, or empty
+ * (it was reset), remove its zone write plug from the hash table.
+ */
+ if (disk_should_remove_zone_wplug(disk, zwplug))
+ disk_remove_zone_wplug(disk, zwplug);
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+void blk_zone_write_plug_bio_endio(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct blk_zone_wplug *zwplug =
+ disk_get_zone_wplug(bio->bi_bdev->bd_disk,
+ bio->bi_iter.bi_sector);
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ /* Make sure we do not see this BIO again by clearing the plug flag. */
+ bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+ /*
+ * If this is a regular write emulating a zone append operation,
+ * restore the original operation code.
+ */
+ if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
+ bio->bi_opf &= ~REQ_OP_MASK;
+ bio->bi_opf |= REQ_OP_ZONE_APPEND;
+ }
+
+ /*
+ * If the BIO failed, mark the plug as having an error to trigger
+ * recovery.
+ */
+ if (bio->bi_status != BLK_STS_OK) {
+ spin_lock_irqsave(&zwplug->lock, flags);
+ disk_zone_wplug_set_error(disk, zwplug);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ }
+
+ /*
+ * For BIO-based devices, blk_zone_write_plug_complete_request()
+ * is not called. So we need to schedule execution of the next
+ * plugged BIO here.
+ */
+ if (bio->bi_bdev->bd_has_submit_bio)
+ disk_zone_wplug_unplug_bio(disk, zwplug);
+
+ /* Drop the reference we took when the BIO was issued. */
+ atomic_dec(&zwplug->ref);
+ disk_put_zone_wplug(zwplug);
+}
+
+void blk_zone_write_plug_complete_request(struct request *req)
+{
+ struct gendisk *disk = req->q->disk;
+ struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, req->__sector);
+
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;
+
+ disk_zone_wplug_unplug_bio(disk, zwplug);
+
+ /*
+ * Drop the reference we took when the request was initialized in
+ * blk_zone_write_plug_attempt_merge().
+ */
+ atomic_dec(&zwplug->ref);
+ disk_put_zone_wplug(zwplug);
+}
+
+static void blk_zone_wplug_bio_work(struct work_struct *work)
+{
+ struct blk_zone_wplug *zwplug =
+ container_of(work, struct blk_zone_wplug, bio_work);
+ struct block_device *bdev;
+ unsigned long flags;
+ struct bio *bio;
+
+ /*
+ * Submit the next plugged BIO. If we do not have any, clear
+ * the plugged flag.
+ */
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ bio = bio_list_pop(&zwplug->bio_list);
+ if (!bio) {
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ return;
+ }
+
+ if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
+ /* Error recovery will decide what to do with the BIO. */
+ bio_list_add_head(&zwplug->bio_list, bio);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ return;
+ }
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ bdev = bio->bi_bdev;
+ submit_bio_noacct_nocheck(bio);
+
+ /*
+ * blk-mq devices will reuse the extra reference on the request queue
+ * usage counter we took when the BIO was plugged, but the submission
+ * path for BIO-based devices will not do that. So drop this extra
+ * reference here.
+ */
+ if (bdev->bd_has_submit_bio)
+ blk_queue_exit(bdev->bd_disk->queue);
+}
+
+static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
+{
+ switch (zone->cond) {
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ return zone->wp - zone->start;
+ case BLK_ZONE_COND_FULL:
+ return zone->len;
+ case BLK_ZONE_COND_EMPTY:
+ return 0;
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ default:
+ /*
+ * Conventional, offline and read-only zones do not have a valid
+ * write pointer.
+ */
+ return UINT_MAX;
+ }
+}
+
+static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
+ unsigned int idx, void *data)
+{
+ struct blk_zone *zonep = data;
+
+ *zonep = *zone;
+ return 0;
+}
+
+static void disk_zone_wplug_handle_error(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ sector_t zone_start_sector =
+ bdev_zone_sectors(disk->part0) * zwplug->zone_no;
+ unsigned int noio_flag;
+ struct blk_zone zone;
+ unsigned long flags;
+ int ret;
+
+ /* Get the current zone information from the device. */
+ noio_flag = memalloc_noio_save();
+ ret = disk->fops->report_zones(disk, zone_start_sector, 1,
+ blk_zone_wplug_report_zone_cb, &zone);
+ memalloc_noio_restore(noio_flag);
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /*
+ * A zone reset or finish may have cleared the error already. In such
+ * case, do nothing as the report zones may have seen the "old" write
+ * pointer value before the reset/finish operation completed.
+ */
+ if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
+ goto unlock;
+
+ zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
+
+ if (ret != 1) {
+ /*
+ * We failed to get the zone information, meaning that something
+ * is likely really wrong with the device. Abort all remaining
+ * plugged BIOs as otherwise we could endup waiting forever on
+ * plugged BIOs to complete if there is a queue freeze on-going.
+ */
+ disk_zone_wplug_abort(zwplug);
+ goto unplug;
+ }
+
+ /* Update the zone write pointer offset. */
+ zwplug->wp_offset = blk_zone_wp_offset(&zone);
+ disk_zone_wplug_abort_unaligned(disk, zwplug);
+
+ /* Restart BIO submission if we still have any BIO left. */
+ if (!bio_list_empty(&zwplug->bio_list)) {
+ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
+ queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
+ goto unlock;
+ }
+
+unplug:
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+ if (disk_should_remove_zone_wplug(disk, zwplug))
+ disk_remove_zone_wplug(disk, zwplug);
+
+unlock:
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+static void disk_zone_wplugs_work(struct work_struct *work)
+{
+ struct gendisk *disk =
+ container_of(work, struct gendisk, zone_wplugs_work);
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+
+ while (!list_empty(&disk->zone_wplugs_err_list)) {
+ zwplug = list_first_entry(&disk->zone_wplugs_err_list,
+ struct blk_zone_wplug, link);
+ list_del_init(&zwplug->link);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+ disk_zone_wplug_handle_error(disk, zwplug);
+ disk_put_zone_wplug(zwplug);
+
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+}
+
+static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
+{
+ return 1U << disk->zone_wplugs_hash_bits;
+}
+
+void disk_init_zone_resources(struct gendisk *disk)
+{
+ spin_lock_init(&disk->zone_wplugs_lock);
+ INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
+ INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
+}
+
+/*
+ * For the size of a disk zone write plug hash table, use the size of the
+ * zone write plug mempool, which is the maximum of the disk open zones and
+ * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
+ * 9 bits. For a disk that has no limits, mempool size defaults to 128.
+ */
+#define BLK_ZONE_WPLUG_MAX_HASH_BITS 9
+#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128
+
+static int disk_alloc_zone_resources(struct gendisk *disk,
+ unsigned int pool_size)
+{
+ unsigned int i;
+
+ disk->zone_wplugs_hash_bits =
+ min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
+
+ disk->zone_wplugs_hash =
+ kcalloc(disk_zone_wplugs_hash_size(disk),
+ sizeof(struct hlist_head), GFP_KERNEL);
+ if (!disk->zone_wplugs_hash)
+ return -ENOMEM;
+
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
+ INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
+
+ disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
+ sizeof(struct blk_zone_wplug));
+ if (!disk->zone_wplugs_pool)
+ goto free_hash;
+
+ disk->zone_wplugs_wq =
+ alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
+ pool_size, disk->disk_name);
+ if (!disk->zone_wplugs_wq)
+ goto destroy_pool;
+
+ return 0;
+
+destroy_pool:
+ mempool_destroy(disk->zone_wplugs_pool);
+ disk->zone_wplugs_pool = NULL;
+free_hash:
+ kfree(disk->zone_wplugs_hash);
+ disk->zone_wplugs_hash = NULL;
+ disk->zone_wplugs_hash_bits = 0;
+ return -ENOMEM;
+}
+
+static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
+{
+ struct blk_zone_wplug *zwplug;
+ unsigned int i;
+
+ if (!disk->zone_wplugs_hash)
+ return;
+
+ /* Free all the zone write plugs we have. */
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
+ while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
+ zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
+ struct blk_zone_wplug, node);
+ atomic_inc(&zwplug->ref);
+ disk_remove_zone_wplug(disk, zwplug);
+ disk_put_zone_wplug(zwplug);
+ }
+ }
+
+ kfree(disk->zone_wplugs_hash);
+ disk->zone_wplugs_hash = NULL;
+ disk->zone_wplugs_hash_bits = 0;
+}
+
+void disk_free_zone_resources(struct gendisk *disk)
+{
+ cancel_work_sync(&disk->zone_wplugs_work);
+
+ if (disk->zone_wplugs_wq) {
+ destroy_workqueue(disk->zone_wplugs_wq);
+ disk->zone_wplugs_wq = NULL;
+ }
+
+ disk_destroy_zone_wplugs_hash_table(disk);
+
+ /*
+ * Wait for the zone write plugs to be RCU-freed before
+ * destorying the mempool.
+ */
+ rcu_barrier();
+
+ mempool_destroy(disk->zone_wplugs_pool);
+ disk->zone_wplugs_pool = NULL;
+
kfree(disk->conv_zones_bitmap);
disk->conv_zones_bitmap = NULL;
- kfree(disk->seq_zones_wlock);
- disk->seq_zones_wlock = NULL;
+ disk->zone_capacity = 0;
+ disk->nr_zones = 0;
+}
+
+static inline bool disk_need_zone_resources(struct gendisk *disk)
+{
+ /*
+ * All mq zoned devices need zone resources so that the block layer
+ * can automatically handle write BIO plugging. BIO-based device drivers
+ * (e.g. DM devices) are normally responsible for handling zone write
+ * ordering and do not need zone resources, unless the driver requires
+ * zone append emulation.
+ */
+ return queue_is_mq(disk->queue) ||
+ queue_emulates_zone_append(disk->queue);
+}
+
+static int disk_revalidate_zone_resources(struct gendisk *disk,
+ unsigned int nr_zones)
+{
+ struct queue_limits *lim = &disk->queue->limits;
+ unsigned int pool_size;
+
+ if (!disk_need_zone_resources(disk))
+ return 0;
+
+ /*
+ * If the device has no limit on the maximum number of open and active
+ * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
+ */
+ pool_size = max(lim->max_open_zones, lim->max_active_zones);
+ if (!pool_size)
+ pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);
+
+ if (!disk->zone_wplugs_hash)
+ return disk_alloc_zone_resources(disk, pool_size);
+
+ /* Resize the zone write plug memory pool if needed. */
+ if (disk->zone_wplugs_pool->min_nr != pool_size)
+ return mempool_resize(disk->zone_wplugs_pool, pool_size);
+
+ return 0;
}
struct blk_revalidate_zone_args {
struct gendisk *disk;
unsigned long *conv_zones_bitmap;
- unsigned long *seq_zones_wlock;
unsigned int nr_zones;
+ unsigned int zone_capacity;
sector_t sector;
};
/*
+ * Update the disk zone resources information and device queue limits.
+ * The disk queue is frozen when this is executed.
+ */
+static int disk_update_zone_resources(struct gendisk *disk,
+ struct blk_revalidate_zone_args *args)
+{
+ struct request_queue *q = disk->queue;
+ struct queue_limits lim;
+
+ disk->nr_zones = args->nr_zones;
+ disk->zone_capacity = args->zone_capacity;
+ swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
+
+ /*
+ * If the device has no limit on the maximum number of open and active
+ * zones, set its max open zone limit to the mempool size to indicate
+ * to the user that there is a potential performance impact due to
+ * dynamic zone write plug allocation when simultaneously writing to
+ * more zones than the size of the mempool.
+ */
+ if (disk->zone_wplugs_pool) {
+ lim = queue_limits_start_update(q);
+ if (!lim.max_open_zones && !lim.max_active_zones)
+ lim.max_open_zones = disk->zone_wplugs_pool->min_nr;
+ return queue_limits_commit_update(q, &lim);
+ }
+
+ return 0;
+}
+
+/*
* Helper function to check the validity of zones of a zoned block device.
*/
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
@@ -452,6 +1570,9 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
struct request_queue *q = disk->queue;
sector_t capacity = get_capacity(disk);
sector_t zone_sectors = q->limits.chunk_sectors;
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+ unsigned int wp_offset;
/* Check for bad zones and holes in the zone report */
if (zone->start != args->sector) {
@@ -482,9 +1603,23 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENODEV;
}
+ if (!zone->capacity || zone->capacity > zone->len) {
+ pr_warn("%s: Invalid zone capacity\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
/* Check zone type */
switch (zone->type) {
case BLK_ZONE_TYPE_CONVENTIONAL:
+ if (zone->capacity != zone->len) {
+ pr_warn("%s: Invalid conventional zone capacity\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
+ if (!disk_need_zone_resources(disk))
+ break;
if (!args->conv_zones_bitmap) {
args->conv_zones_bitmap =
blk_alloc_zone_bitmap(q->node, args->nr_zones);
@@ -494,12 +1629,34 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
set_bit(idx, args->conv_zones_bitmap);
break;
case BLK_ZONE_TYPE_SEQWRITE_REQ:
- if (!args->seq_zones_wlock) {
- args->seq_zones_wlock =
- blk_alloc_zone_bitmap(q->node, args->nr_zones);
- if (!args->seq_zones_wlock)
+ /*
+ * Remember the capacity of the first sequential zone and check
+ * if it is constant for all zones.
+ */
+ if (!args->zone_capacity)
+ args->zone_capacity = zone->capacity;
+ if (zone->capacity != args->zone_capacity) {
+ pr_warn("%s: Invalid variable zone capacity\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
+ /*
+ * We need to track the write pointer of all zones that are not
+ * empty nor full. So make sure we have a zone write plug for
+ * such zone if the device has a zone write plug hash table.
+ */
+ wp_offset = blk_zone_wp_offset(zone);
+ if (disk->zone_wplugs_hash &&
+ wp_offset && wp_offset < zone_sectors) {
+ zwplug = disk_get_and_lock_zone_wplug(disk, zone->start,
+ GFP_NOIO, &flags);
+ if (!zwplug)
return -ENOMEM;
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ disk_put_zone_wplug(zwplug);
}
+
break;
case BLK_ZONE_TYPE_SEQWRITE_PREF:
default:
@@ -513,35 +1670,29 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
}
/**
- * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
+ * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
* @disk: Target disk
- * @update_driver_data: Callback to update driver data on the frozen disk
*
- * Helper function for low-level device drivers to check and (re) allocate and
- * initialize a disk request queue zone bitmaps. This functions should normally
- * be called within the disk ->revalidate method for blk-mq based drivers.
+ * Helper function for low-level device drivers to check, (re) allocate and
+ * initialize resources used for managing zoned disks. This function should
+ * normally be called by blk-mq based drivers when a zoned gendisk is probed
+ * and when the zone configuration of the gendisk changes (e.g. after a format).
* Before calling this function, the device driver must already have set the
* device zone size (chunk_sector limit) and the max zone append limit.
- * For BIO based drivers, this function cannot be used. BIO based device drivers
- * only need to set disk->nr_zones so that the sysfs exposed value is correct.
- * If the @update_driver_data callback function is not NULL, the callback is
- * executed with the device request queue frozen after all zones have been
- * checked.
+ * BIO based drivers can also use this function as long as the device queue
+ * can be safely frozen.
*/
-int blk_revalidate_disk_zones(struct gendisk *disk,
- void (*update_driver_data)(struct gendisk *disk))
+int blk_revalidate_disk_zones(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
sector_t zone_sectors = q->limits.chunk_sectors;
sector_t capacity = get_capacity(disk);
struct blk_revalidate_zone_args args = { };
unsigned int noio_flag;
- int ret;
+ int ret = -ENOMEM;
if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
return -EIO;
- if (WARN_ON_ONCE(!queue_is_mq(q)))
- return -EIO;
if (!capacity)
return -ENODEV;
@@ -556,7 +1707,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
return -ENODEV;
}
- if (!q->limits.max_zone_append_sectors) {
+ if (!queue_max_zone_append_sectors(q)) {
pr_warn("%s: Invalid 0 maximum zone append limit\n",
disk->disk_name);
return -ENODEV;
@@ -569,6 +1720,11 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
args.disk = disk;
args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
noio_flag = memalloc_noio_save();
+ ret = disk_revalidate_zone_resources(disk, args.nr_zones);
+ if (ret) {
+ memalloc_noio_restore(noio_flag);
+ return ret;
+ }
ret = disk->fops->report_zones(disk, 0, UINT_MAX,
blk_revalidate_zone_cb, &args);
if (!ret) {
@@ -588,26 +1744,59 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
}
/*
- * Install the new bitmaps and update nr_zones only once the queue is
- * stopped and all I/Os are completed (i.e. a scheduler is not
- * referencing the bitmaps).
+ * Set the new disk zone parameters only once the queue is frozen and
+ * all I/Os are completed.
*/
blk_mq_freeze_queue(q);
- if (ret > 0) {
- disk->nr_zones = args.nr_zones;
- swap(disk->seq_zones_wlock, args.seq_zones_wlock);
- swap(disk->conv_zones_bitmap, args.conv_zones_bitmap);
- if (update_driver_data)
- update_driver_data(disk);
- ret = 0;
- } else {
+ if (ret > 0)
+ ret = disk_update_zone_resources(disk, &args);
+ else
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
- disk_free_zone_bitmaps(disk);
- }
+ if (ret)
+ disk_free_zone_resources(disk);
blk_mq_unfreeze_queue(q);
- kfree(args.seq_zones_wlock);
kfree(args.conv_zones_bitmap);
+
return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
+
+#ifdef CONFIG_BLK_DEBUG_FS
+
+int queue_zone_wplugs_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+ struct gendisk *disk = q->disk;
+ struct blk_zone_wplug *zwplug;
+ unsigned int zwp_wp_offset, zwp_flags;
+ unsigned int zwp_zone_no, zwp_ref;
+ unsigned int zwp_bio_list_size, i;
+ unsigned long flags;
+
+ if (!disk->zone_wplugs_hash)
+ return 0;
+
+ rcu_read_lock();
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
+ hlist_for_each_entry_rcu(zwplug,
+ &disk->zone_wplugs_hash[i], node) {
+ spin_lock_irqsave(&zwplug->lock, flags);
+ zwp_zone_no = zwplug->zone_no;
+ zwp_flags = zwplug->flags;
+ zwp_ref = atomic_read(&zwplug->ref);
+ zwp_wp_offset = zwplug->wp_offset;
+ zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ seq_printf(m, "%u 0x%x %u %u %u\n",
+ zwp_zone_no, zwp_flags, zwp_ref,
+ zwp_wp_offset, zwp_bio_list_size);
+ }
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+#endif
diff --git a/block/blk.h b/block/blk.h
index d9f584984bc4..1140c4a0be03 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -269,6 +269,14 @@ static inline void bio_integrity_free(struct bio *bio)
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
+enum bio_merge_status {
+ BIO_MERGE_OK,
+ BIO_MERGE_NONE,
+ BIO_MERGE_FAILED,
+};
+
+enum bio_merge_status bio_attempt_back_merge(struct request *req,
+ struct bio *bio, unsigned int nr_segs);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
@@ -407,13 +415,85 @@ static inline struct bio *blk_queue_bounce(struct bio *bio,
}
#ifdef CONFIG_BLK_DEV_ZONED
-void disk_free_zone_bitmaps(struct gendisk *disk);
+void disk_init_zone_resources(struct gendisk *disk);
+void disk_free_zone_resources(struct gendisk *disk);
+static inline bool bio_zone_write_plugging(struct bio *bio)
+{
+ return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+}
+static inline bool bio_is_zone_append(struct bio *bio)
+{
+ return bio_op(bio) == REQ_OP_ZONE_APPEND ||
+ bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
+}
+void blk_zone_write_plug_bio_merged(struct bio *bio);
+void blk_zone_write_plug_attempt_merge(struct request *rq);
+static inline void blk_zone_update_request_bio(struct request *rq,
+ struct bio *bio)
+{
+ /*
+ * For zone append requests, the request sector indicates the location
+ * at which the BIO data was written. Return this value to the BIO
+ * issuer through the BIO iter sector.
+ * For plugged zone writes, which include emulated zone append, we need
+ * the original BIO sector so that blk_zone_write_plug_bio_endio() can
+ * lookup the zone write plug.
+ */
+ if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
+ bio->bi_iter.bi_sector = rq->__sector;
+}
+void blk_zone_write_plug_bio_endio(struct bio *bio);
+static inline void blk_zone_bio_endio(struct bio *bio)
+{
+ /*
+ * For write BIOs to zoned devices, signal the completion of the BIO so
+ * that the next write BIO can be submitted by zone write plugging.
+ */
+ if (bio_zone_write_plugging(bio))
+ blk_zone_write_plug_bio_endio(bio);
+}
+
+void blk_zone_write_plug_complete_request(struct request *rq);
+static inline void blk_zone_complete_request(struct request *rq)
+{
+ if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ blk_zone_write_plug_complete_request(rq);
+}
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
unsigned long arg);
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
#else /* CONFIG_BLK_DEV_ZONED */
-static inline void disk_free_zone_bitmaps(struct gendisk *disk) {}
+static inline void disk_init_zone_resources(struct gendisk *disk)
+{
+}
+static inline void disk_free_zone_resources(struct gendisk *disk)
+{
+}
+static inline bool bio_zone_write_plugging(struct bio *bio)
+{
+ return false;
+}
+static inline bool bio_is_zone_append(struct bio *bio)
+{
+ return false;
+}
+static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
+{
+}
+static inline void blk_zone_write_plug_attempt_merge(struct request *rq)
+{
+}
+static inline void blk_zone_update_request_bio(struct request *rq,
+ struct bio *bio)
+{
+}
+static inline void blk_zone_bio_endio(struct bio *bio)
+{
+}
+static inline void blk_zone_complete_request(struct request *rq)
+{
+}
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
unsigned int cmd, unsigned long arg)
{
diff --git a/block/elevator.c b/block/elevator.c
index 5ff093cb3cf8..f64ebd726e58 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -83,13 +83,6 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
-static inline bool elv_support_features(struct request_queue *q,
- const struct elevator_type *e)
-{
- return (q->required_elevator_features & e->elevator_features) ==
- q->required_elevator_features;
-}
-
/**
* elevator_match - Check whether @e's name or alias matches @name
* @e: Scheduler to test
@@ -120,7 +113,7 @@ static struct elevator_type *elevator_find_get(struct request_queue *q,
spin_lock(&elv_list_lock);
e = __elevator_find(name);
- if (e && (!elv_support_features(q, e) || !elevator_tryget(e)))
+ if (e && (!elevator_tryget(e)))
e = NULL;
spin_unlock(&elv_list_lock);
return e;
@@ -580,34 +573,8 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
}
/*
- * Get the first elevator providing the features required by the request queue.
- * Default to "none" if no matching elevator is found.
- */
-static struct elevator_type *elevator_get_by_features(struct request_queue *q)
-{
- struct elevator_type *e, *found = NULL;
-
- spin_lock(&elv_list_lock);
-
- list_for_each_entry(e, &elv_list, list) {
- if (elv_support_features(q, e)) {
- found = e;
- break;
- }
- }
-
- if (found && !elevator_tryget(found))
- found = NULL;
-
- spin_unlock(&elv_list_lock);
- return found;
-}
-
-/*
- * For a device queue that has no required features, use the default elevator
- * settings. Otherwise, use the first elevator available matching the required
- * features. If no suitable elevator is find or if the chosen elevator
- * initialization fails, fall back to the "none" elevator (no elevator).
+ * Use the default elevator settings. If the chosen elevator initialization
+ * fails, fall back to the "none" elevator (no elevator).
*/
void elevator_init_mq(struct request_queue *q)
{
@@ -622,10 +589,7 @@ void elevator_init_mq(struct request_queue *q)
if (unlikely(q->elevator))
return;
- if (!q->required_elevator_features)
- e = elevator_get_default(q);
- else
- e = elevator_get_by_features(q);
+ e = elevator_get_default(q);
if (!e)
return;
@@ -781,7 +745,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
list_for_each_entry(e, &elv_list, list) {
if (e == cur)
len += sprintf(name+len, "[%s] ", e->elevator_name);
- else if (elv_support_features(q, e))
+ else
len += sprintf(name+len, "%s ", e->elevator_name);
}
spin_unlock(&elv_list_lock);
diff --git a/block/elevator.h b/block/elevator.h
index 7ca3d7b6ed82..e9a050a96e53 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -74,7 +74,6 @@ struct elevator_type
struct elv_fs_entry *elevator_attrs;
const char *elevator_name;
const char *elevator_alias;
- const unsigned int elevator_features;
struct module *elevator_owner;
#ifdef CONFIG_BLK_DEBUG_FS
const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
diff --git a/block/fops.c b/block/fops.c
index 679d9b752fe8..c091ea43bca3 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -44,18 +44,15 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
#define DIO_INLINE_BIO_VECS 4
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
- struct iov_iter *iter, unsigned int nr_pages)
+ struct iov_iter *iter, struct block_device *bdev,
+ unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
loff_t pos = iocb->ki_pos;
bool should_dirty = false;
struct bio bio;
ssize_t ret;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
@@ -161,9 +158,8 @@ static void blkdev_bio_end_io(struct bio *bio)
}
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- unsigned int nr_pages)
+ struct block_device *bdev, unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
@@ -172,9 +168,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -302,9 +295,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
struct iov_iter *iter,
+ struct block_device *bdev,
unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
bool is_read = iov_iter_rw(iter) == READ;
blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
@@ -312,9 +305,6 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -368,18 +358,23 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
+ if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
+ return -EINVAL;
+
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
- return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
- return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+ return __blkdev_direct_IO_simple(iocb, iter, bdev,
+ nr_pages);
+ return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
}
- return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+ return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
}
static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
diff --git a/block/genhd.c b/block/genhd.c
index bb29a68e1d67..eb893df56d51 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1182,7 +1182,7 @@ static void disk_release(struct device *dev)
disk_release_events(disk);
kfree(disk->random);
- disk_free_zone_bitmaps(disk);
+ disk_free_zone_resources(disk);
xa_destroy(&disk->part_tbl);
disk->queue->disk = NULL;
@@ -1364,6 +1364,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (blkcg_init_disk(disk))
goto out_erase_part0;
+ disk_init_zone_resources(disk);
rand_initialize_disk(disk);
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 02a916ba62ee..94eede4fb9eb 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -102,7 +102,6 @@ struct deadline_data {
int prio_aging_expire;
spinlock_t lock;
- spinlock_t zone_lock;
};
/* Maps an I/O priority class to a deadline scheduler priority. */
@@ -129,36 +128,7 @@ static u8 dd_rq_ioclass(struct request *rq)
}
/*
- * get the request before `rq' in sector-sorted order
- */
-static inline struct request *
-deadline_earlier_request(struct request *rq)
-{
- struct rb_node *node = rb_prev(&rq->rb_node);
-
- if (node)
- return rb_entry_rq(node);
-
- return NULL;
-}
-
-/*
- * get the request after `rq' in sector-sorted order
- */
-static inline struct request *
-deadline_latter_request(struct request *rq)
-{
- struct rb_node *node = rb_next(&rq->rb_node);
-
- if (node)
- return rb_entry_rq(node);
-
- return NULL;
-}
-
-/*
- * Return the first request for which blk_rq_pos() >= @pos. For zoned devices,
- * return the first request after the start of the zone containing @pos.
+ * Return the first request for which blk_rq_pos() >= @pos.
*/
static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
enum dd_data_dir data_dir, sector_t pos)
@@ -170,14 +140,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
return NULL;
rq = rb_entry_rq(node);
- /*
- * A zoned write may have been requeued with a starting position that
- * is below that of the most recently dispatched request. Hence, for
- * zoned writes, start searching from the start of a zone.
- */
- if (blk_rq_is_seq_zoned_write(rq))
- pos = round_down(pos, rq->q->limits.chunk_sectors);
-
while (node) {
rq = rb_entry_rq(node);
if (blk_rq_pos(rq) >= pos) {
@@ -309,36 +271,6 @@ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
}
/*
- * Check if rq has a sequential request preceding it.
- */
-static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq)
-{
- struct request *prev = deadline_earlier_request(rq);
-
- if (!prev)
- return false;
-
- return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq);
-}
-
-/*
- * Skip all write requests that are sequential from @rq, even if we cross
- * a zone boundary.
- */
-static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
- struct request *rq)
-{
- sector_t pos = blk_rq_pos(rq);
-
- do {
- pos += blk_rq_sectors(rq);
- rq = deadline_latter_request(rq);
- } while (rq && blk_rq_pos(rq) == pos);
-
- return rq;
-}
-
-/*
* For the specified data direction, return the next request to
* dispatch using arrival ordered lists.
*/
@@ -346,40 +278,10 @@ static struct request *
deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
- struct request *rq, *rb_rq, *next;
- unsigned long flags;
-
if (list_empty(&per_prio->fifo_list[data_dir]))
return NULL;
- rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
- if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
- return rq;
-
- /*
- * Look for a write request that can be dispatched, that is one with
- * an unlocked target zone. For some HDDs, breaking a sequential
- * write stream can lead to lower throughput, so make sure to preserve
- * sequential write streams, even if that stream crosses into the next
- * zones and these zones are unlocked.
- */
- spin_lock_irqsave(&dd->zone_lock, flags);
- list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE],
- queuelist) {
- /* Check whether a prior request exists for the same zone. */
- rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq));
- if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq))
- rq = rb_rq;
- if (blk_req_can_dispatch_to_zone(rq) &&
- (blk_queue_nonrot(rq->q) ||
- !deadline_is_seq_write(dd, rq)))
- goto out;
- }
- rq = NULL;
-out:
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- return rq;
+ return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
}
/*
@@ -390,36 +292,8 @@ static struct request *
deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
- struct request *rq;
- unsigned long flags;
-
- rq = deadline_from_pos(per_prio, data_dir,
- per_prio->latest_pos[data_dir]);
- if (!rq)
- return NULL;
-
- if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
- return rq;
-
- /*
- * Look for a write request that can be dispatched, that is one with
- * an unlocked target zone. For some HDDs, breaking a sequential
- * write stream can lead to lower throughput, so make sure to preserve
- * sequential write streams, even if that stream crosses into the next
- * zones and these zones are unlocked.
- */
- spin_lock_irqsave(&dd->zone_lock, flags);
- while (rq) {
- if (blk_req_can_dispatch_to_zone(rq))
- break;
- if (blk_queue_nonrot(rq->q))
- rq = deadline_latter_request(rq);
- else
- rq = deadline_skip_seq_writes(dd, rq);
- }
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- return rq;
+ return deadline_from_pos(per_prio, data_dir,
+ per_prio->latest_pos[data_dir]);
}
/*
@@ -525,10 +399,6 @@ dispatch_find_request:
rq = next_rq;
}
- /*
- * For a zoned block device, if we only have writes queued and none of
- * them can be dispatched, rq will be NULL.
- */
if (!rq)
return NULL;
@@ -549,10 +419,6 @@ done:
prio = ioprio_class_to_prio[ioprio_class];
dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
dd->per_prio[prio].stats.dispatched++;
- /*
- * If the request needs its target zone locked, do it.
- */
- blk_req_zone_write_lock(rq);
rq->rq_flags |= RQF_STARTED;
return rq;
}
@@ -722,7 +588,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
dd->fifo_batch = fifo_batch;
dd->prio_aging_expire = prio_aging_expire;
spin_lock_init(&dd->lock);
- spin_lock_init(&dd->zone_lock);
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
@@ -804,12 +669,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
lockdep_assert_held(&dd->lock);
- /*
- * This may be a requeue of a write request that has locked its
- * target zone. If it is the case, this releases the zone lock.
- */
- blk_req_zone_write_unlock(rq);
-
prio = ioprio_class_to_prio[ioprio_class];
per_prio = &dd->per_prio[prio];
if (!rq->elv.priv[0]) {
@@ -841,18 +700,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
*/
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
insert_before = &per_prio->fifo_list[data_dir];
-#ifdef CONFIG_BLK_DEV_ZONED
- /*
- * Insert zoned writes such that requests are sorted by
- * position per zone.
- */
- if (blk_rq_is_seq_zoned_write(rq)) {
- struct request *rq2 = deadline_latter_request(rq);
-
- if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq))
- insert_before = &rq2->queuelist;
- }
-#endif
list_add_tail(&rq->queuelist, insert_before);
}
}
@@ -887,33 +734,8 @@ static void dd_prepare_request(struct request *rq)
rq->elv.priv[0] = NULL;
}
-static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx)
-{
- struct deadline_data *dd = hctx->queue->elevator->elevator_data;
- enum dd_prio p;
-
- for (p = 0; p <= DD_PRIO_MAX; p++)
- if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE]))
- return true;
-
- return false;
-}
-
/*
* Callback from inside blk_mq_free_request().
- *
- * For zoned block devices, write unlock the target zone of
- * completed write requests. Do this while holding the zone lock
- * spinlock so that the zone is never unlocked while deadline_fifo_request()
- * or deadline_next_request() are executing. This function is called for
- * all requests, whether or not these requests complete successfully.
- *
- * For a zoned block device, __dd_dispatch_request() may have stopped
- * dispatching requests if all the queued requests are write requests directed
- * at zones that are already locked due to on-going write requests. To ensure
- * write request dispatch progress in this case, mark the queue as needing a
- * restart to ensure that the queue is run again after completion of the
- * request and zones being unlocked.
*/
static void dd_finish_request(struct request *rq)
{
@@ -928,21 +750,8 @@ static void dd_finish_request(struct request *rq)
* called dd_insert_requests(). Skip requests that bypassed I/O
* scheduling. See also blk_mq_request_bypass_insert().
*/
- if (!rq->elv.priv[0])
- return;
-
- atomic_inc(&per_prio->stats.completed);
-
- if (blk_queue_is_zoned(q)) {
- unsigned long flags;
-
- spin_lock_irqsave(&dd->zone_lock, flags);
- blk_req_zone_write_unlock(rq);
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- if (dd_has_write_work(rq->mq_hctx))
- blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
- }
+ if (rq->elv.priv[0])
+ atomic_inc(&per_prio->stats.completed);
}
static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
@@ -1266,7 +1075,6 @@ static struct elevator_type mq_deadline = {
.elevator_attrs = deadline_attrs,
.elevator_name = "mq-deadline",
.elevator_alias = "deadline",
- .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
.elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 71c39bcd872c..bccee438b4b7 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -225,6 +225,10 @@ static unsigned long g_cache_size;
module_param_named(cache_size, g_cache_size, ulong, 0444);
MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)");
+static bool g_fua = true;
+module_param_named(fua, g_fua, bool, 0444);
+MODULE_PARM_DESC(zoned, "Enable/disable FUA support when cache_size is used. Default: true");
+
static unsigned int g_mbps;
module_param_named(mbps, g_mbps, uint, 0444);
MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
@@ -253,6 +257,11 @@ static unsigned int g_zone_max_active;
module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
+static int g_zone_append_max_sectors = INT_MAX;
+module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444);
+MODULE_PARM_DESC(zone_append_max_sectors,
+ "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation");
+
static struct nullb_device *null_alloc_dev(void);
static void null_free_dev(struct nullb_device *dev);
static void null_del_dev(struct nullb *nullb);
@@ -436,10 +445,12 @@ NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
+NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL);
NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
NULLB_DEVICE_ATTR(no_sched, bool, NULL);
NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
+NULLB_DEVICE_ATTR(fua, bool, NULL);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
@@ -580,12 +591,14 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_zone_nr_conv,
&nullb_device_attr_zone_max_open,
&nullb_device_attr_zone_max_active,
+ &nullb_device_attr_zone_append_max_sectors,
&nullb_device_attr_zone_readonly,
&nullb_device_attr_zone_offline,
&nullb_device_attr_virt_boundary,
&nullb_device_attr_no_sched,
&nullb_device_attr_shared_tags,
&nullb_device_attr_shared_tag_bitmap,
+ &nullb_device_attr_fua,
NULL,
};
@@ -664,14 +677,14 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
return snprintf(page, PAGE_SIZE,
- "badblocks,blocking,blocksize,cache_size,"
+ "badblocks,blocking,blocksize,cache_size,fua,"
"completion_nsec,discard,home_node,hw_queue_depth,"
"irqmode,max_sectors,mbps,memory_backed,no_sched,"
"poll_queues,power,queue_mode,shared_tag_bitmap,"
"shared_tags,size,submit_queues,use_per_node_hctx,"
"virt_boundary,zoned,zone_capacity,zone_max_active,"
"zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
- "zone_size\n");
+ "zone_size,zone_append_max_sectors\n");
}
CONFIGFS_ATTR_RO(memb_group_, features);
@@ -751,10 +764,13 @@ static struct nullb_device *null_alloc_dev(void)
dev->zone_nr_conv = g_zone_nr_conv;
dev->zone_max_open = g_zone_max_open;
dev->zone_max_active = g_zone_max_active;
+ dev->zone_append_max_sectors = g_zone_append_max_sectors;
dev->virt_boundary = g_virt_boundary;
dev->no_sched = g_no_sched;
dev->shared_tags = g_shared_tags;
dev->shared_tag_bitmap = g_shared_tag_bitmap;
+ dev->fua = g_fua;
+
return dev;
}
@@ -1151,7 +1167,7 @@ blk_status_t null_handle_discard(struct nullb_device *dev,
return BLK_STS_OK;
}
-static int null_handle_flush(struct nullb *nullb)
+static blk_status_t null_handle_flush(struct nullb *nullb)
{
int err;
@@ -1168,7 +1184,7 @@ static int null_handle_flush(struct nullb *nullb)
WARN_ON(!radix_tree_empty(&nullb->dev->cache));
spin_unlock_irq(&nullb->lock);
- return err;
+ return errno_to_blk_status(err);
}
static int null_transfer(struct nullb *nullb, struct page *page,
@@ -1206,7 +1222,7 @@ static int null_handle_rq(struct nullb_cmd *cmd)
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct nullb *nullb = cmd->nq->dev->nullb;
- int err;
+ int err = 0;
unsigned int len;
sector_t sector = blk_rq_pos(rq);
struct req_iterator iter;
@@ -1218,15 +1234,13 @@ static int null_handle_rq(struct nullb_cmd *cmd)
err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
op_is_write(req_op(rq)), sector,
rq->cmd_flags & REQ_FUA);
- if (err) {
- spin_unlock_irq(&nullb->lock);
- return err;
- }
+ if (err)
+ break;
sector += len >> SECTOR_SHIFT;
}
spin_unlock_irq(&nullb->lock);
- return 0;
+ return errno_to_blk_status(err);
}
static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
@@ -1273,8 +1287,8 @@ static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
if (op == REQ_OP_DISCARD)
return null_handle_discard(dev, sector, nr_sectors);
- return errno_to_blk_status(null_handle_rq(cmd));
+ return null_handle_rq(cmd);
}
static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
@@ -1343,7 +1357,7 @@ static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
blk_status_t sts;
if (op == REQ_OP_FLUSH) {
- cmd->error = errno_to_blk_status(null_handle_flush(nullb));
+ cmd->error = null_handle_flush(nullb);
goto out;
}
@@ -1912,7 +1926,7 @@ static int null_add_dev(struct nullb_device *dev)
if (dev->cache_size > 0) {
set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
- blk_queue_write_cache(nullb->q, true, true);
+ blk_queue_write_cache(nullb->q, true, dev->fua);
}
nullb->q->queuedata = nullb;
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 477b97746823..3234e6c85eed 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -82,6 +82,7 @@ struct nullb_device {
unsigned int zone_nr_conv; /* number of conventional zones */
unsigned int zone_max_open; /* max number of open zones */
unsigned int zone_max_active; /* max number of active zones */
+ unsigned int zone_append_max_sectors; /* Max sectors per zone append command */
unsigned int submit_queues; /* number of submission queues */
unsigned int prev_submit_queues; /* number of submission queues before change */
unsigned int poll_queues; /* number of IOPOLL submission queues */
@@ -104,6 +105,7 @@ struct nullb_device {
bool no_sched; /* no IO scheduler for the device */
bool shared_tags; /* share tag set between devices for blk-mq */
bool shared_tag_bitmap; /* use hostwide shared tags */
+ bool fua; /* Support FUA */
};
struct nullb {
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 1689e2584104..5b5a63adacc1 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -9,6 +9,8 @@
#undef pr_fmt
#define pr_fmt(fmt) "null_blk: " fmt
+#define NULL_ZONE_INVALID_WP ((sector_t)-1)
+
static inline sector_t mb_to_sects(unsigned long mb)
{
return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT;
@@ -19,18 +21,6 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
return sect >> ilog2(dev->zone_size_sects);
}
-static inline void null_lock_zone_res(struct nullb_device *dev)
-{
- if (dev->need_zone_res_mgmt)
- spin_lock_irq(&dev->zone_res_lock);
-}
-
-static inline void null_unlock_zone_res(struct nullb_device *dev)
-{
- if (dev->need_zone_res_mgmt)
- spin_unlock_irq(&dev->zone_res_lock);
-}
-
static inline void null_init_zone_lock(struct nullb_device *dev,
struct nullb_zone *zone)
{
@@ -103,6 +93,11 @@ int null_init_zoned_dev(struct nullb_device *dev,
dev->zone_nr_conv);
}
+ dev->zone_append_max_sectors =
+ min(ALIGN_DOWN(dev->zone_append_max_sectors,
+ dev->blocksize >> SECTOR_SHIFT),
+ zone_capacity_sects);
+
/* Max active zones has to be < nbr of seq zones in order to be enforceable */
if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) {
dev->zone_max_active = 0;
@@ -154,7 +149,7 @@ int null_init_zoned_dev(struct nullb_device *dev,
lim->zoned = true;
lim->chunk_sectors = dev->zone_size_sects;
- lim->max_zone_append_sectors = dev->zone_size_sects;
+ lim->max_zone_append_sectors = dev->zone_append_max_sectors;
lim->max_open_zones = dev->zone_max_open;
lim->max_active_zones = dev->zone_max_active;
return 0;
@@ -163,11 +158,16 @@ int null_init_zoned_dev(struct nullb_device *dev,
int null_register_zoned_dev(struct nullb *nullb)
{
struct request_queue *q = nullb->q;
+ struct gendisk *disk = nullb->disk;
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
- blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
- nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0);
- return blk_revalidate_disk_zones(nullb->disk, NULL);
+ disk->nr_zones = bdev_nr_zones(disk->part0);
+
+ pr_info("%s: using %s zone append\n",
+ disk->disk_name,
+ queue_emulates_zone_append(q) ? "emulated" : "native");
+
+ return blk_revalidate_disk_zones(disk);
}
void null_free_zoned_dev(struct nullb_device *dev)
@@ -241,35 +241,6 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
return (zone->wp - sector) << SECTOR_SHIFT;
}
-static blk_status_t __null_close_zone(struct nullb_device *dev,
- struct nullb_zone *zone)
-{
- switch (zone->cond) {
- case BLK_ZONE_COND_CLOSED:
- /* close operation on closed is not an error */
- return BLK_STS_OK;
- case BLK_ZONE_COND_IMP_OPEN:
- dev->nr_zones_imp_open--;
- break;
- case BLK_ZONE_COND_EXP_OPEN:
- dev->nr_zones_exp_open--;
- break;
- case BLK_ZONE_COND_EMPTY:
- case BLK_ZONE_COND_FULL:
- default:
- return BLK_STS_IOERR;
- }
-
- if (zone->wp == zone->start) {
- zone->cond = BLK_ZONE_COND_EMPTY;
- } else {
- zone->cond = BLK_ZONE_COND_CLOSED;
- dev->nr_zones_closed++;
- }
-
- return BLK_STS_OK;
-}
-
static void null_close_imp_open_zone(struct nullb_device *dev)
{
struct nullb_zone *zone;
@@ -286,7 +257,13 @@ static void null_close_imp_open_zone(struct nullb_device *dev)
zno = dev->zone_nr_conv;
if (zone->cond == BLK_ZONE_COND_IMP_OPEN) {
- __null_close_zone(dev, zone);
+ dev->nr_zones_imp_open--;
+ if (zone->wp == zone->start) {
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ } else {
+ zone->cond = BLK_ZONE_COND_CLOSED;
+ dev->nr_zones_closed++;
+ }
dev->imp_close_zone_no = zno;
return;
}
@@ -374,73 +351,73 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
null_lock_zone(dev, zone);
- if (zone->cond == BLK_ZONE_COND_FULL ||
- zone->cond == BLK_ZONE_COND_READONLY ||
- zone->cond == BLK_ZONE_COND_OFFLINE) {
- /* Cannot write to the zone */
- ret = BLK_STS_IOERR;
- goto unlock;
- }
-
/*
- * Regular writes must be at the write pointer position.
- * Zone append writes are automatically issued at the write
- * pointer and the position returned using the request or BIO
- * sector.
+ * Regular writes must be at the write pointer position. Zone append
+ * writes are automatically issued at the write pointer and the position
+ * returned using the request sector. Note that we do not check the zone
+ * condition because for FULL, READONLY and OFFLINE zones, the sector
+ * check against the zone write pointer will always result in failing
+ * the command.
*/
if (append) {
+ if (WARN_ON_ONCE(!dev->zone_append_max_sectors) ||
+ zone->wp == NULL_ZONE_INVALID_WP) {
+ ret = BLK_STS_IOERR;
+ goto unlock_zone;
+ }
sector = zone->wp;
blk_mq_rq_from_pdu(cmd)->__sector = sector;
- } else if (sector != zone->wp) {
- ret = BLK_STS_IOERR;
- goto unlock;
}
- if (zone->wp + nr_sectors > zone->start + zone->capacity) {
+ if (sector != zone->wp ||
+ zone->wp + nr_sectors > zone->start + zone->capacity) {
ret = BLK_STS_IOERR;
- goto unlock;
+ goto unlock_zone;
}
if (zone->cond == BLK_ZONE_COND_CLOSED ||
zone->cond == BLK_ZONE_COND_EMPTY) {
- null_lock_zone_res(dev);
+ if (dev->need_zone_res_mgmt) {
+ spin_lock(&dev->zone_res_lock);
- ret = null_check_zone_resources(dev, zone);
- if (ret != BLK_STS_OK) {
- null_unlock_zone_res(dev);
- goto unlock;
- }
- if (zone->cond == BLK_ZONE_COND_CLOSED) {
- dev->nr_zones_closed--;
- dev->nr_zones_imp_open++;
- } else if (zone->cond == BLK_ZONE_COND_EMPTY) {
- dev->nr_zones_imp_open++;
- }
+ ret = null_check_zone_resources(dev, zone);
+ if (ret != BLK_STS_OK) {
+ spin_unlock(&dev->zone_res_lock);
+ goto unlock_zone;
+ }
+ if (zone->cond == BLK_ZONE_COND_CLOSED) {
+ dev->nr_zones_closed--;
+ dev->nr_zones_imp_open++;
+ } else if (zone->cond == BLK_ZONE_COND_EMPTY) {
+ dev->nr_zones_imp_open++;
+ }
- if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
- zone->cond = BLK_ZONE_COND_IMP_OPEN;
+ spin_unlock(&dev->zone_res_lock);
+ }
- null_unlock_zone_res(dev);
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
}
ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
if (ret != BLK_STS_OK)
- goto unlock;
+ goto unlock_zone;
zone->wp += nr_sectors;
if (zone->wp == zone->start + zone->capacity) {
- null_lock_zone_res(dev);
- if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
- dev->nr_zones_exp_open--;
- else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
- dev->nr_zones_imp_open--;
+ if (dev->need_zone_res_mgmt) {
+ spin_lock(&dev->zone_res_lock);
+ if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
+ dev->nr_zones_exp_open--;
+ else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
+ dev->nr_zones_imp_open--;
+ spin_unlock(&dev->zone_res_lock);
+ }
zone->cond = BLK_ZONE_COND_FULL;
- null_unlock_zone_res(dev);
}
ret = BLK_STS_OK;
-unlock:
+unlock_zone:
null_unlock_zone(dev, zone);
return ret;
@@ -454,54 +431,100 @@ static blk_status_t null_open_zone(struct nullb_device *dev,
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
return BLK_STS_IOERR;
- null_lock_zone_res(dev);
-
switch (zone->cond) {
case BLK_ZONE_COND_EXP_OPEN:
- /* open operation on exp open is not an error */
- goto unlock;
+ /* Open operation on exp open is not an error */
+ return BLK_STS_OK;
case BLK_ZONE_COND_EMPTY:
- ret = null_check_zone_resources(dev, zone);
- if (ret != BLK_STS_OK)
- goto unlock;
- break;
case BLK_ZONE_COND_IMP_OPEN:
- dev->nr_zones_imp_open--;
- break;
case BLK_ZONE_COND_CLOSED:
- ret = null_check_zone_resources(dev, zone);
- if (ret != BLK_STS_OK)
- goto unlock;
- dev->nr_zones_closed--;
break;
case BLK_ZONE_COND_FULL:
default:
- ret = BLK_STS_IOERR;
- goto unlock;
+ return BLK_STS_IOERR;
}
- zone->cond = BLK_ZONE_COND_EXP_OPEN;
- dev->nr_zones_exp_open++;
+ if (dev->need_zone_res_mgmt) {
+ spin_lock(&dev->zone_res_lock);
-unlock:
- null_unlock_zone_res(dev);
+ switch (zone->cond) {
+ case BLK_ZONE_COND_EMPTY:
+ ret = null_check_zone_resources(dev, zone);
+ if (ret != BLK_STS_OK) {
+ spin_unlock(&dev->zone_res_lock);
+ return ret;
+ }
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ dev->nr_zones_imp_open--;
+ break;
+ case BLK_ZONE_COND_CLOSED:
+ ret = null_check_zone_resources(dev, zone);
+ if (ret != BLK_STS_OK) {
+ spin_unlock(&dev->zone_res_lock);
+ return ret;
+ }
+ dev->nr_zones_closed--;
+ break;
+ default:
+ break;
+ }
- return ret;
+ dev->nr_zones_exp_open++;
+
+ spin_unlock(&dev->zone_res_lock);
+ }
+
+ zone->cond = BLK_ZONE_COND_EXP_OPEN;
+
+ return BLK_STS_OK;
}
static blk_status_t null_close_zone(struct nullb_device *dev,
struct nullb_zone *zone)
{
- blk_status_t ret;
-
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
return BLK_STS_IOERR;
- null_lock_zone_res(dev);
- ret = __null_close_zone(dev, zone);
- null_unlock_zone_res(dev);
+ switch (zone->cond) {
+ case BLK_ZONE_COND_CLOSED:
+ /* close operation on closed is not an error */
+ return BLK_STS_OK;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_FULL:
+ default:
+ return BLK_STS_IOERR;
+ }
+
+ if (dev->need_zone_res_mgmt) {
+ spin_lock(&dev->zone_res_lock);
- return ret;
+ switch (zone->cond) {
+ case BLK_ZONE_COND_IMP_OPEN:
+ dev->nr_zones_imp_open--;
+ break;
+ case BLK_ZONE_COND_EXP_OPEN:
+ dev->nr_zones_exp_open--;
+ break;
+ default:
+ break;
+ }
+
+ if (zone->wp > zone->start)
+ dev->nr_zones_closed++;
+
+ spin_unlock(&dev->zone_res_lock);
+ }
+
+ if (zone->wp == zone->start)
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ else
+ zone->cond = BLK_ZONE_COND_CLOSED;
+
+ return BLK_STS_OK;
}
static blk_status_t null_finish_zone(struct nullb_device *dev,
@@ -512,41 +535,47 @@ static blk_status_t null_finish_zone(struct nullb_device *dev,
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
return BLK_STS_IOERR;
- null_lock_zone_res(dev);
+ if (dev->need_zone_res_mgmt) {
+ spin_lock(&dev->zone_res_lock);
- switch (zone->cond) {
- case BLK_ZONE_COND_FULL:
- /* finish operation on full is not an error */
- goto unlock;
- case BLK_ZONE_COND_EMPTY:
- ret = null_check_zone_resources(dev, zone);
- if (ret != BLK_STS_OK)
- goto unlock;
- break;
- case BLK_ZONE_COND_IMP_OPEN:
- dev->nr_zones_imp_open--;
- break;
- case BLK_ZONE_COND_EXP_OPEN:
- dev->nr_zones_exp_open--;
- break;
- case BLK_ZONE_COND_CLOSED:
- ret = null_check_zone_resources(dev, zone);
- if (ret != BLK_STS_OK)
- goto unlock;
- dev->nr_zones_closed--;
- break;
- default:
- ret = BLK_STS_IOERR;
- goto unlock;
+ switch (zone->cond) {
+ case BLK_ZONE_COND_FULL:
+ /* Finish operation on full is not an error */
+ spin_unlock(&dev->zone_res_lock);
+ return BLK_STS_OK;
+ case BLK_ZONE_COND_EMPTY:
+ ret = null_check_zone_resources(dev, zone);
+ if (ret != BLK_STS_OK) {
+ spin_unlock(&dev->zone_res_lock);
+ return ret;
+ }
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ dev->nr_zones_imp_open--;
+ break;
+ case BLK_ZONE_COND_EXP_OPEN:
+ dev->nr_zones_exp_open--;
+ break;
+ case BLK_ZONE_COND_CLOSED:
+ ret = null_check_zone_resources(dev, zone);
+ if (ret != BLK_STS_OK) {
+ spin_unlock(&dev->zone_res_lock);
+ return ret;
+ }
+ dev->nr_zones_closed--;
+ break;
+ default:
+ spin_unlock(&dev->zone_res_lock);
+ return BLK_STS_IOERR;
+ }
+
+ spin_unlock(&dev->zone_res_lock);
}
zone->cond = BLK_ZONE_COND_FULL;
zone->wp = zone->start + zone->len;
-unlock:
- null_unlock_zone_res(dev);
-
- return ret;
+ return BLK_STS_OK;
}
static blk_status_t null_reset_zone(struct nullb_device *dev,
@@ -555,34 +584,33 @@ static blk_status_t null_reset_zone(struct nullb_device *dev,
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
return BLK_STS_IOERR;
- null_lock_zone_res(dev);
+ if (dev->need_zone_res_mgmt) {
+ spin_lock(&dev->zone_res_lock);
- switch (zone->cond) {
- case BLK_ZONE_COND_EMPTY:
- /* reset operation on empty is not an error */
- null_unlock_zone_res(dev);
- return BLK_STS_OK;
- case BLK_ZONE_COND_IMP_OPEN:
- dev->nr_zones_imp_open--;
- break;
- case BLK_ZONE_COND_EXP_OPEN:
- dev->nr_zones_exp_open--;
- break;
- case BLK_ZONE_COND_CLOSED:
- dev->nr_zones_closed--;
- break;
- case BLK_ZONE_COND_FULL:
- break;
- default:
- null_unlock_zone_res(dev);
- return BLK_STS_IOERR;
+ switch (zone->cond) {
+ case BLK_ZONE_COND_IMP_OPEN:
+ dev->nr_zones_imp_open--;
+ break;
+ case BLK_ZONE_COND_EXP_OPEN:
+ dev->nr_zones_exp_open--;
+ break;
+ case BLK_ZONE_COND_CLOSED:
+ dev->nr_zones_closed--;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_FULL:
+ break;
+ default:
+ spin_unlock(&dev->zone_res_lock);
+ return BLK_STS_IOERR;
+ }
+
+ spin_unlock(&dev->zone_res_lock);
}
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
- null_unlock_zone_res(dev);
-
if (dev->memory_backed)
return null_handle_discard(dev, zone->start, zone->len);
@@ -711,7 +739,7 @@ static void null_set_zone_cond(struct nullb_device *dev,
zone->cond != BLK_ZONE_COND_OFFLINE)
null_finish_zone(dev, zone);
zone->cond = cond;
- zone->wp = (sector_t)-1;
+ zone->wp = NULL_ZONE_INVALID_WP;
}
null_unlock_zone(dev, zone);
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index bea3d5cf8a83..851c78913de2 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -221,7 +221,7 @@ static int ublk_get_nr_zones(const struct ublk_device *ub)
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
- return blk_revalidate_disk_zones(ub->ub_disk, NULL);
+ return blk_revalidate_disk_zones(ub->ub_disk);
}
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
@@ -249,8 +249,7 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
- blk_queue_required_elevator_features(ub->ub_disk->queue,
- ELEVATOR_F_ZBD_SEQ_WRITE);
+
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
}
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 42dea7601d87..c1af0a7d56c8 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1543,7 +1543,7 @@ static int virtblk_probe(struct virtio_device *vdev)
*/
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) {
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue);
- err = blk_revalidate_disk_zones(vblk->disk, NULL);
+ err = blk_revalidate_disk_zones(vblk->disk);
if (err)
goto out_cleanup_disk;
}
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index e6757a30dcca..08700bfc3e23 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -140,7 +140,7 @@ struct mapped_device {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int nr_zones;
- unsigned int *zwp_offset;
+ void *zone_revalidate_map;
#endif
#ifdef CONFIG_IMA
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index eb9832b22b14..d17ae4486a6a 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -60,16 +60,23 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
struct dm_table *map;
int srcu_idx, ret;
- if (dm_suspended_md(md))
- return -EAGAIN;
+ if (!md->zone_revalidate_map) {
+ /* Regular user context */
+ if (dm_suspended_md(md))
+ return -EAGAIN;
- map = dm_get_live_table(md, &srcu_idx);
- if (!map)
- return -EIO;
+ map = dm_get_live_table(md, &srcu_idx);
+ if (!map)
+ return -EIO;
+ } else {
+ /* Zone revalidation during __bind() */
+ map = md->zone_revalidate_map;
+ }
ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);
- dm_put_live_table(md, srcu_idx);
+ if (!md->zone_revalidate_map)
+ dm_put_live_table(md, srcu_idx);
return ret;
}
@@ -138,85 +145,6 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
}
}
-void dm_cleanup_zoned_dev(struct mapped_device *md)
-{
- if (md->disk) {
- bitmap_free(md->disk->conv_zones_bitmap);
- md->disk->conv_zones_bitmap = NULL;
- bitmap_free(md->disk->seq_zones_wlock);
- md->disk->seq_zones_wlock = NULL;
- }
-
- kvfree(md->zwp_offset);
- md->zwp_offset = NULL;
- md->nr_zones = 0;
-}
-
-static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone)
-{
- switch (zone->cond) {
- case BLK_ZONE_COND_IMP_OPEN:
- case BLK_ZONE_COND_EXP_OPEN:
- case BLK_ZONE_COND_CLOSED:
- return zone->wp - zone->start;
- case BLK_ZONE_COND_FULL:
- return zone->len;
- case BLK_ZONE_COND_EMPTY:
- case BLK_ZONE_COND_NOT_WP:
- case BLK_ZONE_COND_OFFLINE:
- case BLK_ZONE_COND_READONLY:
- default:
- /*
- * Conventional, offline and read-only zones do not have a valid
- * write pointer. Use 0 as for an empty zone.
- */
- return 0;
- }
-}
-
-static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
-{
- struct mapped_device *md = data;
- struct gendisk *disk = md->disk;
-
- switch (zone->type) {
- case BLK_ZONE_TYPE_CONVENTIONAL:
- if (!disk->conv_zones_bitmap) {
- disk->conv_zones_bitmap = bitmap_zalloc(disk->nr_zones,
- GFP_NOIO);
- if (!disk->conv_zones_bitmap)
- return -ENOMEM;
- }
- set_bit(idx, disk->conv_zones_bitmap);
- break;
- case BLK_ZONE_TYPE_SEQWRITE_REQ:
- case BLK_ZONE_TYPE_SEQWRITE_PREF:
- if (!disk->seq_zones_wlock) {
- disk->seq_zones_wlock = bitmap_zalloc(disk->nr_zones,
- GFP_NOIO);
- if (!disk->seq_zones_wlock)
- return -ENOMEM;
- }
- if (!md->zwp_offset) {
- md->zwp_offset =
- kvcalloc(disk->nr_zones, sizeof(unsigned int),
- GFP_KERNEL);
- if (!md->zwp_offset)
- return -ENOMEM;
- }
- md->zwp_offset[idx] = dm_get_zone_wp_offset(zone);
-
- break;
- default:
- DMERR("Invalid zone type 0x%x at sectors %llu",
- (int)zone->type, zone->start);
- return -ENODEV;
- }
-
- return 0;
-}
-
/*
* Revalidate the zones of a mapped device to initialize resource necessary
* for zone append emulation. Note that we cannot simply use the block layer
@@ -226,41 +154,32 @@ static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
{
struct gendisk *disk = md->disk;
- unsigned int noio_flag;
int ret;
- /*
- * Check if something changed. If yes, cleanup the current resources
- * and reallocate everything.
- */
+ /* Revalidate only if something changed. */
if (!disk->nr_zones || disk->nr_zones != md->nr_zones)
- dm_cleanup_zoned_dev(md);
+ md->nr_zones = 0;
+
if (md->nr_zones)
return 0;
/*
- * Scan all zones to initialize everything. Ensure that all vmalloc
- * operations in this context are done as if GFP_NOIO was specified.
+ * Our table is not live yet. So the call to dm_get_live_table()
+ * in dm_blk_report_zones() will fail. Set a temporary pointer to
+ * our table for dm_blk_report_zones() to use directly.
*/
- noio_flag = memalloc_noio_save();
- ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones,
- dm_zone_revalidate_cb, md);
- memalloc_noio_restore(noio_flag);
- if (ret < 0)
- goto err;
- if (ret != disk->nr_zones) {
- ret = -EIO;
- goto err;
+ md->zone_revalidate_map = t;
+ ret = blk_revalidate_disk_zones(disk);
+ md->zone_revalidate_map = NULL;
+
+ if (ret) {
+ DMERR("Revalidate zones failed %d", ret);
+ return ret;
}
md->nr_zones = disk->nr_zones;
return 0;
-
-err:
- DMERR("Revalidate zones failed %d", ret);
- dm_cleanup_zoned_dev(md);
- return ret;
}
static int device_not_zone_append_capable(struct dm_target *ti,
@@ -291,292 +210,27 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
struct mapped_device *md = t->md;
/*
- * For a zoned target, the number of zones should be updated for the
- * correct value to be exposed in sysfs queue/nr_zones.
+ * Check if zone append is natively supported, and if not, set the
+ * mapped device queue as needing zone append emulation.
*/
WARN_ON_ONCE(queue_is_mq(q));
- md->disk->nr_zones = bdev_nr_zones(md->disk->part0);
-
- /* Check if zone append is natively supported */
if (dm_table_supports_zone_append(t)) {
clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
- dm_cleanup_zoned_dev(md);
- return 0;
+ } else {
+ set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ blk_queue_max_zone_append_sectors(q, 0);
}
- /*
- * Mark the mapped device as needing zone append emulation and
- * initialize the emulation resources once the capacity is set.
- */
- set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
if (!get_capacity(md->disk))
return 0;
- return dm_revalidate_zones(md, t);
-}
-
-static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
-{
- unsigned int *wp_offset = data;
-
- *wp_offset = dm_get_zone_wp_offset(zone);
-
- return 0;
-}
-
-static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
- unsigned int *wp_ofst)
-{
- sector_t sector = zno * bdev_zone_sectors(md->disk->part0);
- unsigned int noio_flag;
- struct dm_table *t;
- int srcu_idx, ret;
-
- t = dm_get_live_table(md, &srcu_idx);
- if (!t)
- return -EIO;
-
- /*
- * Ensure that all memory allocations in this context are done as if
- * GFP_NOIO was specified.
- */
- noio_flag = memalloc_noio_save();
- ret = dm_blk_do_report_zones(md, t, sector, 1,
- dm_update_zone_wp_offset_cb, wp_ofst);
- memalloc_noio_restore(noio_flag);
-
- dm_put_live_table(md, srcu_idx);
-
- if (ret != 1)
- return -EIO;
-
- return 0;
-}
-
-struct orig_bio_details {
- enum req_op op;
- unsigned int nr_sectors;
-};
-
-/*
- * First phase of BIO mapping for targets with zone append emulation:
- * check all BIO that change a zone writer pointer and change zone
- * append operations into regular write operations.
- */
-static bool dm_zone_map_bio_begin(struct mapped_device *md,
- unsigned int zno, struct bio *clone)
-{
- sector_t zsectors = bdev_zone_sectors(md->disk->part0);
- unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
-
- /*
- * If the target zone is in an error state, recover by inspecting the
- * zone to get its current write pointer position. Note that since the
- * target zone is already locked, a BIO issuing context should never
- * see the zone write in the DM_ZONE_UPDATING_WP_OFST state.
- */
- if (zwp_offset == DM_ZONE_INVALID_WP_OFST) {
- if (dm_update_zone_wp_offset(md, zno, &zwp_offset))
- return false;
- WRITE_ONCE(md->zwp_offset[zno], zwp_offset);
- }
-
- switch (bio_op(clone)) {
- case REQ_OP_ZONE_RESET:
- case REQ_OP_ZONE_FINISH:
- return true;
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE:
- /* Writes must be aligned to the zone write pointer */
- if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset)
- return false;
- break;
- case REQ_OP_ZONE_APPEND:
- /*
- * Change zone append operations into a non-mergeable regular
- * writes directed at the current write pointer position of the
- * target zone.
- */
- clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
- (clone->bi_opf & (~REQ_OP_MASK));
- clone->bi_iter.bi_sector += zwp_offset;
- break;
- default:
- DMWARN_LIMIT("Invalid BIO operation");
- return false;
- }
-
- /* Cannot write to a full zone */
- if (zwp_offset >= zsectors)
- return false;
-
- return true;
-}
-
-/*
- * Second phase of BIO mapping for targets with zone append emulation:
- * update the zone write pointer offset array to account for the additional
- * data written to a zone. Note that at this point, the remapped clone BIO
- * may already have completed, so we do not touch it.
- */
-static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno,
- struct orig_bio_details *orig_bio_details,
- unsigned int nr_sectors)
-{
- unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
-
- /* The clone BIO may already have been completed and failed */
- if (zwp_offset == DM_ZONE_INVALID_WP_OFST)
- return BLK_STS_IOERR;
-
- /* Update the zone wp offset */
- switch (orig_bio_details->op) {
- case REQ_OP_ZONE_RESET:
- WRITE_ONCE(md->zwp_offset[zno], 0);
- return BLK_STS_OK;
- case REQ_OP_ZONE_FINISH:
- WRITE_ONCE(md->zwp_offset[zno],
- bdev_zone_sectors(md->disk->part0));
- return BLK_STS_OK;
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE:
- WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
- return BLK_STS_OK;
- case REQ_OP_ZONE_APPEND:
- /*
- * Check that the target did not truncate the write operation
- * emulating a zone append.
- */
- if (nr_sectors != orig_bio_details->nr_sectors) {
- DMWARN_LIMIT("Truncated write for zone append");
- return BLK_STS_IOERR;
- }
- WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
- return BLK_STS_OK;
- default:
- DMWARN_LIMIT("Invalid BIO operation");
- return BLK_STS_IOERR;
+ if (!md->disk->nr_zones) {
+ DMINFO("%s using %s zone append",
+ md->disk->disk_name,
+ queue_emulates_zone_append(q) ? "emulated" : "native");
}
-}
-
-static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno,
- struct bio *clone)
-{
- if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
- return;
-
- wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
- bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
-}
-
-static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno,
- struct bio *clone)
-{
- if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
- return;
-
- WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock));
- clear_bit_unlock(zno, disk->seq_zones_wlock);
- smp_mb__after_atomic();
- wake_up_bit(disk->seq_zones_wlock, zno);
-
- bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
-}
-static bool dm_need_zone_wp_tracking(struct bio *bio)
-{
- /*
- * Special processing is not needed for operations that do not need the
- * zone write lock, that is, all operations that target conventional
- * zones and all operations that do not modify directly a sequential
- * zone write pointer.
- */
- if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
- return false;
- switch (bio_op(bio)) {
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE:
- case REQ_OP_ZONE_RESET:
- case REQ_OP_ZONE_FINISH:
- case REQ_OP_ZONE_APPEND:
- return bio_zone_is_seq(bio);
- default:
- return false;
- }
-}
-
-/*
- * Special IO mapping for targets needing zone append emulation.
- */
-int dm_zone_map_bio(struct dm_target_io *tio)
-{
- struct dm_io *io = tio->io;
- struct dm_target *ti = tio->ti;
- struct mapped_device *md = io->md;
- struct bio *clone = &tio->clone;
- struct orig_bio_details orig_bio_details;
- unsigned int zno;
- blk_status_t sts;
- int r;
-
- /*
- * IOs that do not change a zone write pointer do not need
- * any additional special processing.
- */
- if (!dm_need_zone_wp_tracking(clone))
- return ti->type->map(ti, clone);
-
- /* Lock the target zone */
- zno = bio_zone_no(clone);
- dm_zone_lock(md->disk, zno, clone);
-
- orig_bio_details.nr_sectors = bio_sectors(clone);
- orig_bio_details.op = bio_op(clone);
-
- /*
- * Check that the bio and the target zone write pointer offset are
- * both valid, and if the bio is a zone append, remap it to a write.
- */
- if (!dm_zone_map_bio_begin(md, zno, clone)) {
- dm_zone_unlock(md->disk, zno, clone);
- return DM_MAPIO_KILL;
- }
-
- /* Let the target do its work */
- r = ti->type->map(ti, clone);
- switch (r) {
- case DM_MAPIO_SUBMITTED:
- /*
- * The target submitted the clone BIO. The target zone will
- * be unlocked on completion of the clone.
- */
- sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
- *tio->len_ptr);
- break;
- case DM_MAPIO_REMAPPED:
- /*
- * The target only remapped the clone BIO. In case of error,
- * unlock the target zone here as the clone will not be
- * submitted.
- */
- sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
- *tio->len_ptr);
- if (sts != BLK_STS_OK)
- dm_zone_unlock(md->disk, zno, clone);
- break;
- case DM_MAPIO_REQUEUE:
- case DM_MAPIO_KILL:
- default:
- dm_zone_unlock(md->disk, zno, clone);
- sts = BLK_STS_IOERR;
- break;
- }
-
- if (sts != BLK_STS_OK)
- return DM_MAPIO_KILL;
-
- return r;
+ return dm_revalidate_zones(md, t);
}
/*
@@ -587,61 +241,17 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone)
struct mapped_device *md = io->md;
struct gendisk *disk = md->disk;
struct bio *orig_bio = io->orig_bio;
- unsigned int zwp_offset;
- unsigned int zno;
/*
- * For targets that do not emulate zone append, we only need to
- * handle native zone-append bios.
+ * Get the offset within the zone of the written sector
+ * and add that to the original bio sector position.
*/
- if (!dm_emulate_zone_append(md)) {
- /*
- * Get the offset within the zone of the written sector
- * and add that to the original bio sector position.
- */
- if (clone->bi_status == BLK_STS_OK &&
- bio_op(clone) == REQ_OP_ZONE_APPEND) {
- sector_t mask =
- (sector_t)bdev_zone_sectors(disk->part0) - 1;
-
- orig_bio->bi_iter.bi_sector +=
- clone->bi_iter.bi_sector & mask;
- }
-
- return;
- }
+ if (clone->bi_status == BLK_STS_OK &&
+ bio_op(clone) == REQ_OP_ZONE_APPEND) {
+ sector_t mask = bdev_zone_sectors(disk->part0) - 1;
- /*
- * For targets that do emulate zone append, if the clone BIO does not
- * own the target zone write lock, we have nothing to do.
- */
- if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
- return;
-
- zno = bio_zone_no(orig_bio);
-
- if (clone->bi_status != BLK_STS_OK) {
- /*
- * BIOs that modify a zone write pointer may leave the zone
- * in an unknown state in case of failure (e.g. the write
- * pointer was only partially advanced). In this case, set
- * the target zone write pointer as invalid unless it is
- * already being updated.
- */
- WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST);
- } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
- /*
- * Get the written sector for zone append operation that were
- * emulated using regular write operations.
- */
- zwp_offset = READ_ONCE(md->zwp_offset[zno]);
- if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio)))
- WRITE_ONCE(md->zwp_offset[zno],
- DM_ZONE_INVALID_WP_OFST);
- else
- orig_bio->bi_iter.bi_sector +=
- zwp_offset - bio_sectors(orig_bio);
+ orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask;
}
- dm_zone_unlock(disk, zno, clone);
+ return;
}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 56aa2a8b9d71..2369d10c8475 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1422,25 +1422,12 @@ static void __map_bio(struct bio *clone)
down(&md->swap_bios_semaphore);
}
- if (static_branch_unlikely(&zoned_enabled)) {
- /*
- * Check if the IO needs a special mapping due to zone append
- * emulation on zoned target. In this case, dm_zone_map_bio()
- * calls the target map operation.
- */
- if (unlikely(dm_emulate_zone_append(md)))
- r = dm_zone_map_bio(tio);
- else
- goto do_map;
- } else {
-do_map:
- if (likely(ti->type->map == linear_map))
- r = linear_map(ti, clone);
- else if (ti->type->map == stripe_map)
- r = stripe_map(ti, clone);
- else
- r = ti->type->map(ti, clone);
- }
+ if (likely(ti->type->map == linear_map))
+ r = linear_map(ti, clone);
+ else if (ti->type->map == stripe_map)
+ r = stripe_map(ti, clone);
+ else
+ r = ti->type->map(ti, clone);
switch (r) {
case DM_MAPIO_SUBMITTED:
@@ -1768,6 +1755,33 @@ static void init_clone_info(struct clone_info *ci, struct dm_io *io,
ci->sector_count = 0;
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+ struct bio *bio)
+{
+ /*
+ * For mapped device that need zone append emulation, we must
+ * split any large BIO that straddles zone boundaries.
+ */
+ return dm_emulate_zone_append(md) && bio_straddles_zones(bio) &&
+ !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+}
+static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+{
+ return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0);
+}
+#else
+static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+ struct bio *bio)
+{
+ return false;
+}
+static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+{
+ return false;
+}
+#endif
+
/*
* Entry point to split a bio into clones and submit them to the targets.
*/
@@ -1777,19 +1791,32 @@ static void dm_split_and_process_bio(struct mapped_device *md,
struct clone_info ci;
struct dm_io *io;
blk_status_t error = BLK_STS_OK;
- bool is_abnormal;
+ bool is_abnormal, need_split;
+
+ need_split = is_abnormal = is_abnormal_io(bio);
+ if (static_branch_unlikely(&zoned_enabled))
+ need_split = is_abnormal || dm_zone_bio_needs_split(md, bio);
- is_abnormal = is_abnormal_io(bio);
- if (unlikely(is_abnormal)) {
+ if (unlikely(need_split)) {
/*
* Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
* otherwise associated queue_limits won't be imposed.
+ * Also split the BIO for mapped devices needing zone append
+ * emulation to ensure that the BIO does not cross zone
+ * boundaries.
*/
bio = bio_split_to_limits(bio);
if (!bio)
return;
}
+ /*
+ * Use the block layer zone write plugging for mapped devices that
+ * need zone append emulation (e.g. dm-crypt).
+ */
+ if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio))
+ return;
+
/* Only support nowait for normal IO */
if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) {
io = alloc_io(md, bio, GFP_NOWAIT);
@@ -2010,7 +2037,6 @@ static void cleanup_mapped_device(struct mapped_device *md)
md->dax_dev = NULL;
}
- dm_cleanup_zoned_dev(md);
if (md->disk) {
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 7f1acbf6bd9e..e0c57f19839b 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -104,13 +104,11 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q);
void dm_zone_endio(struct dm_io *io, struct bio *clone);
#ifdef CONFIG_BLK_DEV_ZONED
-void dm_cleanup_zoned_dev(struct mapped_device *md);
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
bool dm_is_zone_write(struct mapped_device *md, struct bio *bio);
int dm_zone_map_bio(struct dm_target_io *io);
#else
-static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {}
#define dm_blk_report_zones NULL
static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
{
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 943d72bdd794..c9955ecd1790 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2150,7 +2150,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
blk_mq_unfreeze_queue(ns->disk->queue);
if (blk_queue_is_zoned(ns->queue)) {
- ret = blk_revalidate_disk_zones(ns->disk, NULL);
+ ret = blk_revalidate_disk_zones(ns->disk);
if (ret && !nvme_first_scan(ns->disk))
goto out;
}
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
index 3148d9f1bde6..0021d06041c1 100644
--- a/drivers/nvme/target/zns.c
+++ b/drivers/nvme/target/zns.c
@@ -52,14 +52,10 @@ bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
if (get_capacity(bd_disk) & (bdev_zone_sectors(ns->bdev) - 1))
return false;
/*
- * ZNS does not define a conventional zone type. If the underlying
- * device has a bitmap set indicating the existence of conventional
- * zones, reject the device. Otherwise, use report zones to detect if
- * the device has conventional zones.
+ * ZNS does not define a conventional zone type. Use report zones
+ * to detect if the device has conventional zones and reject it if
+ * it does.
*/
- if (ns->bdev->bd_disk->conv_zones_bitmap)
- return false;
-
ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev),
validate_conv_zones_cb, NULL);
if (ret < 0)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 2e28e2360c85..9ca96116bd33 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1870,7 +1870,6 @@ out_put_budget:
case BLK_STS_OK:
break;
case BLK_STS_RESOURCE:
- case BLK_STS_ZONE_RESOURCE:
if (scsi_device_blocked(sdev))
ret = BLK_STS_DEV_RESOURCE;
break;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3cf898670290..dcba9530ffa5 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1260,12 +1260,6 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd)
}
}
- if (req_op(rq) == REQ_OP_ZONE_APPEND) {
- ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks);
- if (ret)
- goto fail;
- }
-
fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0;
dix = scsi_prot_sg_count(cmd);
dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type);
@@ -1348,7 +1342,6 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd)
return sd_setup_flush_cmnd(cmd);
case REQ_OP_READ:
case REQ_OP_WRITE:
- case REQ_OP_ZONE_APPEND:
return sd_setup_read_write_cmnd(cmd);
case REQ_OP_ZONE_RESET:
return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
@@ -3979,7 +3972,6 @@ static void scsi_disk_release(struct device *dev)
struct scsi_disk *sdkp = to_scsi_disk(dev);
ida_free(&sd_index_ida, sdkp->index);
- sd_zbc_free_zone_info(sdkp);
put_device(&sdkp->device->sdev_gendev);
free_opal_dev(sdkp->opal_dev);
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 5c4285a582b2..49dd600bfa48 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -104,12 +104,6 @@ struct scsi_disk {
* between zone starting LBAs is constant.
*/
u32 zone_starting_lba_gran;
- u32 *zones_wp_offset;
- spinlock_t zones_wp_offset_lock;
- u32 *rev_wp_offset;
- struct mutex rev_mutex;
- struct work_struct zone_wp_offset_work;
- char *zone_wp_update_buf;
#endif
atomic_t openers;
sector_t capacity; /* size in logical blocks */
@@ -245,7 +239,6 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp)
#ifdef CONFIG_BLK_DEV_ZONED
-void sd_zbc_free_zone_info(struct scsi_disk *sdkp);
int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]);
int sd_zbc_revalidate_zones(struct scsi_disk *sdkp);
blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
@@ -255,13 +248,8 @@ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
-blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba,
- unsigned int nr_blocks);
-
#else /* CONFIG_BLK_DEV_ZONED */
-static inline void sd_zbc_free_zone_info(struct scsi_disk *sdkp) {}
-
static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
{
return 0;
@@ -285,13 +273,6 @@ static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd,
return good_bytes;
}
-static inline blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd,
- sector_t *lba,
- unsigned int nr_blocks)
-{
- return BLK_STS_TARGET;
-}
-
#define sd_zbc_report_zones NULL
#endif /* CONFIG_BLK_DEV_ZONED */
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 26af5ab7d7c1..806036e48abe 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -23,36 +23,6 @@
#define CREATE_TRACE_POINTS
#include "sd_trace.h"
-/**
- * sd_zbc_get_zone_wp_offset - Get zone write pointer offset.
- * @zone: Zone for which to return the write pointer offset.
- *
- * Return: offset of the write pointer from the start of the zone.
- */
-static unsigned int sd_zbc_get_zone_wp_offset(struct blk_zone *zone)
-{
- if (zone->type == ZBC_ZONE_TYPE_CONV)
- return 0;
-
- switch (zone->cond) {
- case BLK_ZONE_COND_IMP_OPEN:
- case BLK_ZONE_COND_EXP_OPEN:
- case BLK_ZONE_COND_CLOSED:
- return zone->wp - zone->start;
- case BLK_ZONE_COND_FULL:
- return zone->len;
- case BLK_ZONE_COND_EMPTY:
- case BLK_ZONE_COND_OFFLINE:
- case BLK_ZONE_COND_READONLY:
- default:
- /*
- * Offline and read-only zones do not have a valid
- * write pointer. Use 0 as for an empty zone.
- */
- return 0;
- }
-}
-
/* Whether or not a SCSI zone descriptor describes a gap zone. */
static bool sd_zbc_is_gap_zone(const u8 buf[64])
{
@@ -121,9 +91,6 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64],
if (ret)
return ret;
- if (sdkp->rev_wp_offset)
- sdkp->rev_wp_offset[idx] = sd_zbc_get_zone_wp_offset(&zone);
-
return 0;
}
@@ -347,123 +314,6 @@ static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd *cmd)
return BLK_STS_OK;
}
-#define SD_ZBC_INVALID_WP_OFST (~0u)
-#define SD_ZBC_UPDATING_WP_OFST (SD_ZBC_INVALID_WP_OFST - 1)
-
-static int sd_zbc_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
-{
- struct scsi_disk *sdkp = data;
-
- lockdep_assert_held(&sdkp->zones_wp_offset_lock);
-
- sdkp->zones_wp_offset[idx] = sd_zbc_get_zone_wp_offset(zone);
-
- return 0;
-}
-
-/*
- * An attempt to append a zone triggered an invalid write pointer error.
- * Reread the write pointer of the zone(s) in which the append failed.
- */
-static void sd_zbc_update_wp_offset_workfn(struct work_struct *work)
-{
- struct scsi_disk *sdkp;
- unsigned long flags;
- sector_t zno;
- int ret;
-
- sdkp = container_of(work, struct scsi_disk, zone_wp_offset_work);
-
- spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
- for (zno = 0; zno < sdkp->zone_info.nr_zones; zno++) {
- if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST)
- continue;
-
- spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
- ret = sd_zbc_do_report_zones(sdkp, sdkp->zone_wp_update_buf,
- SD_BUF_SIZE,
- zno * sdkp->zone_info.zone_blocks, true);
- spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
- if (!ret)
- sd_zbc_parse_report(sdkp, sdkp->zone_wp_update_buf + 64,
- zno, sd_zbc_update_wp_offset_cb,
- sdkp);
- }
- spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
-
- scsi_device_put(sdkp->device);
-}
-
-/**
- * sd_zbc_prepare_zone_append() - Prepare an emulated ZONE_APPEND command.
- * @cmd: the command to setup
- * @lba: the LBA to patch
- * @nr_blocks: the number of LBAs to be written
- *
- * Called from sd_setup_read_write_cmnd() for REQ_OP_ZONE_APPEND.
- * @sd_zbc_prepare_zone_append() handles the necessary zone wrote locking and
- * patching of the lba for an emulated ZONE_APPEND command.
- *
- * In case the cached write pointer offset is %SD_ZBC_INVALID_WP_OFST it will
- * schedule a REPORT ZONES command and return BLK_STS_IOERR.
- */
-blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba,
- unsigned int nr_blocks)
-{
- struct request *rq = scsi_cmd_to_rq(cmd);
- struct scsi_disk *sdkp = scsi_disk(rq->q->disk);
- unsigned int wp_offset, zno = blk_rq_zone_no(rq);
- unsigned long flags;
- blk_status_t ret;
-
- ret = sd_zbc_cmnd_checks(cmd);
- if (ret != BLK_STS_OK)
- return ret;
-
- if (!blk_rq_zone_is_seq(rq))
- return BLK_STS_IOERR;
-
- /* Unlock of the write lock will happen in sd_zbc_complete() */
- if (!blk_req_zone_write_trylock(rq))
- return BLK_STS_ZONE_RESOURCE;
-
- spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
- wp_offset = sdkp->zones_wp_offset[zno];
- switch (wp_offset) {
- case SD_ZBC_INVALID_WP_OFST:
- /*
- * We are about to schedule work to update a zone write pointer
- * offset, which will cause the zone append command to be
- * requeued. So make sure that the scsi device does not go away
- * while the work is being processed.
- */
- if (scsi_device_get(sdkp->device)) {
- ret = BLK_STS_IOERR;
- break;
- }
- sdkp->zones_wp_offset[zno] = SD_ZBC_UPDATING_WP_OFST;
- schedule_work(&sdkp->zone_wp_offset_work);
- fallthrough;
- case SD_ZBC_UPDATING_WP_OFST:
- ret = BLK_STS_DEV_RESOURCE;
- break;
- default:
- wp_offset = sectors_to_logical(sdkp->device, wp_offset);
- if (wp_offset + nr_blocks > sdkp->zone_info.zone_blocks) {
- ret = BLK_STS_IOERR;
- break;
- }
-
- trace_scsi_prepare_zone_append(cmd, *lba, wp_offset);
- *lba += wp_offset;
- }
- spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
- if (ret)
- blk_req_zone_write_unlock(rq);
- return ret;
-}
-
/**
* sd_zbc_setup_zone_mgmt_cmnd - Prepare a zone ZBC_OUT command. The operations
* can be RESET WRITE POINTER, OPEN, CLOSE or FINISH.
@@ -504,96 +354,6 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
return BLK_STS_OK;
}
-static bool sd_zbc_need_zone_wp_update(struct request *rq)
-{
- switch (req_op(rq)) {
- case REQ_OP_ZONE_APPEND:
- case REQ_OP_ZONE_FINISH:
- case REQ_OP_ZONE_RESET:
- case REQ_OP_ZONE_RESET_ALL:
- return true;
- case REQ_OP_WRITE:
- case REQ_OP_WRITE_ZEROES:
- return blk_rq_zone_is_seq(rq);
- default:
- return false;
- }
-}
-
-/**
- * sd_zbc_zone_wp_update - Update cached zone write pointer upon cmd completion
- * @cmd: Completed command
- * @good_bytes: Command reply bytes
- *
- * Called from sd_zbc_complete() to handle the update of the cached zone write
- * pointer value in case an update is needed.
- */
-static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd,
- unsigned int good_bytes)
-{
- int result = cmd->result;
- struct request *rq = scsi_cmd_to_rq(cmd);
- struct scsi_disk *sdkp = scsi_disk(rq->q->disk);
- unsigned int zno = blk_rq_zone_no(rq);
- enum req_op op = req_op(rq);
- unsigned long flags;
-
- /*
- * If we got an error for a command that needs updating the write
- * pointer offset cache, we must mark the zone wp offset entry as
- * invalid to force an update from disk the next time a zone append
- * command is issued.
- */
- spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
-
- if (result && op != REQ_OP_ZONE_RESET_ALL) {
- if (op == REQ_OP_ZONE_APPEND) {
- /* Force complete completion (no retry) */
- good_bytes = 0;
- scsi_set_resid(cmd, blk_rq_bytes(rq));
- }
-
- /*
- * Force an update of the zone write pointer offset on
- * the next zone append access.
- */
- if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST)
- sdkp->zones_wp_offset[zno] = SD_ZBC_INVALID_WP_OFST;
- goto unlock_wp_offset;
- }
-
- switch (op) {
- case REQ_OP_ZONE_APPEND:
- trace_scsi_zone_wp_update(cmd, rq->__sector,
- sdkp->zones_wp_offset[zno], good_bytes);
- rq->__sector += sdkp->zones_wp_offset[zno];
- fallthrough;
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE:
- if (sdkp->zones_wp_offset[zno] < sd_zbc_zone_sectors(sdkp))
- sdkp->zones_wp_offset[zno] +=
- good_bytes >> SECTOR_SHIFT;
- break;
- case REQ_OP_ZONE_RESET:
- sdkp->zones_wp_offset[zno] = 0;
- break;
- case REQ_OP_ZONE_FINISH:
- sdkp->zones_wp_offset[zno] = sd_zbc_zone_sectors(sdkp);
- break;
- case REQ_OP_ZONE_RESET_ALL:
- memset(sdkp->zones_wp_offset, 0,
- sdkp->zone_info.nr_zones * sizeof(unsigned int));
- break;
- default:
- break;
- }
-
-unlock_wp_offset:
- spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
-
- return good_bytes;
-}
-
/**
* sd_zbc_complete - ZBC command post processing.
* @cmd: Completed command
@@ -619,11 +379,7 @@ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
* so be quiet about the error.
*/
rq->rq_flags |= RQF_QUIET;
- } else if (sd_zbc_need_zone_wp_update(rq))
- good_bytes = sd_zbc_zone_wp_update(cmd, good_bytes);
-
- if (req_op(rq) == REQ_OP_ZONE_APPEND)
- blk_req_zone_write_unlock(rq);
+ }
return good_bytes;
}
@@ -780,46 +536,6 @@ static void sd_zbc_print_zones(struct scsi_disk *sdkp)
sdkp->zone_info.zone_blocks);
}
-static int sd_zbc_init_disk(struct scsi_disk *sdkp)
-{
- sdkp->zones_wp_offset = NULL;
- spin_lock_init(&sdkp->zones_wp_offset_lock);
- sdkp->rev_wp_offset = NULL;
- mutex_init(&sdkp->rev_mutex);
- INIT_WORK(&sdkp->zone_wp_offset_work, sd_zbc_update_wp_offset_workfn);
- sdkp->zone_wp_update_buf = kzalloc(SD_BUF_SIZE, GFP_KERNEL);
- if (!sdkp->zone_wp_update_buf)
- return -ENOMEM;
-
- return 0;
-}
-
-void sd_zbc_free_zone_info(struct scsi_disk *sdkp)
-{
- if (!sdkp->zone_wp_update_buf)
- return;
-
- /* Serialize against revalidate zones */
- mutex_lock(&sdkp->rev_mutex);
-
- kvfree(sdkp->zones_wp_offset);
- sdkp->zones_wp_offset = NULL;
- kfree(sdkp->zone_wp_update_buf);
- sdkp->zone_wp_update_buf = NULL;
-
- sdkp->early_zone_info = (struct zoned_disk_info){ };
- sdkp->zone_info = (struct zoned_disk_info){ };
-
- mutex_unlock(&sdkp->rev_mutex);
-}
-
-static void sd_zbc_revalidate_zones_cb(struct gendisk *disk)
-{
- struct scsi_disk *sdkp = scsi_disk(disk);
-
- swap(sdkp->zones_wp_offset, sdkp->rev_wp_offset);
-}
-
/*
* Call blk_revalidate_disk_zones() if any of the zoned disk properties have
* changed that make it necessary to call that function. Called by
@@ -831,18 +547,8 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
struct request_queue *q = disk->queue;
u32 zone_blocks = sdkp->early_zone_info.zone_blocks;
unsigned int nr_zones = sdkp->early_zone_info.nr_zones;
- int ret = 0;
unsigned int flags;
-
- /*
- * For all zoned disks, initialize zone append emulation data if not
- * already done.
- */
- if (sd_is_zoned(sdkp) && !sdkp->zone_wp_update_buf) {
- ret = sd_zbc_init_disk(sdkp);
- if (ret)
- return ret;
- }
+ int ret;
/*
* There is nothing to do for regular disks, including host-aware disks
@@ -851,50 +557,32 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
if (!blk_queue_is_zoned(q))
return 0;
- /*
- * Make sure revalidate zones are serialized to ensure exclusive
- * updates of the scsi disk data.
- */
- mutex_lock(&sdkp->rev_mutex);
-
if (sdkp->zone_info.zone_blocks == zone_blocks &&
sdkp->zone_info.nr_zones == nr_zones &&
disk->nr_zones == nr_zones)
- goto unlock;
+ return 0;
- flags = memalloc_noio_save();
sdkp->zone_info.zone_blocks = zone_blocks;
sdkp->zone_info.nr_zones = nr_zones;
- sdkp->rev_wp_offset = kvcalloc(nr_zones, sizeof(u32), GFP_KERNEL);
- if (!sdkp->rev_wp_offset) {
- ret = -ENOMEM;
- memalloc_noio_restore(flags);
- goto unlock;
- }
blk_queue_chunk_sectors(q,
logical_to_sectors(sdkp->device, zone_blocks));
- blk_queue_max_zone_append_sectors(q,
- q->limits.max_segments << PAGE_SECTORS_SHIFT);
- ret = blk_revalidate_disk_zones(disk, sd_zbc_revalidate_zones_cb);
+ /* Enable block layer zone append emulation */
+ blk_queue_max_zone_append_sectors(q, 0);
+ flags = memalloc_noio_save();
+ ret = blk_revalidate_disk_zones(disk);
memalloc_noio_restore(flags);
- kvfree(sdkp->rev_wp_offset);
- sdkp->rev_wp_offset = NULL;
-
if (ret) {
sdkp->zone_info = (struct zoned_disk_info){ };
sdkp->capacity = 0;
- goto unlock;
+ return ret;
}
sd_zbc_print_zones(sdkp);
-unlock:
- mutex_unlock(&sdkp->rev_mutex);
-
- return ret;
+ return 0;
}
/**
@@ -917,10 +605,8 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
if (!sd_is_zoned(sdkp)) {
/*
* Device managed or normal SCSI disk, no special handling
- * required. Nevertheless, free the disk zone information in
- * case the device type changed.
+ * required.
*/
- sd_zbc_free_zone_info(sdkp);
return 0;
}
@@ -941,7 +627,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
/* The drive satisfies the kernel restrictions: set it up */
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
- blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
if (sdkp->zones_max_open == U32_MAX)
disk_set_max_open_zones(disk, 0);
else
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d3d8fd8e229b..89ba6b16fe8b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -54,8 +54,8 @@ typedef __u32 __bitwise req_flags_t;
/* Look at ->special_vec for the actual data payload instead of the
bio chain. */
#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
-/* The per-zone write lock is held for this request */
-#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
+/* The request completion needs to be signaled to zone write pluging. */
+#define RQF_ZONE_WRITE_PLUGGING ((__force req_flags_t)(1 << 20))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21))
#define RQF_RESV ((__force req_flags_t)(1 << 23))
@@ -1150,85 +1150,4 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
}
void blk_dump_rq_flags(struct request *, char *);
-#ifdef CONFIG_BLK_DEV_ZONED
-static inline unsigned int blk_rq_zone_no(struct request *rq)
-{
- return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
-}
-
-static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
-{
- return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
-}
-
-/**
- * blk_rq_is_seq_zoned_write() - Check if @rq requires write serialization.
- * @rq: Request to examine.
- *
- * Note: REQ_OP_ZONE_APPEND requests do not require serialization.
- */
-static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
-{
- return op_needs_zoned_write_locking(req_op(rq)) &&
- blk_rq_zone_is_seq(rq);
-}
-
-bool blk_req_needs_zone_write_lock(struct request *rq);
-bool blk_req_zone_write_trylock(struct request *rq);
-void __blk_req_zone_write_lock(struct request *rq);
-void __blk_req_zone_write_unlock(struct request *rq);
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
- if (blk_req_needs_zone_write_lock(rq))
- __blk_req_zone_write_lock(rq);
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
- if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
- __blk_req_zone_write_unlock(rq);
-}
-
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
- return rq->q->disk->seq_zones_wlock &&
- test_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock);
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
- if (!blk_req_needs_zone_write_lock(rq))
- return true;
- return !blk_req_zone_is_write_locked(rq);
-}
-#else /* CONFIG_BLK_DEV_ZONED */
-static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
-{
- return false;
-}
-
-static inline bool blk_req_needs_zone_write_lock(struct request *rq)
-{
- return false;
-}
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
-}
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
- return false;
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
- return true;
-}
-#endif /* CONFIG_BLK_DEV_ZONED */
-
#endif /* BLK_MQ_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cb1526ec44b5..5751292fee6a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -137,25 +137,13 @@ typedef u16 blk_short_t;
#define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13)
/*
- * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone
- * related resources are unavailable, but the driver can guarantee the queue
- * will be rerun in the future once the resources become available again.
- *
- * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references
- * a zone specific resource and IO to a different zone on the same device could
- * still be served. Examples of that are zones that are write-locked, but a read
- * to the same zone could be served.
- */
-#define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14)
-
-/*
* BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion
* path if the device returns a status indicating that too many zone resources
* are currently open. The same command should be successful if resubmitted
* after the number of open zones decreases below the device's limits, which is
* reported in the request_queue's max_open_zones.
*/
-#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)15)
+#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)14)
/*
* BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion
@@ -164,20 +152,20 @@ typedef u16 blk_short_t;
* after the number of active zones decreases below the device's limits, which
* is reported in the request_queue's max_active_zones.
*/
-#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16)
+#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)15)
/*
* BLK_STS_OFFLINE is returned from the driver when the target device is offline
* or is being taken offline. This could help differentiate the case where a
* device is intentionally being shut down from a real I/O error.
*/
-#define BLK_STS_OFFLINE ((__force blk_status_t)17)
+#define BLK_STS_OFFLINE ((__force blk_status_t)16)
/*
* BLK_STS_DURATION_LIMIT is returned from the driver when the target device
* aborted the command because it exceeded one of its Command Duration Limits.
*/
-#define BLK_STS_DURATION_LIMIT ((__force blk_status_t)18)
+#define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17)
/**
* blk_path_error - returns true if error may be path related
@@ -234,7 +222,12 @@ struct bio {
struct bvec_iter bi_iter;
- blk_qc_t bi_cookie;
+ union {
+ /* for polled bios: */
+ blk_qc_t bi_cookie;
+ /* for plugged zoned writes only: */
+ unsigned int __bi_nr_segments;
+ };
bio_end_io_t *bi_end_io;
void *bi_private;
#ifdef CONFIG_BLK_CGROUP
@@ -304,7 +297,8 @@ enum {
BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */
BIO_QOS_MERGED, /* but went through rq_qos merge path */
BIO_REMAPPED,
- BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
+ BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
+ BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
BIO_FLAG_LAST
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d0fa4f1c29a2..040a22e0eda0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -177,22 +177,21 @@ struct gendisk {
#ifdef CONFIG_BLK_DEV_ZONED
/*
- * Zoned block device information for request dispatch control.
- * nr_zones is the total number of zones of the device. This is always
- * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones
- * bits which indicates if a zone is conventional (bit set) or
- * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones
- * bits which indicates if a zone is write locked, that is, if a write
- * request targeting the zone was dispatched.
- *
- * Reads of this information must be protected with blk_queue_enter() /
- * blk_queue_exit(). Modifying this information is only allowed while
- * no requests are being processed. See also blk_mq_freeze_queue() and
- * blk_mq_unfreeze_queue().
+ * Zoned block device information. Reads of this information must be
+ * protected with blk_queue_enter() / blk_queue_exit(). Modifying this
+ * information is only allowed while no requests are being processed.
+ * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue().
*/
unsigned int nr_zones;
+ unsigned int zone_capacity;
unsigned long *conv_zones_bitmap;
- unsigned long *seq_zones_wlock;
+ unsigned int zone_wplugs_hash_bits;
+ spinlock_t zone_wplugs_lock;
+ struct mempool_s *zone_wplugs_pool;
+ struct hlist_head *zone_wplugs_hash;
+ struct list_head zone_wplugs_err_list;
+ struct work_struct zone_wplugs_work;
+ struct workqueue_struct *zone_wplugs_wq;
#endif /* CONFIG_BLK_DEV_ZONED */
#if IS_ENABLED(CONFIG_CDROM)
@@ -329,8 +328,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
sector_t sectors, sector_t nr_sectors);
-int blk_revalidate_disk_zones(struct gendisk *disk,
- void (*update_driver_data)(struct gendisk *disk));
+int blk_revalidate_disk_zones(struct gendisk *disk);
/*
* Independent access ranges: struct blk_independent_access_range describes
@@ -447,8 +445,6 @@ struct request_queue {
atomic_t nr_active_requests_shared_tags;
- unsigned int required_elevator_features;
-
struct blk_mq_tags *sched_shared_tags;
struct list_head icq_list;
@@ -631,15 +627,6 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
return sector >> ilog2(disk->queue->limits.chunk_sectors);
}
-static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
-{
- if (!blk_queue_is_zoned(disk->queue))
- return false;
- if (!disk->conv_zones_bitmap)
- return true;
- return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
-}
-
static inline void disk_set_max_open_zones(struct gendisk *disk,
unsigned int max_open_zones)
{
@@ -662,6 +649,7 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
return bdev->bd_disk->queue->limits.max_active_zones;
}
+bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs);
#else /* CONFIG_BLK_DEV_ZONED */
static inline unsigned int bdev_nr_zones(struct block_device *bdev)
{
@@ -672,10 +660,6 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk)
{
return 0;
}
-static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
-{
- return false;
-}
static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
{
return 0;
@@ -689,6 +673,10 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
{
return 0;
}
+static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
+{
+ return false;
+}
#endif /* CONFIG_BLK_DEV_ZONED */
static inline unsigned int blk_queue_depth(struct request_queue *q)
@@ -853,9 +841,11 @@ static inline unsigned int bio_zone_no(struct bio *bio)
return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
}
-static inline unsigned int bio_zone_is_seq(struct bio *bio)
+static inline bool bio_straddles_zones(struct bio *bio)
{
- return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
+ return bio_sectors(bio) &&
+ bio_zone_no(bio) !=
+ disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1);
}
/*
@@ -940,14 +930,6 @@ disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges);
void disk_set_independent_access_ranges(struct gendisk *disk,
struct blk_independent_access_ranges *iars);
-/*
- * Elevator features for blk_queue_required_elevator_features:
- */
-/* Supports zoned block devices sequential write constraint */
-#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0)
-
-extern void blk_queue_required_elevator_features(struct request_queue *q,
- unsigned int features);
extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
struct device *dev);
@@ -1154,12 +1136,29 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q)
return q->limits.max_segment_size;
}
-static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q)
+static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l)
+{
+ unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
+
+ return min_not_zero(l->max_zone_append_sectors, max_sectors);
+}
+
+static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
{
+ if (!blk_queue_is_zoned(q))
+ return 0;
- const struct queue_limits *l = &q->limits;
+ return queue_limits_max_zone_append_sectors(&q->limits);
+}
- return min(l->max_zone_append_sectors, l->max_sectors);
+static inline bool queue_emulates_zone_append(struct request_queue *q)
+{
+ return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
+}
+
+static inline bool bdev_emulates_zone_append(struct block_device *bdev)
+{
+ return queue_emulates_zone_append(bdev_get_queue(bdev));
}
static inline unsigned int
@@ -1301,18 +1300,6 @@ static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec)
return disk_zone_no(bdev->bd_disk, sec);
}
-/* Whether write serialization is required for @op on zoned devices. */
-static inline bool op_needs_zoned_write_locking(enum req_op op)
-{
- return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
-}
-
-static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
- enum req_op op)
-{
- return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op);
-}
-
static inline sector_t bdev_zone_sectors(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
@@ -1328,6 +1315,12 @@ static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev,
return sector & (bdev_zone_sectors(bdev) - 1);
}
+static inline sector_t bio_offset_from_zone_start(struct bio *bio)
+{
+ return bdev_offset_from_zone_start(bio->bi_bdev,
+ bio->bi_iter.bi_sector);
+}
+
static inline bool bdev_is_zone_start(struct block_device *bdev,
sector_t sector)
{