-rw-r--r--  MAINTAINERS  14
-rw-r--r--  block/bdev.c  2
-rw-r--r--  block/bfq-cgroup.c  51
-rw-r--r--  block/bfq-iosched.c  38
-rw-r--r--  block/bfq-iosched.h  3
-rw-r--r--  block/blk-cgroup.h  13
-rw-r--r--  block/blk-integrity.c  15
-rw-r--r--  block/blk-settings.c  9
-rw-r--r--  block/blk-zoned.c  20
-rw-r--r--  drivers/block/Kconfig  9
-rw-r--r--  drivers/block/Makefile  3
-rw-r--r--  drivers/block/amiflop.c  1
-rw-r--r--  drivers/block/ataflop.c  1
-rw-r--r--  drivers/block/rnull.rs  73
-rw-r--r--  drivers/block/z2ram.c  1
-rw-r--r--  drivers/md/dm-raid.c  2
-rw-r--r--  drivers/md/dm-table.c  18
-rw-r--r--  drivers/md/dm-zone.c  206
-rw-r--r--  drivers/md/dm.h  1
-rw-r--r--  drivers/md/md-bitmap.c  6
-rw-r--r--  drivers/md/md.c  506
-rw-r--r--  drivers/md/md.h  126
-rw-r--r--  drivers/md/raid1.c  8
-rw-r--r--  drivers/md/raid10.c  11
-rw-r--r--  drivers/md/raid5.c  26
-rw-r--r--  include/linux/blkdev.h  1
-rw-r--r--  rust/bindings/bindings_helper.h  3
-rw-r--r--  rust/helpers.c  16
-rw-r--r--  rust/kernel/block.rs  5
-rw-r--r--  rust/kernel/block/mq.rs  98
-rw-r--r--  rust/kernel/block/mq/gen_disk.rs  198
-rw-r--r--  rust/kernel/block/mq/operations.rs  245
-rw-r--r--  rust/kernel/block/mq/raw_writer.rs  55
-rw-r--r--  rust/kernel/block/mq/request.rs  253
-rw-r--r--  rust/kernel/block/mq/tag_set.rs  86
-rw-r--r--  rust/kernel/error.rs  6
-rw-r--r--  rust/kernel/lib.rs  2
37 files changed, 1722 insertions, 409 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index aacccb376c28..aa1321fdc300 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3781,6 +3781,20 @@ F: include/linux/blk*
F: kernel/trace/blktrace.c
F: lib/sbitmap.c
+BLOCK LAYER DEVICE DRIVER API [RUST]
+M: Andreas Hindborg <[email protected]>
+R: Boqun Feng <[email protected]>
+S: Supported
+W: https://rust-for-linux.com
+B: https://github.com/Rust-for-Linux/linux/issues
+C: https://rust-for-linux.zulipchat.com/#narrow/stream/Block
+T: git https://github.com/Rust-for-Linux/linux.git rust-block-next
+F: drivers/block/rnull.rs
+F: rust/kernel/block.rs
+F: rust/kernel/block/
+
BLOCK2MTD DRIVER
M: Joern Engel <[email protected]>
diff --git a/block/bdev.c b/block/bdev.c
index 353677ac49b3..ced4ac990ec8 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -385,7 +385,7 @@ static struct file_system_type bd_type = {
};
struct super_block *blockdev_superblock __ro_after_init;
-struct vfsmount *blockdev_mnt __ro_after_init;
+static struct vfsmount *blockdev_mnt __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);
void __init bdev_cache_init(void)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index d442ee358fc2..b758693697c0 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -797,57 +797,6 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
*/
bfq_link_bfqg(bfqd, bfqg);
__bfq_bic_change_cgroup(bfqd, bic, bfqg);
- /*
- * Update blkg_path for bfq_log_* functions. We cache this
- * path, and update it here, for the following
- * reasons. Operations on blkg objects in blk-cgroup are
- * protected with the request_queue lock, and not with the
- * lock that protects the instances of this scheduler
- * (bfqd->lock). This exposes BFQ to the following sort of
- * race.
- *
- * The blkg_lookup performed in bfq_get_queue, protected
- * through rcu, may happen to return the address of a copy of
- * the original blkg. If this is the case, then the
- * bfqg_and_blkg_get performed in bfq_get_queue, to pin down
- * the blkg, is useless: it does not prevent blk-cgroup code
- * from destroying both the original blkg and all objects
- * directly or indirectly referred by the copy of the
- * blkg.
- *
- * On the bright side, destroy operations on a blkg invoke, as
- * a first step, hooks of the scheduler associated with the
- * blkg. And these hooks are executed with bfqd->lock held for
- * BFQ. As a consequence, for any blkg associated with the
- * request queue this instance of the scheduler is attached
- * to, we are guaranteed that such a blkg is not destroyed, and
- * that all the pointers it contains are consistent, while we
- * are holding bfqd->lock. A blkg_lookup performed with
- * bfqd->lock held then returns a fully consistent blkg, which
- * remains consistent until this lock is held.
- *
- * Thanks to the last fact, and to the fact that: (1) bfqg has
- * been obtained through a blkg_lookup in the above
- * assignment, and (2) bfqd->lock is being held, here we can
- * safely use the policy data for the involved blkg (i.e., the
- * field bfqg->pd) to get to the blkg associated with bfqg,
- * and then we can safely use any field of blkg. After we
- * release bfqd->lock, even just getting blkg through this
- * bfqg may cause dangling references to be traversed, as
- * bfqg->pd may not exist any more.
- *
- * In view of the above facts, here we cache, in the bfqg, any
- * blkg data we may need for this bic, and for its associated
- * bfq_queue. As of now, we need to cache only the path of the
- * blkg, which is used in the bfq_log_* functions.
- *
- * Finally, note that bfqg itself needs to be protected from
- * destruction on the blkg_free of the original blkg (which
- * invokes bfq_pd_free). We use an additional private
- * refcounter for bfqg, to let it disappear only after no
- * bfq_queue refers to it any longer.
- */
- blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path));
bic->blkcg_serial_nr = serial_nr;
}
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 4b88a54a9b76..36a4998c4b37 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5463,40 +5463,42 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync,
}
}
+static void _bfq_exit_icq(struct bfq_io_cq *bic, unsigned int num_actuators)
+{
+ struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data;
+ unsigned int act_idx;
+
+ for (act_idx = 0; act_idx < num_actuators; act_idx++) {
+ if (bfqq_data[act_idx].stable_merge_bfqq)
+ bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq);
+
+ bfq_exit_icq_bfqq(bic, true, act_idx);
+ bfq_exit_icq_bfqq(bic, false, act_idx);
+ }
+}
+
static void bfq_exit_icq(struct io_cq *icq)
{
struct bfq_io_cq *bic = icq_to_bic(icq);
struct bfq_data *bfqd = bic_to_bfqd(bic);
unsigned long flags;
- unsigned int act_idx;
+
/*
* If bfqd and thus bfqd->num_actuators is not available any
* longer, then cycle over all possible per-actuator bfqqs in
* next loop. We rely on bic being zeroed on creation, and
* therefore on its unused per-actuator fields being NULL.
- */
- unsigned int num_actuators = BFQ_MAX_ACTUATORS;
- struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data;
-
- /*
+ *
* bfqd is NULL if scheduler already exited, and in that case
* this is the last time these queues are accessed.
*/
if (bfqd) {
spin_lock_irqsave(&bfqd->lock, flags);
- num_actuators = bfqd->num_actuators;
- }
-
- for (act_idx = 0; act_idx < num_actuators; act_idx++) {
- if (bfqq_data[act_idx].stable_merge_bfqq)
- bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq);
-
- bfq_exit_icq_bfqq(bic, true, act_idx);
- bfq_exit_icq_bfqq(bic, false, act_idx);
- }
-
- if (bfqd)
+ _bfq_exit_icq(bic, bfqd->num_actuators);
spin_unlock_irqrestore(&bfqd->lock, flags);
+ } else {
+ _bfq_exit_icq(bic, BFQ_MAX_ACTUATORS);
+ }
}
/*
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 467e8cfc41a2..08ddf2cfae5b 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -1003,9 +1003,6 @@ struct bfq_group {
/* must be the first member */
struct blkg_policy_data pd;
- /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */
- char blkg_path[128];
-
/* reference counter (see comments in bfq_bic_update_cgroup) */
refcount_t ref;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 90b3959d88cf..bd472a30bc61 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -301,19 +301,6 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
}
/**
- * blkg_path - format cgroup path of blkg
- * @blkg: blkg of interest
- * @buf: target buffer
- * @buflen: target buffer length
- *
- * Format the path of the cgroup of @blkg into @buf.
- */
-static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
-{
- return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
-}
-
-/**
* blkg_get - get a blkg reference
* @blkg: blkg to get
*
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 05a48689a424..010decc892ea 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -186,8 +186,8 @@ const char *blk_integrity_profile_name(struct blk_integrity *bi)
}
EXPORT_SYMBOL_GPL(blk_integrity_profile_name);
-static ssize_t flag_store(struct device *dev, struct device_attribute *attr,
- const char *page, size_t count, unsigned char flag)
+static ssize_t flag_store(struct device *dev, const char *page, size_t count,
+ unsigned char flag)
{
struct request_queue *q = dev_to_disk(dev)->queue;
struct queue_limits lim;
@@ -213,8 +213,7 @@ static ssize_t flag_store(struct device *dev, struct device_attribute *attr,
return count;
}
-static ssize_t flag_show(struct device *dev, struct device_attribute *attr,
- char *page, unsigned char flag)
+static ssize_t flag_show(struct device *dev, char *page, unsigned char flag)
{
struct blk_integrity *bi = dev_to_bi(dev);
@@ -253,26 +252,26 @@ static ssize_t read_verify_store(struct device *dev,
struct device_attribute *attr,
const char *page, size_t count)
{
- return flag_store(dev, attr, page, count, BLK_INTEGRITY_NOVERIFY);
+ return flag_store(dev, page, count, BLK_INTEGRITY_NOVERIFY);
}
static ssize_t read_verify_show(struct device *dev,
struct device_attribute *attr, char *page)
{
- return flag_show(dev, attr, page, BLK_INTEGRITY_NOVERIFY);
+ return flag_show(dev, page, BLK_INTEGRITY_NOVERIFY);
}
static ssize_t write_generate_store(struct device *dev,
struct device_attribute *attr,
const char *page, size_t count)
{
- return flag_store(dev, attr, page, count, BLK_INTEGRITY_NOGENERATE);
+ return flag_store(dev, page, count, BLK_INTEGRITY_NOGENERATE);
}
static ssize_t write_generate_show(struct device *dev,
struct device_attribute *attr, char *page)
{
- return flag_show(dev, attr, page, BLK_INTEGRITY_NOGENERATE);
+ return flag_show(dev, page, BLK_INTEGRITY_NOGENERATE);
}
static ssize_t device_is_integrity_capable_show(struct device *dev,
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 008fed84edb4..b19306804056 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -80,6 +80,14 @@ static int blk_validate_zoned_limits(struct queue_limits *lim)
if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)))
return -EINVAL;
+ /*
+ * Given that active zones include open zones, the maximum number of
+ * open zones cannot be larger than the maximum number of active zones.
+ */
+ if (lim->max_active_zones &&
+ lim->max_open_zones > lim->max_active_zones)
+ return -EINVAL;
+
if (lim->zone_write_granularity < lim->logical_block_size)
lim->zone_write_granularity = lim->logical_block_size;
@@ -298,7 +306,6 @@ int blk_set_default_limits(struct queue_limits *lim)
*/
int queue_limits_commit_update(struct request_queue *q,
struct queue_limits *lim)
- __releases(q->limits_lock)
{
int error;
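
The new check ties into the validation performed when limits are committed. A minimal sketch of a driver update that the check above would now reject (function name illustrative, not part of this patch; assumes a zoned queue so blk_validate_zoned_limits() runs):

    static int example_set_zone_limits(struct request_queue *q)
    {
    	struct queue_limits lim = queue_limits_start_update(q);

    	lim.max_active_zones = 8;
    	lim.max_open_zones = 16;	/* larger than max_active_zones */
    	/* fails validation with -EINVAL after this patch */
    	return queue_limits_commit_update(q, &lim);
    }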
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 52abebf56027..8f89705f5e1c 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1647,8 +1647,22 @@ static int disk_update_zone_resources(struct gendisk *disk,
return -ENODEV;
}
+ lim = queue_limits_start_update(q);
+
+ /*
+ * Some devices can advertise zone resource limits that are larger than
+ * the number of sequential zones of the zoned block device, e.g. a
+ * small ZNS namespace. For such a case, assume that the zoned device has
+ * no zone resource limits.
+ */
+ nr_seq_zones = disk->nr_zones - nr_conv_zones;
+ if (lim.max_open_zones >= nr_seq_zones)
+ lim.max_open_zones = 0;
+ if (lim.max_active_zones >= nr_seq_zones)
+ lim.max_active_zones = 0;
+
if (!disk->zone_wplugs_pool)
- return 0;
+ goto commit;
/*
* If the device has no limit on the maximum number of open and active
@@ -1657,9 +1671,6 @@ static int disk_update_zone_resources(struct gendisk *disk,
* dynamic zone write plug allocation when simultaneously writing to
* more zones than the size of the mempool.
*/
- lim = queue_limits_start_update(q);
-
- nr_seq_zones = disk->nr_zones - nr_conv_zones;
pool_size = max(lim.max_open_zones, lim.max_active_zones);
if (!pool_size)
pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);
@@ -1673,6 +1684,7 @@ static int disk_update_zone_resources(struct gendisk *disk,
lim.max_open_zones = 0;
}
+commit:
return queue_limits_commit_update(q, &lim);
}
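
To make the clamping above concrete, a worked example with made-up numbers:

    /*
     * Hypothetical device: 100 zones, 4 of them conventional, advertising
     * max_open_zones = 128. Then:
     *
     *   nr_seq_zones = disk->nr_zones - nr_conv_zones = 96;
     *   lim.max_open_zones (128) >= nr_seq_zones (96), so it is reset
     *   to 0, i.e. the device is treated as having no open zone limit,
     *   and pool_size falls back to BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
     */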
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 5b9d4aaebb81..ed209f4f2798 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -354,6 +354,15 @@ config VIRTIO_BLK
This is the virtual block driver for virtio. It can be used with
QEMU based VMMs (like KVM or Xen). Say Y or M.
+config BLK_DEV_RUST_NULL
+ tristate "Rust null block driver (Experimental)"
+ depends on RUST
+ help
+ This is the Rust implementation of the null block driver. For now it
+ is only a minimal stub.
+
+ If unsure, say N.
+
config BLK_DEV_RBD
tristate "Rados block device (RBD)"
depends on INET && BLOCK
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 101612cba303..1105a2d4fdcb 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -9,6 +9,9 @@
# needed for trace events
ccflags-y += -I$(src)
+obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o
+rnull_mod-y := rnull.o
+
obj-$(CONFIG_MAC_FLOPPY) += swim3.o
obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o
obj-$(CONFIG_BLK_DEV_FD) += floppy.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index ff45701f7a5e..49ced65bef4c 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -232,6 +232,7 @@ static DEFINE_MUTEX(amiflop_mutex);
static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it doesn't identify */
module_param(fd_def_df0, ulong, 0);
+MODULE_DESCRIPTION("Amiga floppy driver");
MODULE_LICENSE("GPL");
/*
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 4ee10a742bdb..4ba98c6654be 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -2200,4 +2200,5 @@ static void __exit atari_floppy_exit(void)
module_init(atari_floppy_init)
module_exit(atari_floppy_exit)
+MODULE_DESCRIPTION("Atari floppy driver");
MODULE_LICENSE("GPL");
diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs
new file mode 100644
index 000000000000..b0227cf9ddd3
--- /dev/null
+++ b/drivers/block/rnull.rs
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This is a Rust implementation of the C null block driver.
+//!
+//! Supported features:
+//!
+//! - blk-mq interface
+//! - direct completion
+//! - block size 4k
+//!
+//! The driver is not configurable.
+
+use kernel::{
+ alloc::flags,
+ block::mq::{
+ self,
+ gen_disk::{self, GenDisk},
+ Operations, TagSet,
+ },
+ error::Result,
+ new_mutex, pr_info,
+ prelude::*,
+ sync::{Arc, Mutex},
+ types::ARef,
+};
+
+module! {
+ type: NullBlkModule,
+ name: "rnull_mod",
+ author: "Andreas Hindborg",
+ license: "GPL v2",
+}
+
+struct NullBlkModule {
+ _disk: Pin<Box<Mutex<GenDisk<NullBlkDevice>>>>,
+}
+
+impl kernel::Module for NullBlkModule {
+ fn init(_module: &'static ThisModule) -> Result<Self> {
+ pr_info!("Rust null_blk loaded\n");
+ let tagset = Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?;
+
+ let disk = gen_disk::GenDiskBuilder::new()
+ .capacity_sectors(4096 << 11)
+ .logical_block_size(4096)?
+ .physical_block_size(4096)?
+ .rotational(false)
+ .build(format_args!("rnullb{}", 0), tagset)?;
+
+ let disk = Box::pin_init(new_mutex!(disk, "nullb:disk"), flags::GFP_KERNEL)?;
+
+ Ok(Self { _disk: disk })
+ }
+}
+
+struct NullBlkDevice;
+
+#[vtable]
+impl Operations for NullBlkDevice {
+ #[inline(always)]
+ fn queue_rq(rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result {
+ mq::Request::end_ok(rq)
+ .map_err(|_e| kernel::error::code::EIO)
+ // We take no refcounts on the request, so we expect to be able to
+ // end the request. The request reference must be unique at this
+ // point, and so `end_ok` cannot fail.
+ .expect("Fatal error - expected to be able to end request");
+
+ Ok(())
+ }
+
+ fn commit_rqs() {}
+}
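
With the Kconfig and Makefile hunks above, the driver is built as rnull_mod from rnull.rs. Setting CONFIG_BLK_DEV_RUST_NULL=m and loading the resulting module should register a single disk named rnullb0: the GenDiskBuilder calls in init() give it 4096 << 11 sectors (4 GiB, assuming the usual 512-byte sector unit) with 4 KiB logical and physical block sizes, and every request is completed immediately in queue_rq() via mq::Request::end_ok().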
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 7c5f4e4d9b50..4b7219be1bb8 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -409,4 +409,5 @@ static void __exit z2_exit(void)
module_init(z2_init);
module_exit(z2_exit);
+MODULE_DESCRIPTION("Amiga Zorro II ramdisk driver");
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index abe88d1e6735..052c00c1eb15 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3542,7 +3542,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
recovery = rs->md.recovery;
state = decipher_sync_action(mddev, recovery);
progress = rs_get_progress(rs, recovery, state, resync_max_sectors);
- resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
+ resync_mismatches = mddev->last_sync_action == ACTION_CHECK ?
atomic64_read(&mddev->resync_mismatches) : 0;
/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 502ebc78d490..92d18dcdb3e9 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1829,12 +1829,9 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL))
dax_write_cache(t->md->dax_dev, true);
- /*
- * For a zoned target, setup the zones related queue attributes
- * and resources necessary for zone append emulation if necessary.
- */
+ /* For a zoned table, setup the zone related queue attributes. */
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- (limits->features & limits->features & BLK_FEAT_ZONED)) {
+ (limits->features & BLK_FEAT_ZONED)) {
r = dm_set_zones_restrictions(t, q, limits);
if (r)
return r;
@@ -1844,6 +1841,17 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (r)
return r;
+ /*
+ * Now that the limits are set, check the zones mapped by the table
+ * and setup the resources for zone append emulation if necessary.
+ */
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ (limits->features & BLK_FEAT_ZONED)) {
+ r = dm_revalidate_zones(t, q);
+ if (r)
+ return r;
+ }
+
dm_update_crypto_profile(q, t);
return 0;
}
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 88d313229b43..4d37e53b50ee 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -13,8 +13,6 @@
#define DM_MSG_PREFIX "zone"
-#define DM_ZONE_INVALID_WP_OFST UINT_MAX
-
/*
* For internal zone reports bypassing the top BIO submission path.
*/
@@ -146,34 +144,27 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
}
/*
- * Count conventional zones of a mapped zoned device. If the device
- * only has conventional zones, do not expose it as zoned.
- */
-static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
-{
- unsigned int *nr_conv_zones = data;
-
- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
- (*nr_conv_zones)++;
-
- return 0;
-}
-
-/*
* Revalidate the zones of a mapped device to initialize resource necessary
* for zone append emulation. Note that we cannot simply use the block layer
* blk_revalidate_disk_zones() function here as the mapped device is suspended
* (this is called from __bind() context).
*/
-static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
+int dm_revalidate_zones(struct dm_table *t, struct request_queue *q)
{
+ struct mapped_device *md = t->md;
struct gendisk *disk = md->disk;
int ret;
+ if (!get_capacity(disk))
+ return 0;
+
/* Revalidate only if something changed. */
- if (!disk->nr_zones || disk->nr_zones != md->nr_zones)
+ if (!disk->nr_zones || disk->nr_zones != md->nr_zones) {
+ DMINFO("%s using %s zone append",
+ disk->disk_name,
+ queue_emulates_zone_append(q) ? "emulated" : "native");
md->nr_zones = 0;
+ }
if (md->nr_zones)
return 0;
@@ -220,13 +211,127 @@ static bool dm_table_supports_zone_append(struct dm_table *t)
return true;
}
+struct dm_device_zone_count {
+ sector_t start;
+ sector_t len;
+ unsigned int total_nr_seq_zones;
+ unsigned int target_nr_seq_zones;
+};
+
+/*
+ * Count the total number of and the number of mapped sequential zones of a
+ * target zoned device.
+ */
+static int dm_device_count_zones_cb(struct blk_zone *zone,
+ unsigned int idx, void *data)
+{
+ struct dm_device_zone_count *zc = data;
+
+ if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
+ zc->total_nr_seq_zones++;
+ if (zone->start >= zc->start &&
+ zone->start < zc->start + zc->len)
+ zc->target_nr_seq_zones++;
+ }
+
+ return 0;
+}
+
+static int dm_device_count_zones(struct dm_dev *dev,
+ struct dm_device_zone_count *zc)
+{
+ int ret;
+
+ ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES,
+ dm_device_count_zones_cb, zc);
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return -EIO;
+ return 0;
+}
+
+struct dm_zone_resource_limits {
+ unsigned int mapped_nr_seq_zones;
+ struct queue_limits *lim;
+ bool reliable_limits;
+};
+
+static int device_get_zone_resource_limits(struct dm_target *ti,
+ struct dm_dev *dev, sector_t start,
+ sector_t len, void *data)
+{
+ struct dm_zone_resource_limits *zlim = data;
+ struct gendisk *disk = dev->bdev->bd_disk;
+ unsigned int max_open_zones, max_active_zones;
+ int ret;
+ struct dm_device_zone_count zc = {
+ .start = start,
+ .len = len,
+ };
+
+ /*
+ * If the target is not the whole device, the device zone resources may
+ * be shared between different targets. Check this by counting the
+ * number of mapped sequential zones: if this number is smaller than the
+ * total number of sequential zones of the target device, then resource
+ * sharing may happen and the zone limits will not be reliable.
+ */
+ ret = dm_device_count_zones(dev, &zc);
+ if (ret) {
+ DMERR("Count %s zones failed %d", disk->disk_name, ret);
+ return ret;
+ }
+
+ /*
+ * If the target does not map any sequential zones, then we do not need
+ * any zone resource limits.
+ */
+ if (!zc.target_nr_seq_zones)
+ return 0;
+
+ /*
+ * If the target does not map all sequential zones, the limits
+ * will not be reliable.
+ */
+ if (zc.target_nr_seq_zones < zc.total_nr_seq_zones)
+ zlim->reliable_limits = false;
+
+ /*
+ * If the target maps fewer sequential zones than the limit values, then
+ * we do not have limits for this target.
+ */
+ max_active_zones = disk->queue->limits.max_active_zones;
+ if (max_active_zones >= zc.target_nr_seq_zones)
+ max_active_zones = 0;
+ zlim->lim->max_active_zones =
+ min_not_zero(max_active_zones, zlim->lim->max_active_zones);
+
+ max_open_zones = disk->queue->limits.max_open_zones;
+ if (max_open_zones >= zc.target_nr_seq_zones)
+ max_open_zones = 0;
+ zlim->lim->max_open_zones =
+ min_not_zero(max_open_zones, zlim->lim->max_open_zones);
+
+ /*
+ * Also count the total number of sequential zones for the mapped
+ * device so that when we are done inspecting all its targets, we are
+ * able to check if the mapped device actually has any sequential zones.
+ */
+ zlim->mapped_nr_seq_zones += zc.target_nr_seq_zones;
+
+ return 0;
+}
+
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *lim)
{
struct mapped_device *md = t->md;
struct gendisk *disk = md->disk;
- unsigned int nr_conv_zones = 0;
- int ret;
+ struct dm_zone_resource_limits zlim = {
+ .reliable_limits = true,
+ .lim = lim,
+ };
/*
* Check if zone append is natively supported, and if not, set the
@@ -240,46 +345,55 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
lim->max_zone_append_sectors = 0;
}
- if (!get_capacity(md->disk))
- return 0;
-
/*
- * Count conventional zones to check that the mapped device will indeed
- * have sequential write required zones.
+ * Determine the max open and max active zone limits for the mapped
+ * device by inspecting the zone resource limits and the zones mapped
+ * by each target.
*/
- md->zone_revalidate_map = t;
- ret = dm_blk_report_zones(disk, 0, UINT_MAX,
- dm_check_zoned_cb, &nr_conv_zones);
- md->zone_revalidate_map = NULL;
- if (ret < 0) {
- DMERR("Check zoned failed %d", ret);
- return ret;
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+
+ if (!ti->type->iterate_devices ||
+ ti->type->iterate_devices(ti,
+ device_get_zone_resource_limits, &zlim)) {
+ DMERR("Could not determine %s zone resource limits",
+ disk->disk_name);
+ return -ENODEV;
+ }
}
/*
- * If we only have conventional zones, expose the mapped device as
- * a regular device.
+ * If we only have conventional zones mapped, expose the mapped device
+ * as a regular device.
*/
- if (nr_conv_zones >= ret) {
+ if (!zlim.mapped_nr_seq_zones) {
lim->max_open_zones = 0;
lim->max_active_zones = 0;
+ lim->max_zone_append_sectors = 0;
+ lim->zone_write_granularity = 0;
+ lim->chunk_sectors = 0;
lim->features &= ~BLK_FEAT_ZONED;
clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ md->nr_zones = 0;
disk->nr_zones = 0;
return 0;
}
- if (!md->disk->nr_zones) {
- DMINFO("%s using %s zone append",
- md->disk->disk_name,
- queue_emulates_zone_append(q) ? "emulated" : "native");
- }
-
- ret = dm_revalidate_zones(md, t);
- if (ret < 0)
- return ret;
+ /*
+ * Warn once (when the capacity is not yet set) if the mapped device is
+ * partially using zone resources of the target devices as that leads to
+ * unreliable limits, i.e. if another mapped device uses the same
+ * underlying devices, we cannot enforce zone limits to guarantee that
+ * writing will not lead to errors. Note that we really should return
+ * an error for such a case, but there is no easy way to find out if
+ * another mapped device uses the same underlying zoned devices.
+ */
+ if (!get_capacity(disk) && !zlim.reliable_limits)
+ DMWARN("%s zone resource limits may be unreliable",
+ disk->disk_name);
- if (!static_key_enabled(&zoned_enabled.key))
+ if (lim->features & BLK_FEAT_ZONED &&
+ !static_key_enabled(&zoned_enabled.key))
static_branch_enable(&zoned_enabled);
return 0;
}
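
The aggregation above relies on min_not_zero(), where 0 means "no limit". A worked example with made-up targets:

    /*
     * Hypothetical table with two targets: target A maps zones of a
     * device with max_open_zones = 8, target B maps a device with no
     * limit (0). Then:
     *
     *   min_not_zero(8, 0) == 8  -> the table-wide limit is 8;
     *   min_not_zero(0, 0) == 0  -> both unlimited, table unlimited;
     *
     * and any target whose device limit is >= its mapped sequential
     * zones contributes 0 (no limit) per the checks above.
     */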
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 53ef8207fe2c..c984ecb64b1e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -103,6 +103,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
*/
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *lim);
+int dm_revalidate_zones(struct dm_table *t, struct request_queue *q);
void dm_zone_endio(struct dm_io *io, struct bio *clone);
#ifdef CONFIG_BLK_DEV_ZONED
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 0a2d37eb38ef..08232d8dc815 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -227,6 +227,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
struct block_device *bdev;
struct mddev *mddev = bitmap->mddev;
struct bitmap_storage *store = &bitmap->storage;
+ unsigned int bitmap_limit = (bitmap->storage.file_pages - pg_index) <<
+ PAGE_SHIFT;
loff_t sboff, offset = mddev->bitmap_info.offset;
sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE;
unsigned int size = PAGE_SIZE;
@@ -269,11 +271,9 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
if (size == 0)
/* bitmap runs in to data */
return -EINVAL;
- } else {
- /* DATA METADATA BITMAP - no problems */
}
- md_super_write(mddev, rdev, sboff + ps, (int) size, page);
+ md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page);
return 0;
}
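
The new clamp bounds each superblock write to the remaining bitmap storage. With illustrative numbers (PAGE_SIZE of 4096):

    /*
     * file_pages = 4, pg_index = 3:
     *   bitmap_limit = (4 - 3) << PAGE_SHIFT = 4096,
     * so a write for the last bitmap page is capped at one page and
     * cannot run past the end of the on-disk bitmap, even if the
     * computed size would otherwise be larger.
     */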
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f1c7d4f281c5..69ea54aedd99 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -69,6 +69,16 @@
#include "md-bitmap.h"
#include "md-cluster.h"
+static const char *action_name[NR_SYNC_ACTIONS] = {
+ [ACTION_RESYNC] = "resync",
+ [ACTION_RECOVER] = "recover",
+ [ACTION_CHECK] = "check",
+ [ACTION_REPAIR] = "repair",
+ [ACTION_RESHAPE] = "reshape",
+ [ACTION_FROZEN] = "frozen",
+ [ACTION_IDLE] = "idle",
+};
+
/* pers_list is a list of registered personalities protected by pers_lock. */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);
@@ -479,7 +489,6 @@ int mddev_suspend(struct mddev *mddev, bool interruptible)
*/
WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
- del_timer_sync(&mddev->safemode_timer);
/* restrict memory reclaim I/O during raid array is suspend */
mddev->noio_flag = memalloc_noio_save();
@@ -550,13 +559,9 @@ static void md_end_flush(struct bio *bio)
rdev_dec_pending(rdev, mddev);
- if (atomic_dec_and_test(&mddev->flush_pending)) {
- /* The pair is percpu_ref_get() from md_flush_request() */
- percpu_ref_put(&mddev->active_io);
-
+ if (atomic_dec_and_test(&mddev->flush_pending))
/* The pre-request flush has finished */
queue_work(md_wq, &mddev->flush_work);
- }
}
static void md_submit_flush_data(struct work_struct *ws);
@@ -587,12 +592,8 @@ static void submit_flushes(struct work_struct *ws)
rcu_read_lock();
}
rcu_read_unlock();
- if (atomic_dec_and_test(&mddev->flush_pending)) {
- /* The pair is percpu_ref_get() from md_flush_request() */
- percpu_ref_put(&mddev->active_io);
-
+ if (atomic_dec_and_test(&mddev->flush_pending))
queue_work(md_wq, &mddev->flush_work);
- }
}
static void md_submit_flush_data(struct work_struct *ws)
@@ -617,8 +618,20 @@ static void md_submit_flush_data(struct work_struct *ws)
bio_endio(bio);
} else {
bio->bi_opf &= ~REQ_PREFLUSH;
- md_handle_request(mddev, bio);
+
+ /*
+ * make_request() will never return an error here; it only
+ * returns an error in raid5_make_request() used by dm-raid.
+ * Since dm always splits data and flush operations into
+ * two separate bios, the io size of a flush submitted by
+ * dm is always 0, so make_request() will not be called here.
+ */
+ if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio)))
+ bio_io_error(bio);
}
+
+ /* The pair is percpu_ref_get() from md_flush_request() */
+ percpu_ref_put(&mddev->active_io);
}
/*
@@ -654,24 +667,22 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio)
WARN_ON(percpu_ref_is_zero(&mddev->active_io));
percpu_ref_get(&mddev->active_io);
mddev->flush_bio = bio;
- bio = NULL;
- }
- spin_unlock_irq(&mddev->lock);
-
- if (!bio) {
+ spin_unlock_irq(&mddev->lock);
INIT_WORK(&mddev->flush_work, submit_flushes);
queue_work(md_wq, &mddev->flush_work);
- } else {
- /* flush was performed for some other bio while we waited. */
- if (bio->bi_iter.bi_size == 0)
- /* an empty barrier - all done */
- bio_endio(bio);
- else {
- bio->bi_opf &= ~REQ_PREFLUSH;
- return false;
- }
+ return true;
}
- return true;
+
+ /* flush was performed for some other bio while we waited. */
+ spin_unlock_irq(&mddev->lock);
+ if (bio->bi_iter.bi_size == 0) {
+ /* pure flush without data - all done */
+ bio_endio(bio);
+ return true;
+ }
+
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ return false;
}
EXPORT_SYMBOL(md_flush_request);
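
The rework keeps the caller contract visible in the raid1/raid10/raid5 hunks below: a true return means md took ownership of the flush, a false return means a flush was already performed and the caller must submit the data part itself. A sketch of the call-site pattern:

    if (unlikely(bio->bi_opf & REQ_PREFLUSH) &&
        md_flush_request(mddev, bio))
    	return true;	/* flush queued or completed by md */
    /* REQ_PREFLUSH was cleared; submit the data portion normally */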
@@ -742,7 +753,6 @@ int mddev_init(struct mddev *mddev)
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
- mutex_init(&mddev->sync_mutex);
mutex_init(&mddev->suspend_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
@@ -758,7 +768,7 @@ int mddev_init(struct mddev *mddev)
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
mddev->reshape_backwards = 0;
- mddev->last_sync_action = "none";
+ mddev->last_sync_action = ACTION_IDLE;
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
@@ -4815,30 +4825,81 @@ out_unlock:
static struct md_sysfs_entry md_metadata =
__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
-static ssize_t
-action_show(struct mddev *mddev, char *page)
+enum sync_action md_sync_action(struct mddev *mddev)
{
- char *type = "idle";
unsigned long recovery = mddev->recovery;
+
+ /*
+ * frozen has the highest priority, means running sync_thread will be
+ * stopped immediately, and no new sync_thread can start.
+ */
if (test_bit(MD_RECOVERY_FROZEN, &recovery))
- type = "frozen";
- else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
- (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
- if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
- type = "reshape";
- else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
- if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
- type = "resync";
- else if (test_bit(MD_RECOVERY_CHECK, &recovery))
- type = "check";
- else
- type = "repair";
- } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
- type = "recover";
- else if (mddev->reshape_position != MaxSector)
- type = "reshape";
+ return ACTION_FROZEN;
+
+ /*
+ * read-only array can't register sync_thread, and it can only
+ * add/remove spares.
+ */
+ if (!md_is_rdwr(mddev))
+ return ACTION_IDLE;
+
+ /*
+ * idle means no sync_thread is running, and no new sync_thread is
+ * requested.
+ */
+ if (!test_bit(MD_RECOVERY_RUNNING, &recovery) &&
+ !test_bit(MD_RECOVERY_NEEDED, &recovery))
+ return ACTION_IDLE;
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
+ mddev->reshape_position != MaxSector)
+ return ACTION_RESHAPE;
+
+ if (test_bit(MD_RECOVERY_RECOVER, &recovery))
+ return ACTION_RECOVER;
+
+ if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
+ /*
+ * MD_RECOVERY_CHECK must be paired with
+ * MD_RECOVERY_REQUESTED.
+ */
+ if (test_bit(MD_RECOVERY_CHECK, &recovery))
+ return ACTION_CHECK;
+ if (test_bit(MD_RECOVERY_REQUESTED, &recovery))
+ return ACTION_REPAIR;
+ return ACTION_RESYNC;
}
- return sprintf(page, "%s\n", type);
+
+ /*
+ * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no
+ * sync_action is specified.
+ */
+ return ACTION_IDLE;
+}
+
+enum sync_action md_sync_action_by_name(const char *page)
+{
+ enum sync_action action;
+
+ for (action = 0; action < NR_SYNC_ACTIONS; ++action) {
+ if (cmd_match(page, action_name[action]))
+ return action;
+ }
+
+ return NR_SYNC_ACTIONS;
+}
+
+const char *md_sync_action_name(enum sync_action action)
+{
+ return action_name[action];
+}
+
+static ssize_t
+action_show(struct mddev *mddev, char *page)
+{
+ enum sync_action action = md_sync_action(mddev);
+
+ return sprintf(page, "%s\n", md_sync_action_name(action));
}
/**
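
With the enum in place, a sysfs store path can parse the user string once and branch on the result instead of chained cmd_match() calls; action_store() below follows this shape:

    enum sync_action action = md_sync_action_by_name(page);

    if (action == NR_SYNC_ACTIONS)	/* no action name matched */
    	return -EINVAL;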
@@ -4847,15 +4908,10 @@ action_show(struct mddev *mddev, char *page)
* @locked: if set, reconfig_mutex will still be held after this function
* return; if not set, reconfig_mutex will be released after this
* function return.
- * @check_seq: if set, only wait for curent running sync_thread to stop, noted
- * that new sync_thread can still start.
*/
-static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
+static void stop_sync_thread(struct mddev *mddev, bool locked)
{
- int sync_seq;
-
- if (check_seq)
- sync_seq = atomic_read(&mddev->sync_seq);
+ int sync_seq = atomic_read(&mddev->sync_seq);
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
if (!locked)
@@ -4876,7 +4932,8 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
wait_event(resync_wait,
!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
- (check_seq && sync_seq != atomic_read(&mddev->sync_seq)));
+ (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) &&
+ sync_seq != atomic_read(&mddev->sync_seq)));
if (locked)
mddev_lock_nointr(mddev);
@@ -4887,7 +4944,7 @@ void md_idle_sync_thread(struct mddev *mddev)
lockdep_assert_held(&mddev->reconfig_mutex);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- stop_sync_thread(mddev, true, true);
+ stop_sync_thread(mddev, true);
}
EXPORT_SYMBOL_GPL(md_idle_sync_thread);
@@ -4896,7 +4953,7 @@ void md_frozen_sync_thread(struct mddev *mddev)
lockdep_assert_held(&mddev->reconfig_mutex);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- stop_sync_thread(mddev, true, false);
+ stop_sync_thread(mddev, true);
}
EXPORT_SYMBOL_GPL(md_frozen_sync_thread);
@@ -4911,100 +4968,127 @@ void md_unfrozen_sync_thread(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);
-static void idle_sync_thread(struct mddev *mddev)
+static int mddev_start_reshape(struct mddev *mddev)
{
- mutex_lock(&mddev->sync_mutex);
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-
- if (mddev_lock(mddev)) {
- mutex_unlock(&mddev->sync_mutex);
- return;
- }
-
- stop_sync_thread(mddev, false, true);
- mutex_unlock(&mddev->sync_mutex);
-}
+ int ret;
-static void frozen_sync_thread(struct mddev *mddev)
-{
- mutex_lock(&mddev->sync_mutex);
- set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ if (mddev->pers->start_reshape == NULL)
+ return -EINVAL;
- if (mddev_lock(mddev)) {
- mutex_unlock(&mddev->sync_mutex);
- return;
+ if (mddev->reshape_position == MaxSector ||
+ mddev->pers->check_reshape == NULL ||
+ mddev->pers->check_reshape(mddev)) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ ret = mddev->pers->start_reshape(mddev);
+ if (ret)
+ return ret;
+ } else {
+ /*
+ * If reshape is still in progress, and md_check_recovery() can
+ * continue to reshape, don't restart reshape because data can
+ * be corrupted for raid456.
+ */
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
- stop_sync_thread(mddev, false, false);
- mutex_unlock(&mddev->sync_mutex);
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
+ return 0;
}
static ssize_t
action_store(struct mddev *mddev, const char *page, size_t len)
{
+ int ret;
+ enum sync_action action;
+
if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL;
+retry:
+ if (work_busy(&mddev->sync_work))
+ flush_work(&mddev->sync_work);
- if (cmd_match(page, "idle"))
- idle_sync_thread(mddev);
- else if (cmd_match(page, "frozen"))
- frozen_sync_thread(mddev);
- else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- return -EBUSY;
- else if (cmd_match(page, "resync"))
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- else if (cmd_match(page, "recover")) {
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- } else if (cmd_match(page, "reshape")) {
- int err;
- if (mddev->pers->start_reshape == NULL)
- return -EINVAL;
- err = mddev_lock(mddev);
- if (!err) {
- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
- err = -EBUSY;
- } else if (mddev->reshape_position == MaxSector ||
- mddev->pers->check_reshape == NULL ||
- mddev->pers->check_reshape(mddev)) {
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- err = mddev->pers->start_reshape(mddev);
- } else {
- /*
- * If reshape is still in progress, and
- * md_check_recovery() can continue to reshape,
- * don't restart reshape because data can be
- * corrupted for raid456.
- */
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- }
- mddev_unlock(mddev);
+ ret = mddev_lock(mddev);
+ if (ret)
+ return ret;
+
+ if (work_busy(&mddev->sync_work)) {
+ mddev_unlock(mddev);
+ goto retry;
+ }
+
+ action = md_sync_action_by_name(page);
+
+ /* TODO: mdadm relies on "idle" to start sync_thread. */
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+ switch (action) {
+ case ACTION_FROZEN:
+ md_frozen_sync_thread(mddev);
+ ret = len;
+ goto out;
+ case ACTION_IDLE:
+ md_idle_sync_thread(mddev);
+ break;
+ case ACTION_RESHAPE:
+ case ACTION_RECOVER:
+ case ACTION_CHECK:
+ case ACTION_REPAIR:
+ case ACTION_RESYNC:
+ ret = -EBUSY;
+ goto out;
+ default:
+ ret = -EINVAL;
+ goto out;
}
- if (err)
- return err;
- sysfs_notify_dirent_safe(mddev->sysfs_degraded);
} else {
- if (cmd_match(page, "check"))
+ switch (action) {
+ case ACTION_FROZEN:
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ ret = len;
+ goto out;
+ case ACTION_RESHAPE:
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ ret = mddev_start_reshape(mddev);
+ if (ret)
+ goto out;
+ break;
+ case ACTION_RECOVER:
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ break;
+ case ACTION_CHECK:
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
- else if (!cmd_match(page, "repair"))
- return -EINVAL;
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ fallthrough;
+ case ACTION_REPAIR:
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ fallthrough;
+ case ACTION_RESYNC:
+ case ACTION_IDLE:
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
}
+
if (mddev->ro == MD_AUTO_READ) {
/* A write to sync_action is enough to justify
* canceling read-auto mode
*/
- flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR;
md_wakeup_thread(mddev->sync_thread);
}
+
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
- return len;
+ ret = len;
+
+out:
+ mddev_unlock(mddev);
+ return ret;
}
static struct md_sysfs_entry md_scan_mode =
@@ -5013,7 +5097,8 @@ __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
static ssize_t
last_sync_action_show(struct mddev *mddev, char *page)
{
- return sprintf(page, "%s\n", mddev->last_sync_action);
+ return sprintf(page, "%s\n",
+ md_sync_action_name(mddev->last_sync_action));
}
static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
@@ -6380,7 +6465,7 @@ void md_stop_writes(struct mddev *mddev)
{
mddev_lock_nointr(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- stop_sync_thread(mddev, true, false);
+ stop_sync_thread(mddev, true);
__md_stop_writes(mddev);
mddev_unlock(mddev);
}
@@ -6448,7 +6533,7 @@ static int md_set_readonly(struct mddev *mddev)
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
- stop_sync_thread(mddev, false, false);
+ stop_sync_thread(mddev, false);
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
mddev_lock_nointr(mddev);
@@ -6494,7 +6579,7 @@ static int do_md_stop(struct mddev *mddev, int mode)
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
- stop_sync_thread(mddev, true, false);
+ stop_sync_thread(mddev, true);
if (mddev->sysfs_active ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
@@ -8575,12 +8660,12 @@ EXPORT_SYMBOL(md_done_sync);
* A return value of 'false' means that the write wasn't recorded
* and cannot proceed as the array is being suspended.
*/
-bool md_write_start(struct mddev *mddev, struct bio *bi)
+void md_write_start(struct mddev *mddev, struct bio *bi)
{
int did_change = 0;
if (bio_data_dir(bi) != WRITE)
- return true;
+ return;
BUG_ON(mddev->ro == MD_RDONLY);
if (mddev->ro == MD_AUTO_READ) {
@@ -8613,15 +8698,9 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
if (!mddev->has_superblocks)
- return true;
+ return;
wait_event(mddev->sb_wait,
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
- is_md_suspended(mddev));
- if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
- percpu_ref_put(&mddev->writes_pending);
- return false;
- }
- return true;
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
}
EXPORT_SYMBOL(md_write_start);
@@ -8769,6 +8848,77 @@ void md_allow_write(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(md_allow_write);
+static sector_t md_sync_max_sectors(struct mddev *mddev,
+ enum sync_action action)
+{
+ switch (action) {
+ case ACTION_RESYNC:
+ case ACTION_CHECK:
+ case ACTION_REPAIR:
+ atomic64_set(&mddev->resync_mismatches, 0);
+ fallthrough;
+ case ACTION_RESHAPE:
+ return mddev->resync_max_sectors;
+ case ACTION_RECOVER:
+ return mddev->dev_sectors;
+ default:
+ return 0;
+ }
+}
+
+static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
+{
+ sector_t start = 0;
+ struct md_rdev *rdev;
+
+ switch (action) {
+ case ACTION_CHECK:
+ case ACTION_REPAIR:
+ return mddev->resync_min;
+ case ACTION_RESYNC:
+ if (!mddev->bitmap)
+ return mddev->recovery_cp;
+ return 0;
+ case ACTION_RESHAPE:
+ /*
+ * If the original node aborts reshaping, then we continue the
+ * reshaping, so set the position again to avoid restarting the
+ * reshape from the very beginning
+ */
+ if (mddev_is_clustered(mddev) &&
+ mddev->reshape_position != MaxSector)
+ return mddev->reshape_position;
+ return 0;
+ case ACTION_RECOVER:
+ start = MaxSector;
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < start)
+ start = rdev->recovery_offset;
+ rcu_read_unlock();
+
+ /* If there is a bitmap, we need to make sure all
+ * writes that started before we added a spare
+ * complete before we start doing a recovery.
+ * Otherwise the write might complete and (via
+ * bitmap_endwrite) set a bit in the bitmap after the
+ * recovery has checked that bit and skipped that
+ * region.
+ */
+ if (mddev->bitmap) {
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ return start;
+ default:
+ return MaxSector;
+ }
+}
+
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
@@ -8785,7 +8935,8 @@ void md_do_sync(struct md_thread *thread)
sector_t last_check;
int skipped = 0;
struct md_rdev *rdev;
- char *desc, *action = NULL;
+ enum sync_action action;
+ const char *desc;
struct blk_plug plug;
int ret;
@@ -8816,21 +8967,9 @@ void md_do_sync(struct md_thread *thread)
goto skip;
}
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
- desc = "data-check";
- action = "check";
- } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
- desc = "requested-resync";
- action = "repair";
- } else
- desc = "resync";
- } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
- desc = "reshape";
- else
- desc = "recovery";
-
- mddev->last_sync_action = action ?: desc;
+ action = md_sync_action(mddev);
+ desc = md_sync_action_name(action);
+ mddev->last_sync_action = action;
/*
* Before starting a resync we must have set curr_resync to
@@ -8898,56 +9037,8 @@ void md_do_sync(struct md_thread *thread)
spin_unlock(&all_mddevs_lock);
} while (mddev->curr_resync < MD_RESYNC_DELAYED);
- j = 0;
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- /* resync follows the size requested by the personality,
- * which defaults to physical size, but can be virtual size
- */
- max_sectors = mddev->resync_max_sectors;
- atomic64_set(&mddev->resync_mismatches, 0);
- /* we don't use the checkpoint if there's a bitmap */
- if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
- j = mddev->resync_min;
- else if (!mddev->bitmap)
- j = mddev->recovery_cp;
-
- } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
- max_sectors = mddev->resync_max_sectors;
- /*
- * If the original node aborts reshaping then we continue the
- * reshaping, so set j again to avoid restart reshape from the
- * first beginning
- */
- if (mddev_is_clustered(mddev) &&
- mddev->reshape_position != MaxSector)
- j = mddev->reshape_position;
- } else {
- /* recovery follows the physical size of devices */
- max_sectors = mddev->dev_sectors;
- j = MaxSector;
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
- !test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags) &&
- !test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < j)
- j = rdev->recovery_offset;
- rcu_read_unlock();
-
- /* If there is a bitmap, we need to make sure all
- * writes that started before we added a spare
- * complete before we start doing a recovery.
- * Otherwise the write might complete and (via
- * bitmap_endwrite) set a bit in the bitmap after the
- * recovery has checked that bit and skipped that
- * region.
- */
- if (mddev->bitmap) {
- mddev->pers->quiesce(mddev, 1);
- mddev->pers->quiesce(mddev, 0);
- }
- }
+ max_sectors = md_sync_max_sectors(mddev, action);
+ j = md_sync_position(mddev, action);
pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev));
@@ -9029,7 +9120,8 @@ void md_do_sync(struct md_thread *thread)
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
- sectors = mddev->pers->sync_request(mddev, j, &skipped);
+ sectors = mddev->pers->sync_request(mddev, j, max_sectors,
+ &skipped);
if (sectors == 0) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
break;
@@ -9119,7 +9211,7 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
- mddev->pers->sync_request(mddev, max_sectors, &skipped);
+ mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > MD_RESYNC_ACTIVE) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 6733b0b0abf9..c4d7ebf9587d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -34,6 +34,61 @@
*/
#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
+/* Status of sync thread. */
+enum sync_action {
+ /*
+ * Represented by MD_RECOVERY_SYNC, starts when:
+ * 1) after assemble, sync data from first rdev to other copies, this
+ * must be done first before other sync actions and will only execute
+ * once;
+ * 2) resize the array (note that this is not reshape), sync data for
+ * the new range;
+ */
+ ACTION_RESYNC,
+ /*
+ * Represented by MD_RECOVERY_RECOVER, starts when:
+ * 1) for new replacement, sync data based on the replace rdev or
+ * available copies from other rdev;
+ * 2) for new member disk while the array is degraded, sync data from
+ * other rdev;
+ * 3) reassemble after power failure or re-add a hot removed rdev, sync
+ * data from first rdev to other copies based on bitmap;
+ */
+ ACTION_RECOVER,
+ /*
+ * Represented by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED |
+ * MD_RECOVERY_CHECK, starts when the user echoes "check" to the sysfs
+ * api sync_action, used to check if data copies from different rdevs
+ * are the same. The number of mismatched sectors will be exported to
+ * the user by the sysfs api mismatch_cnt;
+ */
+ ACTION_CHECK,
+ /*
+ * Represented by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED, starts when
+ * the user echoes "repair" to the sysfs api sync_action, usually paired
+ * with ACTION_CHECK, used to force syncing data once the user has found
+ * inconsistent data;
+ */
+ ACTION_REPAIR,
+ /*
+ * Represented by MD_RECOVERY_RESHAPE, starts when a new member disk is
+ * added to the conf; note that this is different from spares or
+ * replacement;
+ */
+ ACTION_RESHAPE,
+ /*
+ * Represented by MD_RECOVERY_FROZEN, can be set via the sysfs api
+ * sync_action or internally, e.g. when setting the array read-only;
+ * forbids the above actions.
+ */
+ ACTION_FROZEN,
+ /*
+ * All above actions don't match.
+ */
+ ACTION_IDLE,
+ NR_SYNC_ACTIONS,
+};
+
/*
* The struct embedded in rdev is used to serialize IO.
*/
@@ -371,13 +426,12 @@ struct mddev {
struct md_thread __rcu *thread; /* management thread */
struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */
- /* 'last_sync_action' is initialized to "none". It is set when a
- * sync operation (i.e "data-check", "requested-resync", "resync",
- * "recovery", or "reshape") is started. It holds this value even
+ /*
+ * Set when a sync operation is started. It holds this value even
* when the sync thread is "frozen" (interrupted) or "idle" (stopped
- * or finished). It is overwritten when a new sync operation is begun.
+ * or finished). It is overwritten when a new sync operation is begun.
*/
- char *last_sync_action;
+ enum sync_action last_sync_action;
sector_t curr_resync; /* last block scheduled */
/* As resync requests can complete out of order, we cannot easily track
* how much resync has been completed. So we occasionally pause until
@@ -540,8 +594,6 @@ struct mddev {
*/
struct list_head deleting;
- /* Used to synchronize idle and frozen for action_store() */
- struct mutex sync_mutex;
/* The sequence number for sync thread */
atomic_t sync_seq;
@@ -551,22 +603,46 @@ struct mddev {
};
enum recovery_flags {
+ /* flags for sync thread running status */
+
+ /*
+ * set when one of the sync actions is set and a new sync thread needs
+ * to be registered, or just to add/remove spares from conf.
+ */
+ MD_RECOVERY_NEEDED,
+ /* sync thread is running, or about to be started */
+ MD_RECOVERY_RUNNING,
+ /* sync thread needs to be aborted for some reason */
+ MD_RECOVERY_INTR,
+ /* sync thread is done and is waiting to be unregistered */
+ MD_RECOVERY_DONE,
+ /* running sync thread must abort immediately, and not restart */
+ MD_RECOVERY_FROZEN,
+ /* waiting for pers->start() to finish */
+ MD_RECOVERY_WAIT,
+ /* interrupted because of an io-error */
+ MD_RECOVERY_ERROR,
+
+ /* flags that determine the sync action, see details in enum sync_action */
+
+ /* if just this flag is set, action is resync. */
+ MD_RECOVERY_SYNC,
+ /*
+ * paired with MD_RECOVERY_SYNC, if MD_RECOVERY_CHECK is not set,
+ * action is repair, meaning the user requested a resync.
+ */
+ MD_RECOVERY_REQUESTED,
/*
- * If neither SYNC or RESHAPE are set, then it is a recovery.
+ * paired with MD_RECOVERY_SYNC and MD_RECOVERY_REQUESTED, action is
+ * check.
*/
- MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */
- MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */
- MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. */
- MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */
- MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */
- MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */
- MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */
- MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */
- MD_RECOVERY_RESHAPE, /* A reshape is happening */
- MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */
- MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */
- MD_RECOVERY_WAIT, /* waiting for pers->start() to finish */
- MD_RESYNCING_REMOTE, /* remote node is running resync thread */
+ MD_RECOVERY_CHECK,
+ /* recovery, or need to try it */
+ MD_RECOVERY_RECOVER,
+ /* reshape */
+ MD_RECOVERY_RESHAPE,
+ /* remote node is running resync thread */
+ MD_RESYNCING_REMOTE,
};
enum md_ro_state {
@@ -653,7 +729,8 @@ struct md_personality
int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
int (*spare_active) (struct mddev *mddev);
- sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped);
+ sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr,
+ sector_t max_sector, int *skipped);
int (*resize) (struct mddev *mddev, sector_t sectors);
sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks);
int (*check_reshape) (struct mddev *mddev);
@@ -785,7 +862,10 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t
extern void md_wakeup_thread(struct md_thread __rcu *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
-extern bool md_write_start(struct mddev *mddev, struct bio *bi);
+extern enum sync_action md_sync_action(struct mddev *mddev);
+extern enum sync_action md_sync_action_by_name(const char *page);
+extern const char *md_sync_action_name(enum sync_action action);
+extern void md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
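
For reference, the flag combinations described piecemeal in the recovery_flags enum above, as md_sync_action() in md.c resolves them:

    /*
     * Flag combinations as resolved by md_sync_action():
     *
     *   resync:  MD_RECOVERY_SYNC
     *   repair:  MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED
     *   check:   MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED | MD_RECOVERY_CHECK
     *   recover: MD_RECOVERY_RECOVER
     *   reshape: MD_RECOVERY_RESHAPE
     *   frozen:  MD_RECOVERY_FROZEN (takes priority over all of the above)
     */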
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 779cad62f6f8..1a0eba65b8a9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1687,8 +1687,7 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
if (bio_data_dir(bio) == READ)
raid1_read_request(mddev, bio, sectors, NULL);
else {
- if (!md_write_start(mddev,bio))
- return false;
+ md_write_start(mddev,bio);
raid1_write_request(mddev, bio, sectors);
}
return true;
@@ -2754,12 +2753,12 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
*/
static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
- int *skipped)
+ sector_t max_sector, int *skipped)
{
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
struct bio *bio;
- sector_t max_sector, nr_sectors;
+ sector_t nr_sectors;
int disk = -1;
int i;
int wonly = -1;
@@ -2775,7 +2774,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (init_resync(conf))
return 0;
- max_sector = mddev->dev_sectors;
if (sector_nr >= max_sector) {
/* If we aborted, we need to abort the
* sync on the 'current' bitmap chunk (there will
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5f6885b53b69..3334aa803c83 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1836,8 +1836,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
&& md_flush_request(mddev, bio))
return true;
- if (!md_write_start(mddev, bio))
- return false;
+ md_write_start(mddev, bio);
if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
if (!raid10_handle_discard(mddev, bio))
@@ -3137,12 +3136,12 @@ static void raid10_set_cluster_sync_high(struct r10conf *conf)
*/
static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
- int *skipped)
+ sector_t max_sector, int *skipped)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
struct bio *biolist = NULL, *bio;
- sector_t max_sector, nr_sectors;
+ sector_t nr_sectors;
int i;
int max_sync;
sector_t sync_blocks;
@@ -3172,10 +3171,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
return 0;
skipped:
- max_sector = mddev->dev_sectors;
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
- test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
- max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) {
conf->cluster_sync_low = 0;
conf->cluster_sync_high = 0;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 72f91eaa3201..0192a6323f09 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6078,8 +6078,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
}
- if (!md_write_start(mddev, bi))
- return false;
+ md_write_start(mddev, bi);
/*
* If array is degraded, better not do chunk aligned read because
* later we might have to read it again in order to reconstruct
@@ -6255,7 +6254,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
if (mddev->reshape_backwards) {
- BUG_ON(writepos < reshape_sectors);
+ if (WARN_ON(writepos < reshape_sectors))
+ return MaxSector;
+
writepos -= reshape_sectors;
readpos += reshape_sectors;
safepos += reshape_sectors;
@@ -6273,14 +6274,18 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
* to set 'stripe_addr' which is where we will write to.
*/
if (mddev->reshape_backwards) {
- BUG_ON(conf->reshape_progress == 0);
+ if (WARN_ON(conf->reshape_progress == 0))
+ return MaxSector;
+
stripe_addr = writepos;
- BUG_ON((mddev->dev_sectors &
- ~((sector_t)reshape_sectors - 1))
- - reshape_sectors - stripe_addr
- != sector_nr);
+ if (WARN_ON((mddev->dev_sectors &
+ ~((sector_t)reshape_sectors - 1)) -
+ reshape_sectors - stripe_addr != sector_nr))
+ return MaxSector;
} else {
- BUG_ON(writepos != sector_nr + reshape_sectors);
+ if (WARN_ON(writepos != sector_nr + reshape_sectors))
+ return MaxSector;
+
stripe_addr = sector_nr;
}
@@ -6458,11 +6463,10 @@ ret:
}
static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
- int *skipped)
+ sector_t max_sector, int *skipped)
{
struct r5conf *conf = mddev->private;
struct stripe_head *sh;
- sector_t max_sector = mddev->dev_sectors;
sector_t sync_blocks;
int still_degraded = 0;
int i;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1fa2b148c206..0e8253c1507a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -931,7 +931,6 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset,
*/
static inline struct queue_limits
queue_limits_start_update(struct request_queue *q)
- __acquires(q->limits_lock)
{
mutex_lock(&q->limits_lock);
return q->limits;
diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
index ddb5644d4fd9..84f601d7068e 100644
--- a/rust/bindings/bindings_helper.h
+++ b/rust/bindings/bindings_helper.h
@@ -7,6 +7,8 @@
*/
#include <kunit/test.h>
+#include <linux/blk_types.h>
+#include <linux/blk-mq.h>
#include <linux/errname.h>
#include <linux/ethtool.h>
#include <linux/jiffies.h>
@@ -20,6 +22,7 @@
/* `bindgen` gets confused at certain things. */
const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN;
+const size_t RUST_CONST_HELPER_PAGE_SIZE = PAGE_SIZE;
const gfp_t RUST_CONST_HELPER_GFP_ATOMIC = GFP_ATOMIC;
const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL;
const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT;
diff --git a/rust/helpers.c b/rust/helpers.c
index 2c37a0f5d7a8..3df5217fb2ff 100644
--- a/rust/helpers.c
+++ b/rust/helpers.c
@@ -186,3 +186,19 @@ static_assert(
__alignof__(size_t) == __alignof__(uintptr_t),
"Rust code expects C `size_t` to match Rust `usize`"
);
+
+// This will soon be moved to a separate file, so no need to merge with above.
+#include <linux/blk-mq.h>
+#include <linux/blkdev.h>
+
+void *rust_helper_blk_mq_rq_to_pdu(struct request *rq)
+{
+ return blk_mq_rq_to_pdu(rq);
+}
+EXPORT_SYMBOL_GPL(rust_helper_blk_mq_rq_to_pdu);
+
+struct request *rust_helper_blk_mq_rq_from_pdu(void *pdu)
+{
+ return blk_mq_rq_from_pdu(pdu);
+}
+EXPORT_SYMBOL_GPL(rust_helper_blk_mq_rq_from_pdu);
diff --git a/rust/kernel/block.rs b/rust/kernel/block.rs
new file mode 100644
index 000000000000..150f710efe5b
--- /dev/null
+++ b/rust/kernel/block.rs
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Types for working with the block layer.
+
+pub mod mq;
diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs
new file mode 100644
index 000000000000..fb0f393c1cea
--- /dev/null
+++ b/rust/kernel/block/mq.rs
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This module provides types for implementing block drivers that interface
+//! with the blk-mq subsystem.
+//!
+//! To implement a block device driver, a Rust module must do the following:
+//!
+//! - Implement [`Operations`] for a type `T`.
+//! - Create a [`TagSet<T>`].
+//! - Create a [`GenDisk<T>`], via the [`GenDiskBuilder`].
+//! - Add the disk to the system by calling [`GenDiskBuilder::build`], passing in
+//! the `TagSet` reference.
+//!
+//! The types available in this module that have direct C counterparts are:
+//!
+//! - The [`TagSet`] type that abstracts the C type `struct blk_mq_tag_set`.
+//! - The [`GenDisk`] type that abstracts the C type `struct gendisk`.
+//! - The [`Request`] type that abstracts the C type `struct request`.
+//!
+//! The kernel will interface with the block device driver by calling the method
+//! implementations of the `Operations` trait.
+//!
+//! IO requests are passed to the driver as [`kernel::types::ARef<Request>`]
+//! instances. The `Request` type is a wrapper around the C `struct request`.
+//! The driver must mark the end of processing by calling one of the
+//! `Request::end` methods. Failure to do so can lead to deadlock or timeout
+//! errors. Please note that the C function `blk_mq_start_request` is implicitly
+//! called when the request is queued with the driver.
+//!
+//! The `TagSet` is responsible for creating and maintaining a mapping between
+//! `Request`s and integer ids as well as carrying a pointer to the vtable
+//! generated by `Operations`. This mapping is useful for associating
+//! completions from hardware with the correct `Request` instance. The `TagSet`
+//! determines the maximum queue depth by setting the number of `Request`
+//! instances available to the driver, and it determines the number of queues to
+//! instantiate for the driver. If possible, a driver should allocate one queue
+//! per core, to keep queue data local to a core.
+//!
+//! One `TagSet` instance can be shared between multiple `GenDisk` instances.
+//! This can be useful when implementing drivers where one piece of hardware
+//! with one set of IO resources is represented to the user as multiple disks.
+//!
+//! One significant difference between block device drivers implemented with
+//! these Rust abstractions and drivers implemented in C is that the Rust
+//! drivers have to own a reference count on the `Request` type when the IO is
+//! in flight. This is to ensure that the C `struct request` instances backing
+//! the Rust `Request` instances are live while the Rust driver holds a
+//! reference to the `Request`. In addition, the conversion of an integer tag to
+//! a `Request` via the `TagSet` would not be sound without this bookkeeping.
+//!
+//! [`GenDisk`]: gen_disk::GenDisk
+//! [`GenDisk<T>`]: gen_disk::GenDisk
+//! [`GenDiskBuilder`]: gen_disk::GenDiskBuilder
+//! [`GenDiskBuilder::build`]: gen_disk::GenDiskBuilder::build
+//!
+//! # Example
+//!
+//! ```rust
+//! use kernel::{
+//! alloc::flags,
+//! block::mq::*,
+//! new_mutex,
+//! prelude::*,
+//! sync::{Arc, Mutex},
+//! types::{ARef, ForeignOwnable},
+//! };
+//!
+//! struct MyBlkDevice;
+//!
+//! #[vtable]
+//! impl Operations for MyBlkDevice {
+//!
+//! fn queue_rq(rq: ARef<Request<Self>>, _is_last: bool) -> Result {
+//! Request::end_ok(rq);
+//! Ok(())
+//! }
+//!
+//! fn commit_rqs() {}
+//! }
+//!
+//! let tagset: Arc<TagSet<MyBlkDevice>> =
+//! Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?;
+//! let mut disk = gen_disk::GenDiskBuilder::new()
+//! .capacity_sectors(4096)
+//! .build(format_args!("myblk"), tagset)?;
+//!
+//! # Ok::<(), kernel::error::Error>(())
+//! ```
+
+pub mod gen_disk;
+mod operations;
+mod raw_writer;
+mod request;
+mod tag_set;
+
+pub use operations::Operations;
+pub use request::Request;
+pub use tag_set::TagSet;
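As a concrete illustration of the `TagSet` sharing described in the module documentation above, here is a hedged sketch that creates two disks backed by one tag set. The `MyBlkDevice` type is the hypothetical `Operations` implementation from the module example; the disk names and sizes are illustrative only.

```rust
use kernel::{
    alloc::flags,
    block::mq::{gen_disk, TagSet},
    prelude::*,
    sync::Arc,
};

// One `TagSet` instance, shared by two `GenDisk`s. `MyBlkDevice` is the
// hypothetical `Operations` implementation from the module example.
let tagset: Arc<TagSet<MyBlkDevice>> =
    Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?;

// Each disk holds its own `Arc` reference; the tag set is freed only
// after both disks have been dropped.
let disk_a = gen_disk::GenDiskBuilder::new()
    .capacity_sectors(4096)
    .build(format_args!("myblk_a"), tagset.clone())?;
let disk_b = gen_disk::GenDiskBuilder::new()
    .capacity_sectors(4096)
    .build(format_args!("myblk_b"), tagset)?;
```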
diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs
new file mode 100644
index 000000000000..f548a6199847
--- /dev/null
+++ b/rust/kernel/block/mq/gen_disk.rs
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Generic disk abstraction.
+//!
+//! C header: [`include/linux/blkdev.h`](srctree/include/linux/blkdev.h)
+//! C header: [`include/linux/blk_mq.h`](srctree/include/linux/blk_mq.h)
+
+use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet};
+use crate::error;
+use crate::{bindings, error::from_err_ptr, error::Result, sync::Arc};
+use core::fmt::{self, Write};
+
+/// A builder for [`GenDisk`].
+///
+/// Use this struct to configure and add a new [`GenDisk`] to the VFS.
+pub struct GenDiskBuilder {
+ rotational: bool,
+ logical_block_size: u32,
+ physical_block_size: u32,
+ capacity_sectors: u64,
+}
+
+impl Default for GenDiskBuilder {
+ fn default() -> Self {
+ Self {
+ rotational: false,
+ logical_block_size: bindings::PAGE_SIZE as u32,
+ physical_block_size: bindings::PAGE_SIZE as u32,
+ capacity_sectors: 0,
+ }
+ }
+}
+
+impl GenDiskBuilder {
+ /// Create a new instance.
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Set the rotational media attribute for the device to be built.
+ pub fn rotational(mut self, rotational: bool) -> Self {
+ self.rotational = rotational;
+ self
+ }
+
+ /// Validate block size by verifying that it is between 512 and `PAGE_SIZE`,
+ /// and that it is a power of two.
+ fn validate_block_size(size: u32) -> Result<()> {
+ if !(512..=bindings::PAGE_SIZE as u32).contains(&size) || !size.is_power_of_two() {
+ Err(error::code::EINVAL)
+ } else {
+ Ok(())
+ }
+ }
+
+ /// Set the logical block size of the device to be built.
+ ///
+    /// This method will check that the block size is a power of two and between
+    /// 512 and `PAGE_SIZE`. If not, an error is returned and the block size is
+    /// not set.
+ ///
+ /// This is the smallest unit the storage device can address. It is
+ /// typically 4096 bytes.
+ pub fn logical_block_size(mut self, block_size: u32) -> Result<Self> {
+ Self::validate_block_size(block_size)?;
+ self.logical_block_size = block_size;
+ Ok(self)
+ }
+
+ /// Set the physical block size of the device to be built.
+ ///
+    /// This method will check that the block size is a power of two and between
+    /// 512 and `PAGE_SIZE`. If not, an error is returned and the block size is
+    /// not set.
+ ///
+ /// This is the smallest unit a physical storage device can write
+ /// atomically. It is usually the same as the logical block size but may be
+ /// bigger. One example is SATA drives with 4096 byte physical block size
+ /// that expose a 512 byte logical block size to the operating system.
+ pub fn physical_block_size(mut self, block_size: u32) -> Result<Self> {
+ Self::validate_block_size(block_size)?;
+ self.physical_block_size = block_size;
+ Ok(self)
+ }
+
+ /// Set the capacity of the device to be built, in sectors (512 bytes).
+ pub fn capacity_sectors(mut self, capacity: u64) -> Self {
+ self.capacity_sectors = capacity;
+ self
+ }
+
+ /// Build a new `GenDisk` and add it to the VFS.
+ pub fn build<T: Operations>(
+ self,
+ name: fmt::Arguments<'_>,
+ tagset: Arc<TagSet<T>>,
+ ) -> Result<GenDisk<T>> {
+ let lock_class_key = crate::sync::LockClassKey::new();
+
+        // SAFETY: `bindings::queue_limits` contains only fields that are valid when zeroed.
+ let mut lim: bindings::queue_limits = unsafe { core::mem::zeroed() };
+
+ lim.logical_block_size = self.logical_block_size;
+ lim.physical_block_size = self.physical_block_size;
+ if self.rotational {
+ lim.features = bindings::BLK_FEAT_ROTATIONAL;
+ }
+
+ // SAFETY: `tagset.raw_tag_set()` points to a valid and initialized tag set
+ let gendisk = from_err_ptr(unsafe {
+ bindings::__blk_mq_alloc_disk(
+ tagset.raw_tag_set(),
+ &mut lim,
+ core::ptr::null_mut(),
+ lock_class_key.as_ptr(),
+ )
+ })?;
+
+ const TABLE: bindings::block_device_operations = bindings::block_device_operations {
+ submit_bio: None,
+ open: None,
+ release: None,
+ ioctl: None,
+ compat_ioctl: None,
+ check_events: None,
+ unlock_native_capacity: None,
+ getgeo: None,
+ set_read_only: None,
+ swap_slot_free_notify: None,
+ report_zones: None,
+ devnode: None,
+ alternative_gpt_sector: None,
+ get_unique_id: None,
+ // TODO: Set to THIS_MODULE. Waiting for const_refs_to_static feature to
+ // be merged (unstable in rustc 1.78 which is staged for linux 6.10)
+ // https://github.com/rust-lang/rust/issues/119618
+ owner: core::ptr::null_mut(),
+ pr_ops: core::ptr::null_mut(),
+ free_disk: None,
+ poll_bio: None,
+ };
+
+ // SAFETY: `gendisk` is a valid pointer as we initialized it above
+ unsafe { (*gendisk).fops = &TABLE };
+
+ let mut raw_writer = RawWriter::from_array(
+ // SAFETY: `gendisk` points to a valid and initialized instance. We
+ // have exclusive access, since the disk is not added to the VFS
+ // yet.
+ unsafe { &mut (*gendisk).disk_name },
+ )?;
+ raw_writer.write_fmt(name)?;
+ raw_writer.write_char('\0')?;
+
+ // SAFETY: `gendisk` points to a valid and initialized instance of
+ // `struct gendisk`. `set_capacity` takes a lock to synchronize this
+ // operation, so we will not race.
+ unsafe { bindings::set_capacity(gendisk, self.capacity_sectors) };
+
+ crate::error::to_result(
+ // SAFETY: `gendisk` points to a valid and initialized instance of
+ // `struct gendisk`.
+ unsafe {
+ bindings::device_add_disk(core::ptr::null_mut(), gendisk, core::ptr::null_mut())
+ },
+ )?;
+
+ // INVARIANT: `gendisk` was initialized above.
+ // INVARIANT: `gendisk` was added to the VFS via `device_add_disk` above.
+ Ok(GenDisk {
+ _tagset: tagset,
+ gendisk,
+ })
+ }
+}
+
+/// A generic block device.
+///
+/// # Invariants
+///
+/// - `gendisk` must always point to an initialized and valid `struct gendisk`.
+/// - `gendisk` was added to the VFS through a call to
+/// `bindings::device_add_disk`.
+pub struct GenDisk<T: Operations> {
+ _tagset: Arc<TagSet<T>>,
+ gendisk: *mut bindings::gendisk,
+}
+
+// SAFETY: `GenDisk` is an owned pointer to a `struct gendisk` and an `Arc` to
+// a `TagSet`. It is safe to send this to other threads as long as `T` is
+// `Send`.
+unsafe impl<T: Operations + Send> Send for GenDisk<T> {}
+
+impl<T: Operations> Drop for GenDisk<T> {
+ fn drop(&mut self) {
+ // SAFETY: By type invariant, `self.gendisk` points to a valid and
+ // initialized instance of `struct gendisk`, and it was previously added
+ // to the VFS.
+ unsafe { bindings::del_gendisk(self.gendisk) };
+ }
+}
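To make the builder flow above concrete, the following hedged sketch configures block sizes and capacity. The `tagset` and `MyBlkDevice` names carry over from the module example, and the values are illustrative assumptions, not requirements.

```rust
use kernel::block::mq::gen_disk::GenDiskBuilder;

// A 512e-style configuration: 512-byte logical blocks exposed on top of
// 4096-byte physical blocks. Both values pass `validate_block_size` on a
// kernel with 4 KiB pages, being powers of two within 512..=PAGE_SIZE.
let disk = GenDiskBuilder::new()
    .rotational(false)
    .logical_block_size(512)?
    .physical_block_size(4096)?
    .capacity_sectors(1 << 11) // 1 MiB expressed in 512-byte sectors
    .build(format_args!("myblk"), tagset)?;
```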
diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs
new file mode 100644
index 000000000000..9ba7fdfeb4b2
--- /dev/null
+++ b/rust/kernel/block/mq/operations.rs
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This module provides an interface for blk-mq drivers to implement.
+//!
+//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h)
+
+use crate::{
+ bindings,
+ block::mq::request::RequestDataWrapper,
+ block::mq::Request,
+ error::{from_result, Result},
+ types::ARef,
+};
+use core::{marker::PhantomData, sync::atomic::AtomicU64, sync::atomic::Ordering};
+
+/// Implement this trait to interface a block device driver with blk-mq.
+///
+/// To implement a block device driver, implement this trait as described in the
+/// [module level documentation]. The kernel will use the implementation of the
+/// functions defined in this trait to interface a block device driver. Note:
+/// There is no need for an `exit_request()` implementation, because the `drop`
+/// implementation of the [`Request`] type will be invoked automatically by
+/// the C/Rust glue logic.
+///
+/// [module level documentation]: kernel::block::mq
+#[macros::vtable]
+pub trait Operations: Sized {
+ /// Called by the kernel to queue a request with the driver. If `is_last` is
+ /// `false`, the driver is allowed to defer committing the request.
+ fn queue_rq(rq: ARef<Request<Self>>, is_last: bool) -> Result;
+
+ /// Called by the kernel to indicate that queued requests should be submitted.
+ fn commit_rqs();
+
+ /// Called by the kernel to poll the device for completed requests. Only
+ /// used for poll queues.
+ fn poll() -> bool {
+ crate::build_error(crate::error::VTABLE_DEFAULT_ERROR)
+ }
+}
+
+/// A vtable for blk-mq to interact with a block device driver.
+///
+/// A `bindings::blk_mq_ops` vtable is constructed from pointers to the `extern
+/// "C"` functions of this struct, exposed through the `OperationsVTable::VTABLE`.
+///
+/// For general documentation of these methods, see the kernel source
+/// documentation related to `struct blk_mq_ops` in
+/// [`include/linux/blk-mq.h`].
+///
+/// [`include/linux/blk-mq.h`]: srctree/include/linux/blk-mq.h
+pub(crate) struct OperationsVTable<T: Operations>(PhantomData<T>);
+
+impl<T: Operations> OperationsVTable<T> {
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// - The caller of this function must ensure that the pointee of `bd` is
+ /// valid for reads for the duration of this function.
+ /// - This function must be called for an initialized and live `hctx`. That
+ /// is, `Self::init_hctx_callback` was called and
+ /// `Self::exit_hctx_callback()` was not yet called.
+    /// - `(*bd).rq` must point to an initialized and live `bindings::request`.
+ /// That is, `Self::init_request_callback` was called but
+ /// `Self::exit_request_callback` was not yet called for the request.
+ /// - `(*bd).rq` must be owned by the driver. That is, the block layer must
+ /// promise to not access the request until the driver calls
+ /// `bindings::blk_mq_end_request` for the request.
+ unsafe extern "C" fn queue_rq_callback(
+ _hctx: *mut bindings::blk_mq_hw_ctx,
+ bd: *const bindings::blk_mq_queue_data,
+ ) -> bindings::blk_status_t {
+ // SAFETY: `bd.rq` is valid as required by the safety requirement for
+ // this function.
+ let request = unsafe { &*(*bd).rq.cast::<Request<T>>() };
+
+        // One refcount for the `ARef`, one for the request being in flight.
+ request.wrapper_ref().refcount().store(2, Ordering::Relaxed);
+
+ // SAFETY:
+ // - We own a refcount that we took above. We pass that to `ARef`.
+ // - By the safety requirements of this function, `request` is a valid
+ // `struct request` and the private data is properly initialized.
+ // - `rq` will be alive until `blk_mq_end_request` is called and is
+ // reference counted by `ARef` until then.
+ let rq = unsafe { Request::aref_from_raw((*bd).rq) };
+
+ // SAFETY: We have exclusive access and we just set the refcount above.
+ unsafe { Request::start_unchecked(&rq) };
+
+ let ret = T::queue_rq(
+ rq,
+ // SAFETY: `bd` is valid as required by the safety requirement for
+ // this function.
+ unsafe { (*bd).last },
+ );
+
+ if let Err(e) = ret {
+ e.to_blk_status()
+ } else {
+ bindings::BLK_STS_OK as _
+ }
+ }
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// This function may only be called by blk-mq C infrastructure.
+ unsafe extern "C" fn commit_rqs_callback(_hctx: *mut bindings::blk_mq_hw_ctx) {
+ T::commit_rqs()
+ }
+
+ /// This function is called by the C kernel. It is not currently
+ /// implemented, and there is no way to exercise this code path.
+ ///
+ /// # Safety
+ ///
+ /// This function may only be called by blk-mq C infrastructure.
+ unsafe extern "C" fn complete_callback(_rq: *mut bindings::request) {}
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// This function may only be called by blk-mq C infrastructure.
+ unsafe extern "C" fn poll_callback(
+ _hctx: *mut bindings::blk_mq_hw_ctx,
+ _iob: *mut bindings::io_comp_batch,
+ ) -> core::ffi::c_int {
+ T::poll().into()
+ }
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// This function may only be called by blk-mq C infrastructure. This
+ /// function may only be called once before `exit_hctx_callback` is called
+ /// for the same context.
+ unsafe extern "C" fn init_hctx_callback(
+ _hctx: *mut bindings::blk_mq_hw_ctx,
+ _tagset_data: *mut core::ffi::c_void,
+ _hctx_idx: core::ffi::c_uint,
+ ) -> core::ffi::c_int {
+ from_result(|| Ok(0))
+ }
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// This function may only be called by blk-mq C infrastructure.
+ unsafe extern "C" fn exit_hctx_callback(
+ _hctx: *mut bindings::blk_mq_hw_ctx,
+ _hctx_idx: core::ffi::c_uint,
+ ) {
+ }
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// - This function may only be called by blk-mq C infrastructure.
+ /// - `_set` must point to an initialized `TagSet<T>`.
+ /// - `rq` must point to an initialized `bindings::request`.
+    /// - The allocation pointed to by `rq` must be at least the size of
+    ///   `Request` plus the size of `RequestDataWrapper`.
+ unsafe extern "C" fn init_request_callback(
+ _set: *mut bindings::blk_mq_tag_set,
+ rq: *mut bindings::request,
+ _hctx_idx: core::ffi::c_uint,
+ _numa_node: core::ffi::c_uint,
+ ) -> core::ffi::c_int {
+ from_result(|| {
+ // SAFETY: By the safety requirements of this function, `rq` points
+ // to a valid allocation.
+ let pdu = unsafe { Request::wrapper_ptr(rq.cast::<Request<T>>()) };
+
+ // SAFETY: The refcount field is allocated but not initialized, so
+ // it is valid for writes.
+ unsafe { RequestDataWrapper::refcount_ptr(pdu.as_ptr()).write(AtomicU64::new(0)) };
+
+ Ok(0)
+ })
+ }
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// - This function may only be called by blk-mq C infrastructure.
+ /// - `_set` must point to an initialized `TagSet<T>`.
+ /// - `rq` must point to an initialized and valid `Request`.
+ unsafe extern "C" fn exit_request_callback(
+ _set: *mut bindings::blk_mq_tag_set,
+ rq: *mut bindings::request,
+ _hctx_idx: core::ffi::c_uint,
+ ) {
+ // SAFETY: The tagset invariants guarantee that all requests are allocated with extra memory
+ // for the request data.
+ let pdu = unsafe { bindings::blk_mq_rq_to_pdu(rq) }.cast::<RequestDataWrapper>();
+
+        // SAFETY: `pdu` is valid for read and write and is properly initialized.
+ unsafe { core::ptr::drop_in_place(pdu) };
+ }
+
+ const VTABLE: bindings::blk_mq_ops = bindings::blk_mq_ops {
+ queue_rq: Some(Self::queue_rq_callback),
+ queue_rqs: None,
+ commit_rqs: Some(Self::commit_rqs_callback),
+ get_budget: None,
+ put_budget: None,
+ set_rq_budget_token: None,
+ get_rq_budget_token: None,
+ timeout: None,
+ poll: if T::HAS_POLL {
+ Some(Self::poll_callback)
+ } else {
+ None
+ },
+ complete: Some(Self::complete_callback),
+ init_hctx: Some(Self::init_hctx_callback),
+ exit_hctx: Some(Self::exit_hctx_callback),
+ init_request: Some(Self::init_request_callback),
+ exit_request: Some(Self::exit_request_callback),
+ cleanup_rq: None,
+ busy: None,
+ map_queues: None,
+ #[cfg(CONFIG_BLK_DEBUG_FS)]
+ show_rq: None,
+ };
+
+ pub(crate) const fn build() -> &'static bindings::blk_mq_ops {
+ &Self::VTABLE
+ }
+}
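As a hedged sketch of how the optional `poll` method interacts with the vtable above: `#[vtable]` generates the `HAS_POLL` constant, and only when it is true does `build()` install `poll_callback` in `blk_mq_ops::poll`. The `PolledDevice` type below is hypothetical.

```rust
use kernel::{
    block::mq::{Operations, Request},
    error::code,
    prelude::*,
    types::ARef,
};

struct PolledDevice; // hypothetical driver type

#[vtable]
impl Operations for PolledDevice {
    fn queue_rq(rq: ARef<Request<Self>>, _is_last: bool) -> Result {
        // Complete immediately; a real driver would submit to hardware
        // and end the request from its completion path.
        Request::end_ok(rq).map_err(|_| code::EIO)?;
        Ok(())
    }

    fn commit_rqs() {}

    // Because this method is provided, `Self::HAS_POLL` is true and
    // `poll_callback` is installed in the vtable.
    fn poll() -> bool {
        false // report whether any completions were processed
    }
}
```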
diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs
new file mode 100644
index 000000000000..9222465d670b
--- /dev/null
+++ b/rust/kernel/block/mq/raw_writer.rs
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+
+use core::fmt::{self, Write};
+
+use crate::error::Result;
+use crate::prelude::EINVAL;
+
+/// A mutable reference to a byte buffer into which a string can be written.
+///
+/// # Invariants
+///
+/// `buffer` is always null terminated.
+pub(crate) struct RawWriter<'a> {
+ buffer: &'a mut [u8],
+ pos: usize,
+}
+
+impl<'a> RawWriter<'a> {
+ /// Create a new `RawWriter` instance.
+ fn new(buffer: &'a mut [u8]) -> Result<RawWriter<'a>> {
+ *(buffer.last_mut().ok_or(EINVAL)?) = 0;
+
+ // INVARIANT: We null terminated the buffer above.
+ Ok(Self { buffer, pos: 0 })
+ }
+
+ pub(crate) fn from_array<const N: usize>(
+ a: &'a mut [core::ffi::c_char; N],
+ ) -> Result<RawWriter<'a>> {
+ Self::new(
+ // SAFETY: the buffer of `a` is valid for read and write as `u8` for
+ // at least `N` bytes.
+ unsafe { core::slice::from_raw_parts_mut(a.as_mut_ptr().cast::<u8>(), N) },
+ )
+ }
+}
+
+impl Write for RawWriter<'_> {
+ fn write_str(&mut self, s: &str) -> fmt::Result {
+ let bytes = s.as_bytes();
+ let len = bytes.len();
+
+ // We do not want to overwrite our null terminator
+ if self.pos + len > self.buffer.len() - 1 {
+ return Err(fmt::Error);
+ }
+
+ // INVARIANT: We are not overwriting the last byte
+ self.buffer[self.pos..self.pos + len].copy_from_slice(bytes);
+
+ self.pos += len;
+
+ Ok(())
+ }
+}
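A hedged sketch of the crate-internal usage pattern for `RawWriter`, mirroring what `GenDiskBuilder::build` does with `disk_name` above; the buffer size and name are arbitrary assumptions.

```rust
use core::fmt::Write;

// Format a NUL-terminated name into a fixed-size C char array. The
// writer reserves the final byte for the invariant null terminator, so
// overlong names fail with `fmt::Error` instead of truncating silently.
let mut name = [0 as core::ffi::c_char; 32];
let mut writer = RawWriter::from_array(&mut name)?;
writer.write_fmt(format_args!("myblk{}", 0))?;
writer.write_char('\0')?;
```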
diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs
new file mode 100644
index 000000000000..a0e22827f3f4
--- /dev/null
+++ b/rust/kernel/block/mq/request.rs
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This module provides a wrapper for the C `struct request` type.
+//!
+//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h)
+
+use crate::{
+ bindings,
+ block::mq::Operations,
+ error::Result,
+ types::{ARef, AlwaysRefCounted, Opaque},
+};
+use core::{
+ marker::PhantomData,
+ ptr::{addr_of_mut, NonNull},
+ sync::atomic::{AtomicU64, Ordering},
+};
+
+/// A wrapper around a blk-mq `struct request`. This represents an IO request.
+///
+/// # Implementation details
+///
+/// There are four states for a request that the Rust bindings care about:
+///
+/// A) Request is owned by block layer (refcount 0)
+/// B) Request is owned by driver but with zero `ARef`s in existence
+/// (refcount 1)
+/// C) Request is owned by driver with exactly one `ARef` in existence
+/// (refcount 2)
+/// D) Request is owned by driver with more than one `ARef` in existence
+/// (refcount > 2)
+///
+/// We need to track A and B to ensure we fail tag-to-request conversions for
+/// requests that are not owned by the driver.
+///
+/// We need to track C and D to ensure that it is safe to end the request and hand
+/// back ownership to the block layer.
+///
+/// The states are tracked through the private `refcount` field of
+/// `RequestDataWrapper`. This structure lives in the private data area of the C
+/// `struct request`.
+///
+/// # Invariants
+///
+/// * `self.0` is a valid `struct request` created by the C portion of the kernel.
+/// * The private data area associated with this request must be an initialized
+/// and valid `RequestDataWrapper<T>`.
+/// * `self` is reference counted by atomic modification of
+///   `self.wrapper_ref().refcount()`.
+#[repr(transparent)]
+pub struct Request<T: Operations>(Opaque<bindings::request>, PhantomData<T>);
+
+impl<T: Operations> Request<T> {
+ /// Create an `ARef<Request>` from a `struct request` pointer.
+ ///
+ /// # Safety
+ ///
+ /// * The caller must own a refcount on `ptr` that is transferred to the
+ /// returned `ARef`.
+ /// * The type invariants for `Request` must hold for the pointee of `ptr`.
+ pub(crate) unsafe fn aref_from_raw(ptr: *mut bindings::request) -> ARef<Self> {
+ // INVARIANT: By the safety requirements of this function, invariants are upheld.
+ // SAFETY: By the safety requirement of this function, we own a
+ // reference count that we can pass to `ARef`.
+ unsafe { ARef::from_raw(NonNull::new_unchecked(ptr as *const Self as *mut Self)) }
+ }
+
+ /// Notify the block layer that a request is going to be processed now.
+ ///
+ /// The block layer uses this hook to do proper initializations such as
+ /// starting the timeout timer. It is a requirement that block device
+ /// drivers call this function when starting to process a request.
+ ///
+ /// # Safety
+ ///
+ /// The caller must have exclusive ownership of `self`, that is
+ /// `self.wrapper_ref().refcount() == 2`.
+ pub(crate) unsafe fn start_unchecked(this: &ARef<Self>) {
+ // SAFETY: By type invariant, `self.0` is a valid `struct request` and
+ // we have exclusive access.
+ unsafe { bindings::blk_mq_start_request(this.0.get()) };
+ }
+
+ /// Try to take exclusive ownership of `this` by dropping the refcount to 0.
+ /// This fails if `this` is not the only `ARef` pointing to the underlying
+ /// `Request`.
+ ///
+ /// If the operation is successful, `Ok` is returned with a pointer to the
+ /// C `struct request`. If the operation fails, `this` is returned in the
+ /// `Err` variant.
+ fn try_set_end(this: ARef<Self>) -> Result<*mut bindings::request, ARef<Self>> {
+ // We can race with `TagSet::tag_to_rq`
+ if let Err(_old) = this.wrapper_ref().refcount().compare_exchange(
+ 2,
+ 0,
+ Ordering::Relaxed,
+ Ordering::Relaxed,
+ ) {
+ return Err(this);
+ }
+
+ let request_ptr = this.0.get();
+ core::mem::forget(this);
+
+ Ok(request_ptr)
+ }
+
+ /// Notify the block layer that the request has been completed without errors.
+ ///
+ /// This function will return `Err` if `this` is not the only `ARef`
+ /// referencing the request.
+ pub fn end_ok(this: ARef<Self>) -> Result<(), ARef<Self>> {
+ let request_ptr = Self::try_set_end(this)?;
+
+ // SAFETY: By type invariant, `this.0` was a valid `struct request`. The
+ // success of the call to `try_set_end` guarantees that there are no
+ // `ARef`s pointing to this request. Therefore it is safe to hand it
+ // back to the block layer.
+ unsafe { bindings::blk_mq_end_request(request_ptr, bindings::BLK_STS_OK as _) };
+
+ Ok(())
+ }
+
+ /// Return a pointer to the `RequestDataWrapper` stored in the private area
+ /// of the request structure.
+ ///
+ /// # Safety
+ ///
+    /// - `this` must point to a valid allocation of at least the size of
+    ///   `Self` plus the size of `RequestDataWrapper`.
+ pub(crate) unsafe fn wrapper_ptr(this: *mut Self) -> NonNull<RequestDataWrapper> {
+ let request_ptr = this.cast::<bindings::request>();
+ // SAFETY: By safety requirements for this function, `this` is a
+ // valid allocation.
+ let wrapper_ptr =
+ unsafe { bindings::blk_mq_rq_to_pdu(request_ptr).cast::<RequestDataWrapper>() };
+ // SAFETY: By C API contract, wrapper_ptr points to a valid allocation
+ // and is not null.
+ unsafe { NonNull::new_unchecked(wrapper_ptr) }
+ }
+
+ /// Return a reference to the `RequestDataWrapper` stored in the private
+ /// area of the request structure.
+ pub(crate) fn wrapper_ref(&self) -> &RequestDataWrapper {
+ // SAFETY: By type invariant, `self.0` is a valid allocation. Further,
+ // the private data associated with this request is initialized and
+ // valid. The existence of `&self` guarantees that the private data is
+ // valid as a shared reference.
+ unsafe { Self::wrapper_ptr(self as *const Self as *mut Self).as_ref() }
+ }
+}
+
+/// A wrapper around data stored in the private area of the C `struct request`.
+pub(crate) struct RequestDataWrapper {
+ /// The Rust request refcount has the following states:
+ ///
+ /// - 0: The request is owned by C block layer.
+ /// - 1: The request is owned by Rust abstractions but there are no ARef references to it.
+ /// - 2+: There are `ARef` references to the request.
+ refcount: AtomicU64,
+}
+
+impl RequestDataWrapper {
+ /// Return a reference to the refcount of the request that is embedding
+ /// `self`.
+ pub(crate) fn refcount(&self) -> &AtomicU64 {
+ &self.refcount
+ }
+
+ /// Return a pointer to the refcount of the request that is embedding the
+ /// pointee of `this`.
+ ///
+ /// # Safety
+ ///
+ /// - `this` must point to a live allocation of at least the size of `Self`.
+ pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut AtomicU64 {
+ // SAFETY: Because of the safety requirements of this function, the
+ // field projection is safe.
+ unsafe { addr_of_mut!((*this).refcount) }
+ }
+}
+
+// SAFETY: Exclusive access is thread-safe for `Request`. `Request` has no `&mut
+// self` methods and `&self` methods that mutate `self` are internally
+// synchronized.
+unsafe impl<T: Operations> Send for Request<T> {}
+
+// SAFETY: Shared access is thread-safe for `Request`. `&self` methods that
+// mutate `self` are internally synchronized.
+unsafe impl<T: Operations> Sync for Request<T> {}
+
+/// Store the result of `op(target.load())` in `target`, returning the new
+/// value of `target`.
+fn atomic_relaxed_op_return(target: &AtomicU64, op: impl Fn(u64) -> u64) -> u64 {
+ let old = target.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| Some(op(x)));
+
+    // SAFETY: Because the operation passed to `fetch_update` above always
+    // returns `Some`, `old` will always be `Ok`.
+ let old = unsafe { old.unwrap_unchecked() };
+
+ op(old)
+}
+
+/// Store the result of `op(target.load())` in `target` if `target.load() !=
+/// pred`, returning `true` if the target was updated.
+fn atomic_relaxed_op_unless(target: &AtomicU64, op: impl Fn(u64) -> u64, pred: u64) -> bool {
+ target
+ .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| {
+ if x == pred {
+ None
+ } else {
+ Some(op(x))
+ }
+ })
+ .is_ok()
+}
+
+// SAFETY: All instances of `Request<T>` are reference counted. This
+// implementation of `AlwaysRefCounted` ensures that incrementing the ref count
+// keeps the object alive in memory at least until a matching reference count
+// decrement is executed.
+unsafe impl<T: Operations> AlwaysRefCounted for Request<T> {
+ fn inc_ref(&self) {
+ let refcount = &self.wrapper_ref().refcount();
+
+ #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))]
+ let updated = atomic_relaxed_op_unless(refcount, |x| x + 1, 0);
+
+ #[cfg(CONFIG_DEBUG_MISC)]
+ if !updated {
+ panic!("Request refcount zero on clone")
+ }
+ }
+
+ unsafe fn dec_ref(obj: core::ptr::NonNull<Self>) {
+ // SAFETY: The type invariants of `ARef` guarantee that `obj` is valid
+ // for read.
+ let wrapper_ptr = unsafe { Self::wrapper_ptr(obj.as_ptr()).as_ptr() };
+ // SAFETY: The type invariant of `Request` guarantees that the private
+ // data area is initialized and valid.
+ let refcount = unsafe { &*RequestDataWrapper::refcount_ptr(wrapper_ptr) };
+
+ #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))]
+ let new_refcount = atomic_relaxed_op_return(refcount, |x| x - 1);
+
+ #[cfg(CONFIG_DEBUG_MISC)]
+ if new_refcount == 0 {
+ panic!("Request reached refcount zero in Rust abstractions");
+ }
+ }
+}
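The state machine documented on `Request` can be exercised as in the following hedged sketch: while more than one `ARef` exists (states C and D), `end_ok` refuses to complete the request and hands the reference back in the `Err` variant. The helper name is hypothetical.

```rust
use kernel::{
    block::mq::{Operations, Request},
    error::code,
    prelude::*,
    types::ARef,
};

// Hypothetical helper: try to end a request while an extra `ARef` is alive.
fn end_with_clone<T: Operations>(rq: ARef<Request<T>>) -> Result {
    let extra = rq.clone(); // state D: refcount > 2
    let rq = match Request::end_ok(rq) {
        // `end_ok` fails while `extra` keeps the refcount above 2 and
        // returns the `ARef` so the caller can retry later.
        Err(rq) => rq,
        Ok(()) => return Ok(()),
    };
    drop(extra); // back to state C: refcount == 2
    Request::end_ok(rq).map_err(|_| code::EIO)?;
    Ok(())
}
```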
diff --git a/rust/kernel/block/mq/tag_set.rs b/rust/kernel/block/mq/tag_set.rs
new file mode 100644
index 000000000000..f9a1ca655a35
--- /dev/null
+++ b/rust/kernel/block/mq/tag_set.rs
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This module provides the `TagSet` struct to wrap the C `struct blk_mq_tag_set`.
+//!
+//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h)
+
+use core::pin::Pin;
+
+use crate::{
+ bindings,
+ block::mq::{operations::OperationsVTable, request::RequestDataWrapper, Operations},
+ error,
+ prelude::PinInit,
+ try_pin_init,
+ types::Opaque,
+};
+use core::{convert::TryInto, marker::PhantomData};
+use macros::{pin_data, pinned_drop};
+
+/// A wrapper for the C `struct blk_mq_tag_set`.
+///
+/// `struct blk_mq_tag_set` contains a `struct list_head` and so must be pinned.
+///
+/// # Invariants
+///
+/// - `inner` is initialized and valid.
+#[pin_data(PinnedDrop)]
+#[repr(transparent)]
+pub struct TagSet<T: Operations> {
+ #[pin]
+ inner: Opaque<bindings::blk_mq_tag_set>,
+ _p: PhantomData<T>,
+}
+
+impl<T: Operations> TagSet<T> {
+ /// Try to create a new tag set
+ pub fn new(
+ nr_hw_queues: u32,
+ num_tags: u32,
+ num_maps: u32,
+ ) -> impl PinInit<Self, error::Error> {
+ // SAFETY: `blk_mq_tag_set` only contains integers and pointers, which
+ // all are allowed to be 0.
+ let tag_set: bindings::blk_mq_tag_set = unsafe { core::mem::zeroed() };
+ let tag_set = core::mem::size_of::<RequestDataWrapper>()
+ .try_into()
+ .map(|cmd_size| {
+ bindings::blk_mq_tag_set {
+ ops: OperationsVTable::<T>::build(),
+ nr_hw_queues,
+                    timeout: 0, // 0 means default, which is 30 * HZ (30 seconds) in C
+ numa_node: bindings::NUMA_NO_NODE,
+ queue_depth: num_tags,
+ cmd_size,
+ flags: bindings::BLK_MQ_F_SHOULD_MERGE,
+ driver_data: core::ptr::null_mut::<core::ffi::c_void>(),
+ nr_maps: num_maps,
+ ..tag_set
+ }
+ });
+
+ try_pin_init!(TagSet {
+ inner <- PinInit::<_, error::Error>::pin_chain(Opaque::new(tag_set?), |tag_set| {
+ // SAFETY: we do not move out of `tag_set`.
+ let tag_set = unsafe { Pin::get_unchecked_mut(tag_set) };
+ // SAFETY: `tag_set` is a reference to an initialized `blk_mq_tag_set`.
+                error::to_result(unsafe { bindings::blk_mq_alloc_tag_set(tag_set.get()) })
+ }),
+ _p: PhantomData,
+ })
+ }
+
+    /// Return a pointer to the wrapped `struct blk_mq_tag_set`.
+ pub(crate) fn raw_tag_set(&self) -> *mut bindings::blk_mq_tag_set {
+ self.inner.get()
+ }
+}
+
+#[pinned_drop]
+impl<T: Operations> PinnedDrop for TagSet<T> {
+ fn drop(self: Pin<&mut Self>) {
+ // SAFETY: By type invariant `inner` is valid and has been properly
+ // initialized during construction.
+ unsafe { bindings::blk_mq_free_tag_set(self.inner.get()) };
+ }
+}
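Since `struct blk_mq_tag_set` embeds a `struct list_head` that the C side links into a shared tag list once `blk_mq_alloc_tag_set` runs, moving the structure afterwards would corrupt those pointers; this is why `TagSet::new` returns a pin-initializer rather than a value. A hedged sketch of the intended use, with `MyBlkDevice` assumed from the module example:

```rust
use kernel::{alloc::flags, block::mq::TagSet, prelude::*, sync::Arc};

// Pin-initialize the tag set in place inside an `Arc`; it is never moved
// after `blk_mq_alloc_tag_set` has observed its address.
let tagset: Arc<TagSet<MyBlkDevice>> =
    Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?;
```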
diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs
index 55280ae9fe40..145f5c397009 100644
--- a/rust/kernel/error.rs
+++ b/rust/kernel/error.rs
@@ -126,6 +126,12 @@ impl Error {
self.0
}
+ #[cfg(CONFIG_BLOCK)]
+ pub(crate) fn to_blk_status(self) -> bindings::blk_status_t {
+ // SAFETY: `self.0` is a valid error due to its invariant.
+ unsafe { bindings::errno_to_blk_status(self.0) }
+ }
+
/// Returns the error encoded as a pointer.
#[allow(dead_code)]
pub(crate) fn to_ptr<T>(self) -> *mut T {
diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
index fbd91a48ff8b..2cf7c6b6f66b 100644
--- a/rust/kernel/lib.rs
+++ b/rust/kernel/lib.rs
@@ -27,6 +27,8 @@ compile_error!("Missing kernel configuration for conditional compilation");
extern crate self as kernel;
pub mod alloc;
+#[cfg(CONFIG_BLOCK)]
+pub mod block;
mod build_assert;
pub mod error;
pub mod init;