diff options
Diffstat (limited to 'drivers/md')
49 files changed, 2053 insertions, 1228 deletions
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 0454b0885b01..84291e38dca8 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -5,7 +5,7 @@ dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o \ - dm-rq.o + dm-rq.o dm-io-rewind.o dm-multipath-y += dm-path-selector.o dm-mpath.o dm-historical-service-time-y += dm-ps-historical-service-time.o dm-io-affinity-y += dm-ps-io-affinity.o @@ -83,6 +83,7 @@ obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o obj-$(CONFIG_DM_ZONED) += dm-zoned.o obj-$(CONFIG_DM_WRITECACHE) += dm-writecache.o +obj-$(CONFIG_SECURITY_LOADPIN_VERITY) += dm-verity-loadpin.o ifeq ($(CONFIG_DM_INIT),y) dm-mod-objs += dm-init.o diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index cf3e8096942a..529c9d04e9a4 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -29,7 +29,7 @@ config BCACHE_CLOSURES_DEBUG operations that get stuck. config BCACHE_ASYNC_REGISTRATION - bool "Asynchronous device registration (EXPERIMENTAL)" + bool "Asynchronous device registration" depends on BCACHE help Add a sysfs file /sys/fs/bcache/register_async. Writing registering diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index e136d6edc1ed..147c493a989a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -812,7 +812,7 @@ int bch_btree_cache_alloc(struct cache_set *c) c->shrink.seeks = 4; c->shrink.batch = c->btree_pages * 2; - if (register_shrinker(&c->shrink)) + if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid)) pr_warn("bcache: %s: could not register shrinker\n", __func__); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3563d15dbaf2..ba3909bb6bea 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -414,8 +414,8 @@ static void uuid_io_unlock(struct closure *cl) up(&c->uuid_write_mutex); } -static void uuid_io(struct cache_set *c, int op, unsigned long op_flags, - struct bkey *k, struct closure *parent) +static void uuid_io(struct cache_set *c, blk_opf_t opf, struct bkey *k, + struct closure *parent) { struct closure *cl = &c->uuid_write; struct uuid_entry *u; @@ -429,22 +429,22 @@ static void uuid_io(struct cache_set *c, int op, unsigned long op_flags, for (i = 0; i < KEY_PTRS(k); i++) { struct bio *bio = bch_bbio_alloc(c); - bio->bi_opf = REQ_SYNC | REQ_META | op_flags; + bio->bi_opf = opf | REQ_SYNC | REQ_META; bio->bi_iter.bi_size = KEY_SIZE(k) << 9; bio->bi_end_io = uuid_endio; bio->bi_private = cl; - bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); bch_bio_map(bio, c->uuids); bch_submit_bbio(bio, c, k, i); - if (op != REQ_OP_WRITE) + if ((opf & REQ_OP_MASK) != REQ_OP_WRITE) break; } bch_extent_to_text(buf, sizeof(buf), k); - pr_debug("%s UUIDs at %s\n", op == REQ_OP_WRITE ? "wrote" : "read", buf); + pr_debug("%s UUIDs at %s\n", (opf & REQ_OP_MASK) == REQ_OP_WRITE ? + "wrote" : "read", buf); for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) if (!bch_is_zero(u->uuid, 16)) @@ -463,7 +463,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) return "bad uuid pointer"; bkey_copy(&c->uuid_bucket, k); - uuid_io(c, REQ_OP_READ, 0, k, cl); + uuid_io(c, REQ_OP_READ, k, cl); if (j->version < BCACHE_JSET_VERSION_UUIDv1) { struct uuid_entry_v0 *u0 = (void *) c->uuids; @@ -511,7 +511,7 @@ static int __uuid_write(struct cache_set *c) size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS; SET_KEY_SIZE(&k.key, size); - uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl); + uuid_io(c, REQ_OP_WRITE, &k.key, &cl); closure_sync(&cl); /* Only one bucket used for uuid write */ @@ -587,8 +587,7 @@ static void prio_endio(struct bio *bio) closure_put(&ca->prio); } -static void prio_io(struct cache *ca, uint64_t bucket, int op, - unsigned long op_flags) +static void prio_io(struct cache *ca, uint64_t bucket, blk_opf_t opf) { struct closure *cl = &ca->prio; struct bio *bio = bch_bbio_alloc(ca->set); @@ -601,7 +600,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, bio->bi_end_io = prio_endio; bio->bi_private = ca; - bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); + bio->bi_opf = opf | REQ_SYNC | REQ_META; bch_bio_map(bio, ca->disk_buckets); closure_bio_submit(ca->set, bio, &ca->prio); @@ -661,7 +660,7 @@ int bch_prio_write(struct cache *ca, bool wait) BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); - prio_io(ca, bucket, REQ_OP_WRITE, 0); + prio_io(ca, bucket, REQ_OP_WRITE); mutex_lock(&ca->set->bucket_lock); ca->prio_buckets[i] = bucket; @@ -705,7 +704,7 @@ static int prio_read(struct cache *ca, uint64_t bucket) ca->prio_last_buckets[bucket_nr] = bucket; bucket_nr++; - prio_io(ca, bucket, REQ_OP_READ, 0); + prio_io(ca, bucket, REQ_OP_READ); if (p->csum != bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) { @@ -884,7 +883,7 @@ static void bcache_device_free(struct bcache_device *d) if (disk) { ida_simple_remove(&bcache_device_idx, first_minor_to_idx(disk->first_minor)); - blk_cleanup_disk(disk); + put_disk(disk); } bioset_exit(&d->bio_split); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 5ffa1dcf84cf..09c7ed2650ca 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/rbtree.h> #include <linux/stacktrace.h> +#include <linux/jump_label.h> #define DM_MSG_PREFIX "bufio" @@ -81,6 +82,8 @@ */ struct dm_bufio_client { struct mutex lock; + spinlock_t spinlock; + bool no_sleep; struct list_head lru[LIST_SIZE]; unsigned long n_buffers[LIST_SIZE]; @@ -90,7 +93,6 @@ struct dm_bufio_client { s8 sectors_per_block_bits; void (*alloc_callback)(struct dm_buffer *); void (*write_callback)(struct dm_buffer *); - struct kmem_cache *slab_buffer; struct kmem_cache *slab_cache; struct dm_io_client *dm_io; @@ -161,23 +163,34 @@ struct dm_buffer { #endif }; +static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); + /*----------------------------------------------------------------*/ #define dm_bufio_in_request() (!!current->bio_list) static void dm_bufio_lock(struct dm_bufio_client *c) { - mutex_lock_nested(&c->lock, dm_bufio_in_request()); + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) + spin_lock_bh(&c->spinlock); + else + mutex_lock_nested(&c->lock, dm_bufio_in_request()); } static int dm_bufio_trylock(struct dm_bufio_client *c) { - return mutex_trylock(&c->lock); + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) + return spin_trylock_bh(&c->spinlock); + else + return mutex_trylock(&c->lock); } static void dm_bufio_unlock(struct dm_bufio_client *c) { - mutex_unlock(&c->lock); + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) + spin_unlock_bh(&c->spinlock); + else + mutex_unlock(&c->lock); } /*----------------------------------------------------------------*/ @@ -577,13 +590,12 @@ static void dmio_complete(unsigned long error, void *context) b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0); } -static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, +static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector, unsigned n_sectors, unsigned offset) { int r; struct dm_io_request io_req = { - .bi_op = rw, - .bi_op_flags = 0, + .bi_opf = op, .notify.fn = dmio_complete, .notify.context = b, .client = b->c->dm_io, @@ -616,7 +628,7 @@ static void bio_complete(struct bio *bio) b->end_io(b, status); } -static void use_bio(struct dm_buffer *b, int rw, sector_t sector, +static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, unsigned n_sectors, unsigned offset) { struct bio *bio; @@ -630,10 +642,10 @@ static void use_bio(struct dm_buffer *b, int rw, sector_t sector, bio = bio_kmalloc(vec_size, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN); if (!bio) { dmio: - use_dmio(b, rw, sector, n_sectors, offset); + use_dmio(b, op, sector, n_sectors, offset); return; } - bio_init(bio, b->c->bdev, bio->bi_inline_vecs, vec_size, rw); + bio_init(bio, b->c->bdev, bio->bi_inline_vecs, vec_size, op); bio->bi_iter.bi_sector = sector; bio->bi_end_io = bio_complete; bio->bi_private = b; @@ -669,7 +681,8 @@ static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block return sector; } -static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t)) +static void submit_io(struct dm_buffer *b, enum req_op op, + void (*end_io)(struct dm_buffer *, blk_status_t)) { unsigned n_sectors; sector_t sector; @@ -679,7 +692,7 @@ static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buff sector = block_to_sector(b->c, b->block); - if (rw != REQ_OP_WRITE) { + if (op != REQ_OP_WRITE) { n_sectors = b->c->block_size >> SECTOR_SHIFT; offset = 0; } else { @@ -698,9 +711,9 @@ static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buff } if (b->data_mode != DATA_MODE_VMALLOC) - use_bio(b, rw, sector, n_sectors, offset); + use_bio(b, op, sector, n_sectors, offset); else - use_dmio(b, rw, sector, n_sectors, offset); + use_dmio(b, op, sector, n_sectors, offset); } /*---------------------------------------------------------------- @@ -802,6 +815,10 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) BUG_ON(test_bit(B_WRITING, &b->state)); BUG_ON(test_bit(B_DIRTY, &b->state)); + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep && + unlikely(test_bit(B_READING, &b->state))) + continue; + if (!b->hold_count) { __make_buffer_clean(b); __unlink_buffer(b); @@ -810,6 +827,9 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) cond_resched(); } + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) + return NULL; + list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) { BUG_ON(test_bit(B_READING, &b->state)); @@ -1341,8 +1361,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers); int dm_bufio_issue_flush(struct dm_bufio_client *c) { struct dm_io_request io_req = { - .bi_op = REQ_OP_WRITE, - .bi_op_flags = REQ_PREFLUSH | REQ_SYNC, + .bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC, .mem.type = DM_IO_KMEM, .mem.ptr.addr = NULL, .client = c->dm_io, @@ -1365,8 +1384,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_issue_flush); int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count) { struct dm_io_request io_req = { - .bi_op = REQ_OP_DISCARD, - .bi_op_flags = REQ_SYNC, + .bi_opf = REQ_OP_DISCARD | REQ_SYNC, .mem.type = DM_IO_KMEM, .mem.ptr.addr = NULL, .client = c->dm_io, @@ -1619,7 +1637,8 @@ static void drop_buffers(struct dm_bufio_client *c) */ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) { - if (!(gfp & __GFP_FS)) { + if (!(gfp & __GFP_FS) || + (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) { if (test_bit(B_READING, &b->state) || test_bit(B_WRITING, &b->state) || test_bit(B_DIRTY, &b->state)) @@ -1717,7 +1736,8 @@ static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrin struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size, unsigned reserved_buffers, unsigned aux_size, void (*alloc_callback)(struct dm_buffer *), - void (*write_callback)(struct dm_buffer *)) + void (*write_callback)(struct dm_buffer *), + unsigned int flags) { int r; struct dm_bufio_client *c; @@ -1747,12 +1767,18 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->alloc_callback = alloc_callback; c->write_callback = write_callback; + if (flags & DM_BUFIO_CLIENT_NO_SLEEP) { + c->no_sleep = true; + static_branch_inc(&no_sleep_enabled); + } + for (i = 0; i < LIST_SIZE; i++) { INIT_LIST_HEAD(&c->lru[i]); c->n_buffers[i] = 0; } mutex_init(&c->lock); + spin_lock_init(&c->spinlock); INIT_LIST_HEAD(&c->reserved_buffers); c->need_reserved_buffers = reserved_buffers; @@ -1806,7 +1832,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->shrinker.scan_objects = dm_bufio_shrink_scan; c->shrinker.seeks = 1; c->shrinker.batch = 0; - r = register_shrinker(&c->shrinker); + r = register_shrinker(&c->shrinker, "md-%s:(%u:%u)", slab_name, + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); if (r) goto bad; @@ -1878,6 +1905,8 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) kmem_cache_destroy(c->slab_buffer); dm_io_client_destroy(c->dm_io); mutex_destroy(&c->lock); + if (c->no_sleep) + static_branch_dec(&no_sleep_enabled); kfree(c); } EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 179ed5bf81a3..0905f2c1615e 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -131,7 +131,7 @@ void dm_cache_dump(struct dm_cache_metadata *cmd); * hints will be lost. * * The hints are indexed by the cblock, but many policies will not - * neccessarily have a fast way of accessing efficiently via cblock. So + * necessarily have a fast way of accessing efficiently via cblock. So * rather than querying the policy for each cblock, we let it walk its data * structures and fill in the hints in whatever order it wishes. */ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 28c5de8eca4a..54a8d5c9a44e 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2775,7 +2775,7 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, /* * The discard block size in the on disk metadata is not - * neccessarily the same as we're currently using. So we have to + * necessarily the same as we're currently using. So we have to * be careful to only set the discarded attribute if we know it * covers a complete block of the new size. */ diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index c954ff91870e..6c6bd24774f2 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -22,6 +22,8 @@ #define DM_RESERVED_MAX_IOS 1024 +struct dm_io; + struct dm_kobject_holder { struct kobject kobj; struct completion completion; @@ -91,6 +93,14 @@ struct mapped_device { spinlock_t deferred_lock; struct bio_list deferred; + /* + * requeue work context is needed for cloning one new bio + * to represent the dm_io to be requeued, since each + * dm_io may point to the original bio from FS. + */ + struct work_struct requeue_work; + struct dm_io *requeue_list; + void *interface_ptr; /* @@ -216,6 +226,13 @@ struct dm_table { #endif }; +static inline struct dm_target *dm_table_get_target(struct dm_table *t, + unsigned int index) +{ + BUG_ON(index >= t->num_targets); + return t->targets + index; +} + /* * One of these is allocated per clone bio. */ @@ -230,6 +247,9 @@ struct dm_target_io { sector_t old_sector; struct bio clone; }; +#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) +#define DM_IO_BIO_OFFSET \ + (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) /* * dm_target_io flags @@ -272,7 +292,6 @@ struct dm_io { atomic_t io_count; struct mapped_device *md; - struct bio *split_bio; /* The three fields represent mapped part of original bio */ struct bio *orig_bio; unsigned int sector_offset; /* offset to end of orig_bio */ @@ -300,6 +319,8 @@ static inline void dm_io_set_flag(struct dm_io *io, unsigned int bit) io->flags |= (1U << bit); } +void dm_io_rewind(struct dm_io *io, struct bio_set *bs); + static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) { return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 0221fa63f888..512cc6cea095 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -61,7 +61,8 @@ static inline bool __ebs_check_bs(unsigned int bs) * * copy blocks between bufio blocks and bio vector's (partial/overlapping) pages. */ -static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bvec_iter *iter) +static int __ebs_rw_bvec(struct ebs_c *ec, enum req_op op, struct bio_vec *bv, + struct bvec_iter *iter) { int r = 0; unsigned char *ba, *pa; @@ -81,7 +82,7 @@ static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bv cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len); /* Avoid reading for writes in case bio vector's page overwrites block completely. */ - if (rw == READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio)) + if (op == REQ_OP_READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio)) ba = dm_bufio_read(ec->bufio, block, &b); else ba = dm_bufio_new(ec->bufio, block, &b); @@ -95,7 +96,7 @@ static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bv } else { /* Copy data to/from bio to buffer if read/new was successful above. */ ba += buf_off; - if (rw == READ) { + if (op == REQ_OP_READ) { memcpy(pa, ba, cur_len); flush_dcache_page(bv->bv_page); } else { @@ -117,14 +118,14 @@ static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bv } /* READ/WRITE: iterate bio vector's copying between (partial) pages and bufio blocks. */ -static int __ebs_rw_bio(struct ebs_c *ec, int rw, struct bio *bio) +static int __ebs_rw_bio(struct ebs_c *ec, enum req_op op, struct bio *bio) { int r = 0, rr; struct bio_vec bv; struct bvec_iter iter; bio_for_each_bvec(bv, bio, iter) { - rr = __ebs_rw_bvec(ec, rw, &bv, &iter); + rr = __ebs_rw_bvec(ec, op, &bv, &iter); if (rr) r = rr; } @@ -205,10 +206,10 @@ static void __ebs_process_bios(struct work_struct *ws) bio_list_for_each(bio, &bios) { r = -EIO; if (bio_op(bio) == REQ_OP_READ) - r = __ebs_rw_bio(ec, READ, bio); + r = __ebs_rw_bio(ec, REQ_OP_READ, bio); else if (bio_op(bio) == REQ_OP_WRITE) { write = true; - r = __ebs_rw_bio(ec, WRITE, bio); + r = __ebs_rw_bio(ec, REQ_OP_WRITE, bio); } else if (bio_op(bio) == REQ_OP_DISCARD) { __ebs_forget_bio(ec, bio); r = __ebs_discard_bio(ec, bio); @@ -312,7 +313,8 @@ static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 0, NULL, NULL); + ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, + 0, NULL, NULL, 0); if (IS_ERR(ec->bufio)) { ti->error = "Cannot create dm bufio client"; r = PTR_ERR(ec->bufio); diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index f2305eb758a2..89fa7a68c6c4 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -32,7 +32,7 @@ struct flakey_c { unsigned corrupt_bio_byte; unsigned corrupt_bio_rw; unsigned corrupt_bio_value; - unsigned corrupt_bio_flags; + blk_opf_t corrupt_bio_flags; }; enum feature_flag_bits { @@ -145,7 +145,11 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, /* * Only corrupt bios with these flags set. */ - r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error); + BUILD_BUG_ON(sizeof(fc->corrupt_bio_flags) != + sizeof(unsigned int)); + r = dm_read_arg(_args + 3, as, + (__force unsigned *)&fc->corrupt_bio_flags, + &ti->error); if (r) return r; argc--; diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index 1842d3a958ef..a1bd7cd52b1b 100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -208,7 +208,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (!target_data_buf) goto error; - num_targets = dm_table_get_num_targets(table); + num_targets = table->num_targets; if (dm_ima_alloc_and_copy_device_data(table->md, &device_data_buf, num_targets, noio)) goto error; @@ -237,9 +237,6 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl for (i = 0; i < num_targets; i++) { struct dm_target *ti = dm_table_get_target(table, i); - if (!ti) - goto error; - last_target_measured = 0; /* diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 3d5a0ce123c9..aaf2472df6e5 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -298,7 +298,7 @@ struct dm_integrity_io { struct work_struct work; struct dm_integrity_c *ic; - enum req_opf op; + enum req_op op; bool fua; struct dm_integrity_range range; @@ -551,14 +551,14 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr) return 0; } -static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) +static int sync_rw_sb(struct dm_integrity_c *ic, blk_opf_t opf) { struct dm_io_request io_req; struct dm_io_region io_loc; + const enum req_op op = opf & REQ_OP_MASK; int r; - io_req.bi_op = op; - io_req.bi_op_flags = op_flags; + io_req.bi_opf = opf; io_req.mem.type = DM_IO_KMEM; io_req.mem.ptr.addr = ic->sb; io_req.notify.fn = NULL; @@ -1050,8 +1050,9 @@ static void complete_journal_io(unsigned long error, void *context) complete_journal_op(comp); } -static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags, - unsigned sector, unsigned n_sectors, struct journal_completion *comp) +static void rw_journal_sectors(struct dm_integrity_c *ic, blk_opf_t opf, + unsigned sector, unsigned n_sectors, + struct journal_completion *comp) { struct dm_io_request io_req; struct dm_io_region io_loc; @@ -1067,8 +1068,7 @@ static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags, pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); - io_req.bi_op = op; - io_req.bi_op_flags = op_flags; + io_req.bi_opf = opf; io_req.mem.type = DM_IO_PAGE_LIST; if (ic->journal_io) io_req.mem.ptr.pl = &ic->journal_io[pl_index]; @@ -1088,7 +1088,8 @@ static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags, r = dm_io(&io_req, 1, &io_loc, NULL); if (unlikely(r)) { - dm_integrity_io_error(ic, op == REQ_OP_READ ? "reading journal" : "writing journal", r); + dm_integrity_io_error(ic, (opf & REQ_OP_MASK) == REQ_OP_READ ? + "reading journal" : "writing journal", r); if (comp) { WARN_ONCE(1, "asynchronous dm_io failed: %d", r); complete_journal_io(-1UL, comp); @@ -1096,15 +1097,16 @@ static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags, } } -static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, - unsigned n_sections, struct journal_completion *comp) +static void rw_journal(struct dm_integrity_c *ic, blk_opf_t opf, + unsigned section, unsigned n_sections, + struct journal_completion *comp) { unsigned sector, n_sectors; sector = section * ic->journal_section_sectors; n_sectors = n_sections * ic->journal_section_sectors; - rw_journal_sectors(ic, op, op_flags, sector, n_sectors, comp); + rw_journal_sectors(ic, opf, sector, n_sectors, comp); } static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections) @@ -1129,7 +1131,7 @@ static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsi for (i = 0; i < commit_sections; i++) rw_section_mac(ic, commit_start + i, true); } - rw_journal(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, commit_start, + rw_journal(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, commit_start, commit_sections, &io_comp); } else { unsigned to_end; @@ -1141,7 +1143,8 @@ static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsi crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1); if (try_wait_for_completion(&crypt_comp_1.comp)) { - rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + rw_journal(ic, REQ_OP_WRITE | REQ_FUA, + commit_start, to_end, &io_comp); reinit_completion(&crypt_comp_1.comp); crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1); @@ -1152,17 +1155,17 @@ static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsi crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0); encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2); wait_for_completion_io(&crypt_comp_1.comp); - rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + rw_journal(ic, REQ_OP_WRITE | REQ_FUA, commit_start, to_end, &io_comp); wait_for_completion_io(&crypt_comp_2.comp); } } else { for (i = 0; i < to_end; i++) rw_section_mac(ic, commit_start + i, true); - rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + rw_journal(ic, REQ_OP_WRITE | REQ_FUA, commit_start, to_end, &io_comp); for (i = 0; i < commit_sections - to_end; i++) rw_section_mac(ic, i, true); } - rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp); + rw_journal(ic, REQ_OP_WRITE | REQ_FUA, 0, commit_sections - to_end, &io_comp); } wait_for_completion_io(&io_comp.comp); @@ -1188,8 +1191,7 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsig pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); - io_req.bi_op = REQ_OP_WRITE; - io_req.bi_op_flags = 0; + io_req.bi_opf = REQ_OP_WRITE; io_req.mem.type = DM_IO_PAGE_LIST; io_req.mem.ptr.pl = &ic->journal[pl_index]; io_req.mem.offset = pl_offset; @@ -1516,8 +1518,7 @@ static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_dat if (!ic->meta_dev) flush_data = false; if (flush_data) { - fr.io_req.bi_op = REQ_OP_WRITE, - fr.io_req.bi_op_flags = REQ_PREFLUSH | REQ_SYNC, + fr.io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC, fr.io_req.mem.type = DM_IO_KMEM, fr.io_req.mem.ptr.addr = NULL, fr.io_req.notify.fn = flush_notify, @@ -2626,7 +2627,7 @@ static void recalc_write_super(struct dm_integrity_c *ic) if (dm_integrity_failed(ic)) return; - r = sync_rw_sb(ic, REQ_OP_WRITE, 0); + r = sync_rw_sb(ic, REQ_OP_WRITE); if (unlikely(r)) dm_integrity_io_error(ic, "writing superblock", r); } @@ -2706,8 +2707,7 @@ next_chunk: if (unlikely(dm_integrity_failed(ic))) goto err; - io_req.bi_op = REQ_OP_READ; - io_req.bi_op_flags = 0; + io_req.bi_opf = REQ_OP_READ; io_req.mem.type = DM_IO_VMA; io_req.mem.ptr.addr = ic->recalc_buffer; io_req.notify.fn = NULL; @@ -2800,7 +2800,7 @@ static void bitmap_block_work(struct work_struct *w) if (bio_list_empty(&waiting)) return; - rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, + rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL); @@ -2846,7 +2846,7 @@ static void bitmap_flush_work(struct work_struct *work) block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR); block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR); - rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); spin_lock_irq(&ic->endio_wait.lock); @@ -2918,7 +2918,7 @@ static void replay_journal(struct dm_integrity_c *ic) if (!ic->just_formatted) { DEBUG_print("reading journal\n"); - rw_journal(ic, REQ_OP_READ, 0, 0, ic->journal_sections, NULL); + rw_journal(ic, REQ_OP_READ, 0, ic->journal_sections, NULL); if (ic->journal_io) DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal"); if (ic->journal_io) { @@ -3113,7 +3113,7 @@ static void dm_integrity_postsuspend(struct dm_target *ti) /* set to 0 to test bitmap replay code */ init_journal(ic, 0, ic->journal_sections, 0); ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); - r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA); if (unlikely(r)) dm_integrity_io_error(ic, "writing superblock", r); #endif @@ -3136,23 +3136,23 @@ static void dm_integrity_resume(struct dm_target *ti) if (ic->provided_data_sectors > old_provided_data_sectors && ic->mode == 'B' && ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) { - rw_journal_sectors(ic, REQ_OP_READ, 0, 0, + rw_journal_sectors(ic, REQ_OP_READ, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); block_bitmap_op(ic, ic->journal, old_provided_data_sectors, ic->provided_data_sectors - old_provided_data_sectors, BITMAP_OP_SET); - rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); } ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); - r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA); if (unlikely(r)) dm_integrity_io_error(ic, "writing superblock", r); } if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) { DEBUG_print("resume dirty_bitmap\n"); - rw_journal_sectors(ic, REQ_OP_READ, 0, 0, + rw_journal_sectors(ic, REQ_OP_READ, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); if (ic->mode == 'B') { if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit && @@ -3171,7 +3171,7 @@ static void dm_integrity_resume(struct dm_target *ti) block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET); - rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); ic->sb->recalc_sector = cpu_to_le64(0); @@ -3187,7 +3187,7 @@ static void dm_integrity_resume(struct dm_target *ti) replay_journal(ic); ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); } - r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA); if (unlikely(r)) dm_integrity_io_error(ic, "writing superblock", r); } else { @@ -3199,7 +3199,7 @@ static void dm_integrity_resume(struct dm_target *ti) if (ic->mode == 'B') { ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP); ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; - r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA); if (unlikely(r)) dm_integrity_io_error(ic, "writing superblock", r); @@ -3215,7 +3215,7 @@ static void dm_integrity_resume(struct dm_target *ti) block_bitmap_op(ic, ic->may_write_bitmap, le64_to_cpu(ic->sb->recalc_sector), ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET); } - rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); } } @@ -4256,7 +4256,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - r = sync_rw_sb(ic, REQ_OP_READ, 0); + r = sync_rw_sb(ic, REQ_OP_READ); if (r) { ti->error = "Error reading superblock"; goto bad; @@ -4439,7 +4439,7 @@ try_smaller_buffer: } ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev, - 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL); + 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0); if (IS_ERR(ic->bufio)) { r = PTR_ERR(ic->bufio); ti->error = "Cannot initialize dm-bufio"; @@ -4500,7 +4500,7 @@ try_smaller_buffer: ti->error = "Error initializing journal"; goto bad; } - r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA); if (r) { ti->error = "Error initializing superblock"; goto bad; diff --git a/drivers/md/dm-io-rewind.c b/drivers/md/dm-io-rewind.c new file mode 100644 index 000000000000..0db53ccb94ba --- /dev/null +++ b/drivers/md/dm-io-rewind.c @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2022 Red Hat, Inc. + */ + +#include <linux/bio.h> +#include <linux/blk-crypto.h> +#include <linux/blk-integrity.h> + +#include "dm-core.h" + +static inline bool dm_bvec_iter_rewind(const struct bio_vec *bv, + struct bvec_iter *iter, + unsigned int bytes) +{ + int idx; + + iter->bi_size += bytes; + if (bytes <= iter->bi_bvec_done) { + iter->bi_bvec_done -= bytes; + return true; + } + + bytes -= iter->bi_bvec_done; + idx = iter->bi_idx - 1; + + while (idx >= 0 && bytes && bytes > bv[idx].bv_len) { + bytes -= bv[idx].bv_len; + idx--; + } + + if (WARN_ONCE(idx < 0 && bytes, + "Attempted to rewind iter beyond bvec's boundaries\n")) { + iter->bi_size -= bytes; + iter->bi_bvec_done = 0; + iter->bi_idx = 0; + return false; + } + + iter->bi_idx = idx; + iter->bi_bvec_done = bv[idx].bv_len - bytes; + return true; +} + +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +/** + * dm_bio_integrity_rewind - Rewind integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes to rewind + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and rewind the + * integrity vector accordingly. + */ +static void dm_bio_integrity_rewind(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); + + bip->bip_iter.bi_sector -= bio_integrity_intervals(bi, bytes_done >> 9); + dm_bvec_iter_rewind(bip->bip_vec, &bip->bip_iter, bytes); +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +static inline void dm_bio_integrity_rewind(struct bio *bio, + unsigned int bytes_done) +{ + return; +} + +#endif + +#if defined(CONFIG_BLK_INLINE_ENCRYPTION) + +/* Decrements @dun by @dec, treating @dun as a multi-limb integer. */ +static void dm_bio_crypt_dun_decrement(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + unsigned int dec) +{ + int i; + + for (i = 0; dec && i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { + u64 prev = dun[i]; + + dun[i] -= dec; + if (dun[i] > prev) + dec = 1; + else + dec = 0; + } +} + +static void dm_bio_crypt_rewind(struct bio *bio, unsigned int bytes) +{ + struct bio_crypt_ctx *bc = bio->bi_crypt_context; + + dm_bio_crypt_dun_decrement(bc->bc_dun, + bytes >> bc->bc_key->data_unit_size_bits); +} + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline void dm_bio_crypt_rewind(struct bio *bio, unsigned int bytes) +{ + return; +} + +#endif + +static inline void dm_bio_rewind_iter(const struct bio *bio, + struct bvec_iter *iter, unsigned int bytes) +{ + iter->bi_sector -= bytes >> 9; + + /* No advance means no rewind */ + if (bio_no_advance_iter(bio)) + iter->bi_size += bytes; + else + dm_bvec_iter_rewind(bio->bi_io_vec, iter, bytes); +} + +/** + * dm_bio_rewind - update ->bi_iter of @bio by rewinding @bytes. + * @bio: bio to rewind + * @bytes: how many bytes to rewind + * + * WARNING: + * Caller must ensure that @bio has a fixed end sector, to allow + * rewinding from end of bio and restoring its original position. + * Caller is also responsibile for restoring bio's size. + */ +static void dm_bio_rewind(struct bio *bio, unsigned bytes) +{ + if (bio_integrity(bio)) + dm_bio_integrity_rewind(bio, bytes); + + if (bio_has_crypt_ctx(bio)) + dm_bio_crypt_rewind(bio, bytes); + + dm_bio_rewind_iter(bio, &bio->bi_iter, bytes); +} + +void dm_io_rewind(struct dm_io *io, struct bio_set *bs) +{ + struct bio *orig = io->orig_bio; + struct bio *new_orig = bio_alloc_clone(orig->bi_bdev, orig, + GFP_NOIO, bs); + /* + * dm_bio_rewind can restore to previous position since the + * end sector is fixed for original bio, but we still need + * to restore bio's size manually (using io->sectors). + */ + dm_bio_rewind(new_orig, ((io->sector_offset << 9) - + orig->bi_iter.bi_size)); + bio_trim(new_orig, 0, io->sectors); + + bio_chain(new_orig, orig); + /* + * __bi_remaining was increased (by dm_split_and_process_bio), + * so must drop the one added in bio_chain. + */ + atomic_dec(&orig->__bi_remaining); + io->orig_bio = new_orig; +} diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index e4b95eaeec8c..783564533459 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -293,7 +293,7 @@ static void km_dp_init(struct dpages *dp, void *data) /*----------------------------------------------------------------- * IO routines that accept a list of pages. *---------------------------------------------------------------*/ -static void do_region(int op, int op_flags, unsigned region, +static void do_region(const blk_opf_t opf, unsigned region, struct dm_io_region *where, struct dpages *dp, struct io *io) { @@ -306,6 +306,7 @@ static void do_region(int op, int op_flags, unsigned region, struct request_queue *q = bdev_get_queue(where->bdev); sector_t num_sectors; unsigned int special_cmd_max_sectors; + const enum req_op op = opf & REQ_OP_MASK; /* * Reject unsupported discard and write same requests. @@ -339,8 +340,8 @@ static void do_region(int op, int op_flags, unsigned region, (PAGE_SIZE >> SECTOR_SHIFT))); } - bio = bio_alloc_bioset(where->bdev, num_bvecs, op | op_flags, - GFP_NOIO, &io->client->bios); + bio = bio_alloc_bioset(where->bdev, num_bvecs, opf, GFP_NOIO, + &io->client->bios); bio->bi_iter.bi_sector = where->sector + (where->count - remaining); bio->bi_end_io = endio; store_io_and_region_in_bio(bio, io, region); @@ -368,7 +369,7 @@ static void do_region(int op, int op_flags, unsigned region, } while (remaining); } -static void dispatch_io(int op, int op_flags, unsigned int num_regions, +static void dispatch_io(blk_opf_t opf, unsigned int num_regions, struct dm_io_region *where, struct dpages *dp, struct io *io, int sync) { @@ -378,7 +379,7 @@ static void dispatch_io(int op, int op_flags, unsigned int num_regions, BUG_ON(num_regions > DM_IO_MAX_REGIONS); if (sync) - op_flags |= REQ_SYNC; + opf |= REQ_SYNC; /* * For multiple regions we need to be careful to rewind @@ -386,8 +387,8 @@ static void dispatch_io(int op, int op_flags, unsigned int num_regions, */ for (i = 0; i < num_regions; i++) { *dp = old_pages; - if (where[i].count || (op_flags & REQ_PREFLUSH)) - do_region(op, op_flags, i, where + i, dp, io); + if (where[i].count || (opf & REQ_PREFLUSH)) + do_region(opf, i, where + i, dp, io); } /* @@ -411,13 +412,13 @@ static void sync_io_complete(unsigned long error, void *context) } static int sync_io(struct dm_io_client *client, unsigned int num_regions, - struct dm_io_region *where, int op, int op_flags, - struct dpages *dp, unsigned long *error_bits) + struct dm_io_region *where, blk_opf_t opf, struct dpages *dp, + unsigned long *error_bits) { struct io *io; struct sync_io sio; - if (num_regions > 1 && !op_is_write(op)) { + if (num_regions > 1 && !op_is_write(opf)) { WARN_ON(1); return -EIO; } @@ -434,7 +435,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, io->vma_invalidate_address = dp->vma_invalidate_address; io->vma_invalidate_size = dp->vma_invalidate_size; - dispatch_io(op, op_flags, num_regions, where, dp, io, 1); + dispatch_io(opf, num_regions, where, dp, io, 1); wait_for_completion_io(&sio.wait); @@ -445,12 +446,12 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, } static int async_io(struct dm_io_client *client, unsigned int num_regions, - struct dm_io_region *where, int op, int op_flags, + struct dm_io_region *where, blk_opf_t opf, struct dpages *dp, io_notify_fn fn, void *context) { struct io *io; - if (num_regions > 1 && !op_is_write(op)) { + if (num_regions > 1 && !op_is_write(opf)) { WARN_ON(1); fn(1, context); return -EIO; @@ -466,7 +467,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io->vma_invalidate_address = dp->vma_invalidate_address; io->vma_invalidate_size = dp->vma_invalidate_size; - dispatch_io(op, op_flags, num_regions, where, dp, io, 0); + dispatch_io(opf, num_regions, where, dp, io, 0); return 0; } @@ -489,7 +490,7 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp, case DM_IO_VMA: flush_kernel_vmap_range(io_req->mem.ptr.vma, size); - if (io_req->bi_op == REQ_OP_READ) { + if ((io_req->bi_opf & REQ_OP_MASK) == REQ_OP_READ) { dp->vma_invalidate_address = io_req->mem.ptr.vma; dp->vma_invalidate_size = size; } @@ -519,11 +520,10 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions, if (!io_req->notify.fn) return sync_io(io_req->client, num_regions, where, - io_req->bi_op, io_req->bi_op_flags, &dp, - sync_error_bits); + io_req->bi_opf, &dp, sync_error_bits); - return async_io(io_req->client, num_regions, where, io_req->bi_op, - io_req->bi_op_flags, &dp, io_req->notify.fn, + return async_io(io_req->client, num_regions, where, + io_req->bi_opf, &dp, io_req->notify.fn, io_req->notify.context); } EXPORT_SYMBOL(dm_io); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 87310fceb0d8..98976aaa9db9 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -832,7 +832,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { if (get_disk_ro(disk)) param->flags |= DM_READONLY_FLAG; - param->target_count = dm_table_get_num_targets(table); + param->target_count = table->num_targets; } param->flags |= DM_ACTIVE_PRESENT_FLAG; @@ -845,7 +845,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) if (table) { if (!(dm_table_get_mode(table) & FMODE_WRITE)) param->flags |= DM_READONLY_FLAG; - param->target_count = dm_table_get_num_targets(table); + param->target_count = table->num_targets; } dm_put_live_table(md, srcu_idx); } @@ -1248,7 +1248,7 @@ static void retrieve_status(struct dm_table *table, type = STATUSTYPE_INFO; /* Get all the target info */ - num_targets = dm_table_get_num_targets(table); + num_targets = table->num_targets; for (i = 0; i < num_targets; i++) { struct dm_target *ti = dm_table_get_target(table, i); size_t l; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 37b03ab7e5c9..4d3bbbea2e9a 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -219,7 +219,7 @@ static struct page_list *alloc_pl(gfp_t gfp) if (!pl) return NULL; - pl->page = alloc_page(gfp); + pl->page = alloc_page(gfp | __GFP_HIGHMEM); if (!pl->page) { kfree(pl); return NULL; @@ -350,9 +350,9 @@ struct kcopyd_job { unsigned long write_err; /* - * Either READ or WRITE + * REQ_OP_READ, REQ_OP_WRITE or REQ_OP_WRITE_ZEROES. */ - int rw; + enum req_op op; struct dm_io_region source; /* @@ -418,7 +418,8 @@ static struct kcopyd_job *pop_io_job(struct list_head *jobs, * constraint and sequential writes that are at the right position. */ list_for_each_entry(job, jobs, list) { - if (job->rw == READ || !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) { + if (job->op == REQ_OP_READ || + !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) { list_del(&job->list); return job; } @@ -518,7 +519,7 @@ static void complete_io(unsigned long error, void *context) io_job_finish(kc->throttle); if (error) { - if (op_is_write(job->rw)) + if (op_is_write(job->op)) job->write_err |= error; else job->read_err = 1; @@ -530,11 +531,11 @@ static void complete_io(unsigned long error, void *context) } } - if (op_is_write(job->rw)) + if (op_is_write(job->op)) push(&kc->complete_jobs, job); else { - job->rw = WRITE; + job->op = REQ_OP_WRITE; push(&kc->io_jobs, job); } @@ -549,8 +550,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_op = job->rw, - .bi_op_flags = 0, + .bi_opf = job->op, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = 0, @@ -571,7 +571,7 @@ static int run_io_job(struct kcopyd_job *job) io_job_start(job->kc->throttle); - if (job->rw == READ) + if (job->op == REQ_OP_READ) r = dm_io(&io_req, 1, &job->source, NULL); else r = dm_io(&io_req, job->num_dests, job->dests, NULL); @@ -614,7 +614,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc, if (r < 0) { /* error this rogue job */ - if (op_is_write(job->rw)) + if (op_is_write(job->op)) job->write_err = (unsigned long) -1L; else job->read_err = 1; @@ -817,7 +817,7 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, if (from) { job->source = *from; job->pages = NULL; - job->rw = READ; + job->op = REQ_OP_READ; } else { memset(&job->source, 0, sizeof job->source); job->source.count = job->dests[0].count; @@ -826,10 +826,10 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, /* * Use WRITE ZEROES to optimize zeroing if all dests support it. */ - job->rw = REQ_OP_WRITE_ZEROES; + job->op = REQ_OP_WRITE_ZEROES; for (i = 0; i < job->num_dests; i++) if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) { - job->rw = WRITE; + job->op = REQ_OP_WRITE; break; } } diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 0c6620e7b7bf..cf10fa667797 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -291,10 +291,9 @@ static void header_from_disk(struct log_header_core *core, struct log_header_dis core->nr_regions = le64_to_cpu(disk->nr_regions); } -static int rw_header(struct log_c *lc, int op) +static int rw_header(struct log_c *lc, enum req_op op) { - lc->io_req.bi_op = op; - lc->io_req.bi_op_flags = 0; + lc->io_req.bi_opf = op; return dm_io(&lc->io_req, 1, &lc->header_location, NULL); } @@ -307,8 +306,7 @@ static int flush_header(struct log_c *lc) .count = 0, }; - lc->io_req.bi_op = REQ_OP_WRITE; - lc->io_req.bi_op_flags = REQ_PREFLUSH; + lc->io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; return dm_io(&lc->io_req, 1, &null_location, NULL); } diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 80c9f7134e9b..c640be453313 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1369,7 +1369,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, } rs->md.bitmap_info.daemon_sleep = value; } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET))) { - /* Userspace passes new data_offset after having extended the the data image LV */ + /* Userspace passes new data_offset after having extended the data image LV */ if (test_and_set_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) { rs->ti->error = "Only one data_offset argument pair allowed"; return -EINVAL; @@ -2038,7 +2038,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload) rdev->sb_loaded = 0; - if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) { + if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) { DMERR("Failed to read superblock of device at position %d", rdev->raid_disk); md_error(rdev->mddev, rdev); @@ -3097,6 +3097,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; ti->num_flush_bios = 1; + ti->needs_bio_set_dev = true; /* Restore any requested new layout for conversion decision */ rs_config_restore(rs, &rs_layout); @@ -3509,7 +3510,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, { struct raid_set *rs = ti->private; struct mddev *mddev = &rs->md; - struct r5conf *conf = mddev->private; + struct r5conf *conf = rs_is_raid456(rs) ? mddev->private : NULL; int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0; unsigned long recovery; unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ @@ -3727,6 +3728,7 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) @@ -3819,7 +3821,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices)); - for (i = 0; i < mddev->raid_disks; i++) { + for (i = 0; i < rs->raid_disks; i++) { r = &rs->dev[i].rdev; /* HM FIXME: enhance journal device recovery processing */ if (test_bit(Journal, &r->flags)) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 8811d484fdd1..06a38dc32025 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -260,8 +260,7 @@ static int mirror_flush(struct dm_target *ti) struct dm_io_region io[MAX_NR_MIRRORS]; struct mirror *m; struct dm_io_request io_req = { - .bi_op = REQ_OP_WRITE, - .bi_op_flags = REQ_PREFLUSH | REQ_SYNC, + .bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC, .mem.type = DM_IO_KMEM, .mem.ptr.addr = NULL, .client = ms->io_client, @@ -535,8 +534,7 @@ static void read_async_bio(struct mirror *m, struct bio *bio) { struct dm_io_region io; struct dm_io_request io_req = { - .bi_op = REQ_OP_READ, - .bi_op_flags = 0, + .bi_opf = REQ_OP_READ, .mem.type = DM_IO_BIO, .mem.ptr.bio = bio, .notify.fn = read_callback, @@ -648,9 +646,9 @@ static void do_write(struct mirror_set *ms, struct bio *bio) unsigned int i; struct dm_io_region io[MAX_NR_MIRRORS], *dest = io; struct mirror *m; + blk_opf_t op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH); struct dm_io_request io_req = { - .bi_op = REQ_OP_WRITE, - .bi_op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH), + .bi_opf = REQ_OP_WRITE | op_flags, .mem.type = DM_IO_BIO, .mem.ptr.bio = bio, .notify.fn = write_callback, @@ -659,7 +657,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) }; if (bio_op(bio) == REQ_OP_DISCARD) { - io_req.bi_op = REQ_OP_DISCARD; + io_req.bi_opf = REQ_OP_DISCARD | op_flags; io_req.mem.type = DM_IO_KMEM; io_req.mem.ptr.addr = NULL; } diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index a83b98a8d2a9..4f49bbcce4f1 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -43,7 +43,6 @@ unsigned dm_get_reserved_rq_based_ios(void) return __dm_get_module_param(&reserved_rq_based_ios, RESERVED_REQUEST_BASED_IOS, DM_RESERVED_MAX_IOS); } -EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); static unsigned dm_get_blk_mq_nr_hw_queues(void) { diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 3bb5cff5d6fc..680cc05ec654 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -226,8 +226,8 @@ static void do_metadata(struct work_struct *work) /* * Read or write a chunk aligned and sized block of data from a device. */ -static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int op, - int op_flags, int metadata) +static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, blk_opf_t opf, + int metadata) { struct dm_io_region where = { .bdev = dm_snap_cow(ps->store->snap)->bdev, @@ -235,8 +235,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int op, .count = ps->store->chunk_size, }; struct dm_io_request io_req = { - .bi_op = op, - .bi_op_flags = op_flags, + .bi_opf = opf, .mem.type = DM_IO_VMA, .mem.ptr.vma = area, .client = ps->io_client, @@ -282,11 +281,11 @@ static void skip_metadata(struct pstore *ps) * Read or write a metadata area. Remembering to skip the first * chunk which holds the header. */ -static int area_io(struct pstore *ps, int op, int op_flags) +static int area_io(struct pstore *ps, blk_opf_t opf) { chunk_t chunk = area_location(ps, ps->current_area); - return chunk_io(ps, ps->area, chunk, op, op_flags, 0); + return chunk_io(ps, ps->area, chunk, opf, 0); } static void zero_memory_area(struct pstore *ps) @@ -297,7 +296,7 @@ static void zero_memory_area(struct pstore *ps) static int zero_disk_area(struct pstore *ps, chunk_t area) { return chunk_io(ps, ps->zero_area, area_location(ps, area), - REQ_OP_WRITE, 0, 0); + REQ_OP_WRITE, 0); } static int read_header(struct pstore *ps, int *new_snapshot) @@ -329,7 +328,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) if (r) return r; - r = chunk_io(ps, ps->header_area, 0, REQ_OP_READ, 0, 1); + r = chunk_io(ps, ps->header_area, 0, REQ_OP_READ, 1); if (r) goto bad; @@ -390,7 +389,7 @@ static int write_header(struct pstore *ps) dh->version = cpu_to_le32(ps->version); dh->chunk_size = cpu_to_le32(ps->store->chunk_size); - return chunk_io(ps, ps->header_area, 0, REQ_OP_WRITE, 0, 1); + return chunk_io(ps, ps->header_area, 0, REQ_OP_WRITE, 1); } /* @@ -494,7 +493,7 @@ static int read_exceptions(struct pstore *ps, client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev, ps->store->chunk_size << SECTOR_SHIFT, - 1, 0, NULL, NULL); + 1, 0, NULL, NULL, 0); if (IS_ERR(client)) return PTR_ERR(client); @@ -734,8 +733,8 @@ static void persistent_commit_exception(struct dm_exception_store *store, /* * Commit exceptions to disk. */ - if (ps->valid && area_io(ps, REQ_OP_WRITE, - REQ_PREFLUSH | REQ_FUA | REQ_SYNC)) + if (ps->valid && area_io(ps, REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | + REQ_SYNC)) ps->valid = 0; /* @@ -775,7 +774,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store, return 0; ps->current_area--; - r = area_io(ps, REQ_OP_READ, 0); + r = area_io(ps, REQ_OP_READ); if (r < 0) return r; ps->current_committed = ps->exceptions_per_area; @@ -812,7 +811,7 @@ static int persistent_commit_merge(struct dm_exception_store *store, for (i = 0; i < nr_merged; i++) clear_exception(ps, ps->current_committed - 1 - i); - r = area_io(ps, REQ_OP_WRITE, REQ_PREFLUSH | REQ_FUA); + r = area_io(ps, REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); if (r < 0) return r; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 0d336b5ec571..d1c2f84d27e3 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2026,7 +2026,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) /* * Write to snapshot - higher level takes care of RW/RO * flags so we should only get this if we are - * writeable. + * writable. */ if (bio_data_dir(bio) == WRITE) { pe = __lookup_pending_exception(s, chunk); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index bd539afbfe88..332f96b58252 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -6,6 +6,7 @@ */ #include "dm-core.h" +#include "dm-rq.h" #include <linux/module.h> #include <linux/vmalloc.h> @@ -174,8 +175,6 @@ static void dm_table_destroy_crypto_profile(struct dm_table *t); void dm_table_destroy(struct dm_table *t) { - unsigned int i; - if (!t) return; @@ -184,13 +183,13 @@ void dm_table_destroy(struct dm_table *t) kvfree(t->index[t->depth - 2]); /* free the targets */ - for (i = 0; i < t->num_targets; i++) { - struct dm_target *tgt = t->targets + i; + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); - if (tgt->type->dtr) - tgt->type->dtr(tgt); + if (ti->type->dtr) + ti->type->dtr(ti); - dm_put_target_type(tgt->type); + dm_put_target_type(ti->type); } kvfree(t->highs); @@ -450,14 +449,14 @@ EXPORT_SYMBOL(dm_put_device); /* * Checks to see if the target joins onto the end of the table. */ -static int adjoin(struct dm_table *table, struct dm_target *ti) +static int adjoin(struct dm_table *t, struct dm_target *ti) { struct dm_target *prev; - if (!table->num_targets) + if (!t->num_targets) return !ti->begin; - prev = &table->targets[table->num_targets - 1]; + prev = &t->targets[t->num_targets - 1]; return (ti->begin == (prev->begin + prev->len)); } @@ -564,8 +563,8 @@ int dm_split_args(int *argc, char ***argvp, char *input) * two or more targets, the size of each piece it gets split into must * be compatible with the logical_block_size of the target processing it. */ -static int validate_hardware_logical_block_alignment(struct dm_table *table, - struct queue_limits *limits) +static int validate_hardware_logical_block_alignment(struct dm_table *t, + struct queue_limits *limits) { /* * This function uses arithmetic modulo the logical_block_size @@ -587,13 +586,13 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, struct dm_target *ti; struct queue_limits ti_limits; - unsigned i; + unsigned int i; /* * Check each entry in the table in turn. */ - for (i = 0; i < dm_table_get_num_targets(table); i++) { - ti = dm_table_get_target(table, i); + for (i = 0; i < t->num_targets; i++) { + ti = dm_table_get_target(t, i); blk_set_stacking_limits(&ti_limits); @@ -621,7 +620,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, if (remaining) { DMWARN("%s: table line %u (start sect %llu len %llu) " "not aligned to h/w logical block size %u", - dm_device_name(table->md), i, + dm_device_name(t->md), i, (unsigned long long) ti->begin, (unsigned long long) ti->len, limits->logical_block_size); @@ -636,7 +635,7 @@ int dm_table_add_target(struct dm_table *t, const char *type, { int r = -EINVAL, argc; char **argv; - struct dm_target *tgt; + struct dm_target *ti; if (t->singleton) { DMERR("%s: target type %s must appear alone in table", @@ -646,87 +645,87 @@ int dm_table_add_target(struct dm_table *t, const char *type, BUG_ON(t->num_targets >= t->num_allocated); - tgt = t->targets + t->num_targets; - memset(tgt, 0, sizeof(*tgt)); + ti = t->targets + t->num_targets; + memset(ti, 0, sizeof(*ti)); if (!len) { DMERR("%s: zero-length target", dm_device_name(t->md)); return -EINVAL; } - tgt->type = dm_get_target_type(type); - if (!tgt->type) { + ti->type = dm_get_target_type(type); + if (!ti->type) { DMERR("%s: %s: unknown target type", dm_device_name(t->md), type); return -EINVAL; } - if (dm_target_needs_singleton(tgt->type)) { + if (dm_target_needs_singleton(ti->type)) { if (t->num_targets) { - tgt->error = "singleton target type must appear alone in table"; + ti->error = "singleton target type must appear alone in table"; goto bad; } t->singleton = true; } - if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) { - tgt->error = "target type may not be included in a read-only table"; + if (dm_target_always_writeable(ti->type) && !(t->mode & FMODE_WRITE)) { + ti->error = "target type may not be included in a read-only table"; goto bad; } if (t->immutable_target_type) { - if (t->immutable_target_type != tgt->type) { - tgt->error = "immutable target type cannot be mixed with other target types"; + if (t->immutable_target_type != ti->type) { + ti->error = "immutable target type cannot be mixed with other target types"; goto bad; } - } else if (dm_target_is_immutable(tgt->type)) { + } else if (dm_target_is_immutable(ti->type)) { if (t->num_targets) { - tgt->error = "immutable target type cannot be mixed with other target types"; + ti->error = "immutable target type cannot be mixed with other target types"; goto bad; } - t->immutable_target_type = tgt->type; + t->immutable_target_type = ti->type; } - if (dm_target_has_integrity(tgt->type)) + if (dm_target_has_integrity(ti->type)) t->integrity_added = 1; - tgt->table = t; - tgt->begin = start; - tgt->len = len; - tgt->error = "Unknown error"; + ti->table = t; + ti->begin = start; + ti->len = len; + ti->error = "Unknown error"; /* * Does this target adjoin the previous one ? */ - if (!adjoin(t, tgt)) { - tgt->error = "Gap in table"; + if (!adjoin(t, ti)) { + ti->error = "Gap in table"; goto bad; } r = dm_split_args(&argc, &argv, params); if (r) { - tgt->error = "couldn't split parameters"; + ti->error = "couldn't split parameters"; goto bad; } - r = tgt->type->ctr(tgt, argc, argv); + r = ti->type->ctr(ti, argc, argv); kfree(argv); if (r) goto bad; - t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; + t->highs[t->num_targets++] = ti->begin + ti->len - 1; - if (!tgt->num_discard_bios && tgt->discards_supported) + if (!ti->num_discard_bios && ti->discards_supported) DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.", dm_device_name(t->md), type); - if (tgt->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key)) + if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key)) static_branch_enable(&swap_bios_enabled); return 0; bad: - DMERR("%s: %s: %s (%pe)", dm_device_name(t->md), type, tgt->error, ERR_PTR(r)); - dm_put_target_type(tgt->type); + DMERR("%s: %s: %s (%pe)", dm_device_name(t->md), type, ti->error, ERR_PTR(r)); + dm_put_target_type(ti->type); return r; } @@ -825,14 +824,11 @@ static int device_not_dax_synchronous_capable(struct dm_target *ti, struct dm_de } static bool dm_table_supports_dax(struct dm_table *t, - iterate_devices_callout_fn iterate_fn) + iterate_devices_callout_fn iterate_fn) { - struct dm_target *ti; - unsigned i; - /* Ensure that all targets support DAX. */ - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->type->direct_access) return false; @@ -860,9 +856,8 @@ static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev, static int dm_table_determine_type(struct dm_table *t) { - unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; - struct dm_target *tgt; + struct dm_target *ti; struct list_head *devices = dm_table_get_devices(t); enum dm_queue_mode live_md_type = dm_get_md_type(t->md); @@ -876,11 +871,11 @@ static int dm_table_determine_type(struct dm_table *t) goto verify_rq_based; } - for (i = 0; i < t->num_targets; i++) { - tgt = t->targets + i; - if (dm_target_hybrid(tgt)) + for (unsigned int i = 0; i < t->num_targets; i++) { + ti = dm_table_get_target(t, i); + if (dm_target_hybrid(ti)) hybrid = 1; - else if (dm_target_request_based(tgt)) + else if (dm_target_request_based(ti)) request_based = 1; else bio_based = 1; @@ -942,18 +937,18 @@ verify_rq_based: return 0; } - tgt = dm_table_get_immutable_target(t); - if (!tgt) { + ti = dm_table_get_immutable_target(t); + if (!ti) { DMERR("table load rejected: immutable target is required"); return -EINVAL; - } else if (tgt->max_io_len) { + } else if (ti->max_io_len) { DMERR("table load rejected: immutable target that splits IO is not supported"); return -EINVAL; } /* Non-request-stackable devices can't be used for request-based dm */ - if (!tgt->type->iterate_devices || - !tgt->type->iterate_devices(tgt, device_is_rq_stackable, NULL)) { + if (!ti->type->iterate_devices || + !ti->type->iterate_devices(ti, device_is_rq_stackable, NULL)) { DMERR("table load rejected: including non-request-stackable devices"); return -EINVAL; } @@ -983,11 +978,9 @@ struct dm_target *dm_table_get_immutable_target(struct dm_table *t) struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) { - struct dm_target *ti; - unsigned i; + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); if (dm_target_is_wildcard(ti->type)) return ti; } @@ -1010,32 +1003,56 @@ static bool dm_table_supports_poll(struct dm_table *t); static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { enum dm_queue_mode type = dm_table_get_type(t); - unsigned per_io_data_size = 0; - unsigned min_pool_size = 0; - struct dm_target *ti; - unsigned i; - bool poll_supported = false; + unsigned int per_io_data_size = 0, front_pad, io_front_pad; + unsigned int min_pool_size = 0, pool_size; + struct dm_md_mempools *pools; if (unlikely(type == DM_TYPE_NONE)) { DMWARN("no table type is set, can't allocate mempools"); return -EINVAL; } - if (__table_type_bio_based(type)) { - for (i = 0; i < t->num_targets; i++) { - ti = t->targets + i; - per_io_data_size = max(per_io_data_size, ti->per_io_data_size); - min_pool_size = max(min_pool_size, ti->num_flush_bios); - } - poll_supported = dm_table_supports_poll(t); + pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); + if (!pools) + return -ENOMEM; + + if (type == DM_TYPE_REQUEST_BASED) { + pool_size = dm_get_reserved_rq_based_ios(); + front_pad = offsetof(struct dm_rq_clone_bio_info, clone); + goto init_bs; } - t->mempools = dm_alloc_md_mempools(md, type, per_io_data_size, min_pool_size, - t->integrity_supported, poll_supported); - if (!t->mempools) - return -ENOMEM; + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + per_io_data_size = max(per_io_data_size, ti->per_io_data_size); + min_pool_size = max(min_pool_size, ti->num_flush_bios); + } + pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); + front_pad = roundup(per_io_data_size, + __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; + + io_front_pad = roundup(per_io_data_size, + __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; + if (bioset_init(&pools->io_bs, pool_size, io_front_pad, + dm_table_supports_poll(t) ? BIOSET_PERCPU_CACHE : 0)) + goto out_free_pools; + if (t->integrity_supported && + bioset_integrity_create(&pools->io_bs, pool_size)) + goto out_free_pools; +init_bs: + if (bioset_init(&pools->bs, pool_size, front_pad, 0)) + goto out_free_pools; + if (t->integrity_supported && + bioset_integrity_create(&pools->bs, pool_size)) + goto out_free_pools; + + t->mempools = pools; return 0; + +out_free_pools: + dm_free_md_mempools(pools); + return -ENOMEM; } static int setup_indexes(struct dm_table *t) @@ -1100,10 +1117,10 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t) struct list_head *devices = dm_table_get_devices(t); struct dm_dev_internal *dd = NULL; struct gendisk *prev_disk = NULL, *template_disk = NULL; - unsigned i; - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); + if (!dm_target_passes_integrity(ti->type)) goto no_integrity; } @@ -1217,18 +1234,19 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, struct dm_keyslot_evict_args args = { key }; struct dm_table *t; int srcu_idx; - int i; - struct dm_target *ti; t = dm_get_live_table(md, &srcu_idx); if (!t) return 0; - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + if (!ti->type->iterate_devices) continue; ti->type->iterate_devices(ti, dm_keyslot_evict_callback, &args); } + dm_put_live_table(md, srcu_idx); return args.err; } @@ -1277,7 +1295,6 @@ static int dm_table_construct_crypto_profile(struct dm_table *t) { struct dm_crypto_profile *dmcp; struct blk_crypto_profile *profile; - struct dm_target *ti; unsigned int i; bool empty_profile = true; @@ -1293,8 +1310,8 @@ static int dm_table_construct_crypto_profile(struct dm_table *t) memset(profile->modes_supported, 0xFF, sizeof(profile->modes_supported)); - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!dm_target_passes_crypto(ti->type)) { blk_crypto_intersect_capabilities(profile, NULL); @@ -1444,14 +1461,6 @@ inline sector_t dm_table_get_size(struct dm_table *t) } EXPORT_SYMBOL(dm_table_get_size); -struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) -{ - if (index >= t->num_targets) - return NULL; - - return t->targets + index; -} - /* * Search the btree for the correct target. * @@ -1512,11 +1521,8 @@ static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_any_dev_attr(struct dm_table *t, iterate_devices_callout_fn func, void *data) { - struct dm_target *ti; - unsigned int i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (ti->type->iterate_devices && ti->type->iterate_devices(ti, func, data)) @@ -1538,11 +1544,8 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_poll(struct dm_table *t) { - struct dm_target *ti; - unsigned i = 0; - - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->type->iterate_devices || ti->type->iterate_devices(ti, device_not_poll_capable, NULL)) @@ -1558,18 +1561,15 @@ static bool dm_table_supports_poll(struct dm_table *t) * Returns false if the result is unknown because a target doesn't * support iterate_devices. */ -bool dm_table_has_no_data_devices(struct dm_table *table) +bool dm_table_has_no_data_devices(struct dm_table *t) { - struct dm_target *ti; - unsigned i, num_devices; - - for (i = 0; i < dm_table_get_num_targets(table); i++) { - ti = dm_table_get_target(table, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + unsigned num_devices = 0; if (!ti->type->iterate_devices) return false; - num_devices = 0; ti->type->iterate_devices(ti, count_device, &num_devices); if (num_devices) return false; @@ -1597,11 +1597,8 @@ static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_zoned_model(struct dm_table *t, enum blk_zoned_model zoned_model) { - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (dm_target_supports_zoned_hm(ti->type)) { if (!ti->type->iterate_devices || @@ -1620,13 +1617,11 @@ static bool dm_table_supports_zoned_model(struct dm_table *t, static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); unsigned int *zone_sectors = data; - if (!blk_queue_is_zoned(q)) + if (!bdev_is_zoned(dev->bdev)) return 0; - - return blk_queue_zone_sectors(q) != *zone_sectors; + return bdev_zone_sectors(dev->bdev) != *zone_sectors; } /* @@ -1634,16 +1629,16 @@ static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev * * zone sectors, if the destination device is a zoned block device, it shall * have the specified zone_sectors. */ -static int validate_hardware_zoned_model(struct dm_table *table, +static int validate_hardware_zoned_model(struct dm_table *t, enum blk_zoned_model zoned_model, unsigned int zone_sectors) { if (zoned_model == BLK_ZONED_NONE) return 0; - if (!dm_table_supports_zoned_model(table, zoned_model)) { + if (!dm_table_supports_zoned_model(t, zoned_model)) { DMERR("%s: zoned model is not consistent across all devices", - dm_device_name(table->md)); + dm_device_name(t->md)); return -EINVAL; } @@ -1651,9 +1646,9 @@ static int validate_hardware_zoned_model(struct dm_table *table, if (!zone_sectors || !is_power_of_2(zone_sectors)) return -EINVAL; - if (dm_table_any_dev_attr(table, device_not_matches_zone_sectors, &zone_sectors)) { + if (dm_table_any_dev_attr(t, device_not_matches_zone_sectors, &zone_sectors)) { DMERR("%s: zone sectors is not consistent across all zoned devices", - dm_device_name(table->md)); + dm_device_name(t->md)); return -EINVAL; } @@ -1663,21 +1658,19 @@ static int validate_hardware_zoned_model(struct dm_table *table, /* * Establish the new table's queue_limits and validate them. */ -int dm_calculate_queue_limits(struct dm_table *table, +int dm_calculate_queue_limits(struct dm_table *t, struct queue_limits *limits) { - struct dm_target *ti; struct queue_limits ti_limits; - unsigned i; enum blk_zoned_model zoned_model = BLK_ZONED_NONE; unsigned int zone_sectors = 0; blk_set_stacking_limits(limits); - for (i = 0; i < dm_table_get_num_targets(table); i++) { - blk_set_stacking_limits(&ti_limits); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); - ti = dm_table_get_target(table, i); + blk_set_stacking_limits(&ti_limits); if (!ti->type->iterate_devices) goto combine_limits; @@ -1718,7 +1711,7 @@ combine_limits: DMWARN("%s: adding target device " "(start sect %llu len %llu) " "caused an alignment inconsistency", - dm_device_name(table->md), + dm_device_name(t->md), (unsigned long long) ti->begin, (unsigned long long) ti->len); } @@ -1738,10 +1731,10 @@ combine_limits: zoned_model = limits->zoned; zone_sectors = limits->chunk_sectors; } - if (validate_hardware_zoned_model(table, zoned_model, zone_sectors)) + if (validate_hardware_zoned_model(t, zoned_model, zone_sectors)) return -EINVAL; - return validate_hardware_logical_block_alignment(table, limits); + return validate_hardware_logical_block_alignment(t, limits); } /* @@ -1785,17 +1778,14 @@ static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) { - struct dm_target *ti; - unsigned i; - /* * Require at least one underlying device to support flushes. * t->devices includes internal dm devices such as mirror logs * so we need to use iterate_devices here, which targets * supporting flushes must provide. */ - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->num_flush_bios) continue; @@ -1849,11 +1839,8 @@ static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev * static bool dm_table_supports_write_zeroes(struct dm_table *t) { - struct dm_target *ti; - unsigned i = 0; - - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->num_write_zeroes_bios) return false; @@ -1876,11 +1863,8 @@ static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_nowait(struct dm_table *t) { - struct dm_target *ti; - unsigned i = 0; - - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!dm_target_supports_nowait(ti->type)) return false; @@ -1901,11 +1885,8 @@ static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_discards(struct dm_table *t) { - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->num_discard_bios) return false; @@ -1933,11 +1914,8 @@ static int device_not_secure_erase_capable(struct dm_target *ti, static bool dm_table_supports_secure_erase(struct dm_table *t) { - struct dm_target *ti; - unsigned int i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->num_secure_erase_bios) return false; @@ -2067,11 +2045,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, return 0; } -unsigned int dm_table_get_num_targets(struct dm_table *t) -{ - return t->num_targets; -} - struct list_head *dm_table_get_devices(struct dm_table *t) { return &t->devices; @@ -2091,12 +2064,11 @@ enum suspend_mode { static void suspend_targets(struct dm_table *t, enum suspend_mode mode) { - int i = t->num_targets; - struct dm_target *ti = t->targets; - lockdep_assert_held(&t->md->suspend_lock); - while (i--) { + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + switch (mode) { case PRESUSPEND: if (ti->type->presuspend) @@ -2111,7 +2083,6 @@ static void suspend_targets(struct dm_table *t, enum suspend_mode mode) ti->type->postsuspend(ti); break; } - ti++; } } @@ -2141,12 +2112,13 @@ void dm_table_postsuspend_targets(struct dm_table *t) int dm_table_resume_targets(struct dm_table *t) { - int i, r = 0; + unsigned int i; + int r = 0; lockdep_assert_held(&t->md->suspend_lock); for (i = 0; i < t->num_targets; i++) { - struct dm_target *ti = t->targets + i; + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->type->preresume) continue; @@ -2160,7 +2132,7 @@ int dm_table_resume_targets(struct dm_table *t) } for (i = 0; i < t->num_targets; i++) { - struct dm_target *ti = t->targets + i; + struct dm_target *ti = dm_table_get_target(t, i); if (ti->type->resume) ti->type->resume(ti); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 2db7030aba00..a27395c8621f 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -2045,10 +2045,13 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, dm_sm_threshold_fn fn, void *context) { - int r; + int r = -EINVAL; pmd_write_lock_in_core(pmd); - r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context); + if (!pmd->fail_io) { + r = dm_sm_register_threshold_callback(pmd->metadata_sm, + threshold, fn, context); + } pmd_write_unlock(pmd); return r; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 84c083f76673..e76c96c760a9 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3375,8 +3375,10 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) calc_metadata_threshold(pt), metadata_low_callback, pool); - if (r) + if (r) { + ti->error = "Error registering metadata threshold"; goto out_flags_changed; + } dm_pool_register_pre_commit_callback(pool->pmd, metadata_pre_commit_callback, pool); diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index cea2b3789736..23cffce56403 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -749,7 +749,7 @@ int verity_fec_ctr(struct dm_verity *v) f->bufio = dm_bufio_client_create(f->dev->bdev, f->io_size, - 1, 0, NULL, NULL); + 1, 0, NULL, NULL, 0); if (IS_ERR(f->bufio)) { ti->error = "Cannot initialize FEC bufio client"; return PTR_ERR(f->bufio); @@ -765,7 +765,7 @@ int verity_fec_ctr(struct dm_verity *v) f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, 1 << v->data_dev_block_bits, - 1, 0, NULL, NULL); + 1, 0, NULL, NULL, 0); if (IS_ERR(f->data_bufio)) { ti->error = "Cannot initialize FEC data bufio client"; return PTR_ERR(f->data_bufio); diff --git a/drivers/md/dm-verity-loadpin.c b/drivers/md/dm-verity-loadpin.c new file mode 100644 index 000000000000..387ec43aef72 --- /dev/null +++ b/drivers/md/dm-verity-loadpin.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/list.h> +#include <linux/kernel.h> +#include <linux/dm-verity-loadpin.h> + +#include "dm.h" +#include "dm-core.h" +#include "dm-verity.h" + +#define DM_MSG_PREFIX "verity-loadpin" + +LIST_HEAD(dm_verity_loadpin_trusted_root_digests); + +static bool is_trusted_verity_target(struct dm_target *ti) +{ + u8 *root_digest; + unsigned int digest_size; + struct dm_verity_loadpin_trusted_root_digest *trd; + bool trusted = false; + + if (!dm_is_verity_target(ti)) + return false; + + if (dm_verity_get_root_digest(ti, &root_digest, &digest_size)) + return false; + + list_for_each_entry(trd, &dm_verity_loadpin_trusted_root_digests, node) { + if ((trd->len == digest_size) && + !memcmp(trd->data, root_digest, digest_size)) { + trusted = true; + break; + } + } + + kfree(root_digest); + + return trusted; +} + +/* + * Determines whether the file system of a superblock is located on + * a verity device that is trusted by LoadPin. + */ +bool dm_verity_loadpin_is_bdev_trusted(struct block_device *bdev) +{ + struct mapped_device *md; + struct dm_table *table; + struct dm_target *ti; + int srcu_idx; + bool trusted = false; + + if (list_empty(&dm_verity_loadpin_trusted_root_digests)) + return false; + + md = dm_get_md(bdev->bd_dev); + if (!md) + return false; + + table = dm_get_live_table(md, &srcu_idx); + + if (table->num_targets != 1) + goto out; + + ti = dm_table_get_target(table, 0); + + if (is_trusted_verity_target(ti)) + trusted = true; + +out: + dm_put_live_table(md, srcu_idx); + dm_put(md); + + return trusted; +} diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index d6dbd47492a8..94b6cb599db4 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -19,6 +19,8 @@ #include <linux/module.h> #include <linux/reboot.h> #include <linux/scatterlist.h> +#include <linux/string.h> +#include <linux/jump_label.h> #define DM_MSG_PREFIX "verity" @@ -34,14 +36,17 @@ #define DM_VERITY_OPT_PANIC "panic_on_corruption" #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" #define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" +#define DM_VERITY_OPT_TASKLET_VERIFY "try_verify_in_tasklet" -#define DM_VERITY_OPTS_MAX (3 + DM_VERITY_OPTS_FEC + \ +#define DM_VERITY_OPTS_MAX (4 + DM_VERITY_OPTS_FEC + \ DM_VERITY_ROOT_HASH_VERIFICATION_OPTS) static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); +static DEFINE_STATIC_KEY_FALSE(use_tasklet_enabled); + struct dm_verity_prefetch_work { struct work_struct work; struct dm_verity *v; @@ -220,7 +225,7 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type, struct mapped_device *md = dm_table_get_md(v->ti->table); /* Corruption should be visible in device status in all modes */ - v->hash_failed = 1; + v->hash_failed = true; if (v->corrupted_errs >= DM_VERITY_MAX_CORRUPTED_ERRS) goto out; @@ -286,7 +291,19 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, verity_hash_at_level(v, block, level, &hash_block, &offset); - data = dm_bufio_read(v->bufio, hash_block, &buf); + if (static_branch_unlikely(&use_tasklet_enabled) && io->in_tasklet) { + data = dm_bufio_get(v->bufio, hash_block, &buf); + if (data == NULL) { + /* + * In tasklet and the hash was not in the bufio cache. + * Return early and resume execution from a work-queue + * to read the hash from disk. + */ + return -EAGAIN; + } + } else + data = dm_bufio_read(v->bufio, hash_block, &buf); + if (IS_ERR(data)) return PTR_ERR(data); @@ -307,6 +324,15 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, if (likely(memcmp(verity_io_real_digest(v, io), want_digest, v->digest_size) == 0)) aux->hash_verified = 1; + else if (static_branch_unlikely(&use_tasklet_enabled) && + io->in_tasklet) { + /* + * Error handling code (FEC included) cannot be run in a + * tasklet since it may sleep, so fallback to work-queue. + */ + r = -EAGAIN; + goto release_ret_r; + } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, hash_block, data, NULL) == 0) @@ -473,10 +499,24 @@ static int verity_verify_io(struct dm_verity_io *io) { bool is_zero; struct dm_verity *v = io->v; +#if defined(CONFIG_DM_VERITY_FEC) struct bvec_iter start; - unsigned b; +#endif + struct bvec_iter iter_copy; + struct bvec_iter *iter; struct crypto_wait wait; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + unsigned int b; + + if (static_branch_unlikely(&use_tasklet_enabled) && io->in_tasklet) { + /* + * Copy the iterator in case we need to restart + * verification in a work-queue. + */ + iter_copy = io->iter; + iter = &iter_copy; + } else + iter = &io->iter; for (b = 0; b < io->n_blocks; b++) { int r; @@ -485,7 +525,7 @@ static int verity_verify_io(struct dm_verity_io *io) if (v->validated_blocks && likely(test_bit(cur_block, v->validated_blocks))) { - verity_bv_skip_block(v, io, &io->iter); + verity_bv_skip_block(v, io, iter); continue; } @@ -500,7 +540,7 @@ static int verity_verify_io(struct dm_verity_io *io) * If we expect a zero block, don't validate, just * return zeros. */ - r = verity_for_bv_block(v, io, &io->iter, + r = verity_for_bv_block(v, io, iter, verity_bv_zero); if (unlikely(r < 0)) return r; @@ -512,8 +552,11 @@ static int verity_verify_io(struct dm_verity_io *io) if (unlikely(r < 0)) return r; - start = io->iter; - r = verity_for_io_block(v, io, &io->iter, &wait); +#if defined(CONFIG_DM_VERITY_FEC) + if (verity_fec_is_enabled(v)) + start = *iter; +#endif + r = verity_for_io_block(v, io, iter, &wait); if (unlikely(r < 0)) return r; @@ -527,11 +570,19 @@ static int verity_verify_io(struct dm_verity_io *io) if (v->validated_blocks) set_bit(cur_block, v->validated_blocks); continue; - } - else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, - cur_block, NULL, &start) == 0) + } else if (static_branch_unlikely(&use_tasklet_enabled) && + io->in_tasklet) { + /* + * Error handling code (FEC included) cannot be run in a + * tasklet since it may sleep, so fallback to work-queue. + */ + return -EAGAIN; +#if defined(CONFIG_DM_VERITY_FEC) + } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, + cur_block, NULL, &start) == 0) { continue; - else { +#endif + } else { if (bio->bi_status) { /* * Error correction failed; Just return error @@ -567,7 +618,8 @@ static void verity_finish_io(struct dm_verity_io *io, blk_status_t status) bio->bi_end_io = io->orig_bi_end_io; bio->bi_status = status; - verity_fec_finish_io(io); + if (!static_branch_unlikely(&use_tasklet_enabled) || !io->in_tasklet) + verity_fec_finish_io(io); bio_endio(bio); } @@ -576,9 +628,29 @@ static void verity_work(struct work_struct *w) { struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); + io->in_tasklet = false; + + verity_fec_init_io(io); verity_finish_io(io, errno_to_blk_status(verity_verify_io(io))); } +static void verity_tasklet(unsigned long data) +{ + struct dm_verity_io *io = (struct dm_verity_io *)data; + int err; + + io->in_tasklet = true; + err = verity_verify_io(io); + if (err == -EAGAIN) { + /* fallback to retrying with work-queue */ + INIT_WORK(&io->work, verity_work); + queue_work(io->v->verify_wq, &io->work); + return; + } + + verity_finish_io(io, errno_to_blk_status(err)); +} + static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; @@ -589,8 +661,13 @@ static void verity_end_io(struct bio *bio) return; } - INIT_WORK(&io->work, verity_work); - queue_work(io->v->verify_wq, &io->work); + if (static_branch_unlikely(&use_tasklet_enabled) && io->v->use_tasklet) { + tasklet_init(&io->tasklet, verity_tasklet, (unsigned long)io); + tasklet_schedule(&io->tasklet); + } else { + INIT_WORK(&io->work, verity_work); + queue_work(io->v->verify_wq, &io->work); + } } /* @@ -701,8 +778,6 @@ static int verity_map(struct dm_target *ti, struct bio *bio) bio->bi_private = io; io->iter = bio->bi_iter; - verity_fec_init_io(io); - verity_submit_prefetch(v, io); submit_bio_noacct(bio); @@ -752,6 +827,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, args++; if (v->validated_blocks) args++; + if (v->use_tasklet) + args++; if (v->signature_key_desc) args += DM_VERITY_ROOT_HASH_VERIFICATION_OPTS; if (!args) @@ -777,6 +854,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES); if (v->validated_blocks) DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE); + if (v->use_tasklet) + DMEMIT(" " DM_VERITY_OPT_TASKLET_VERIFY); sz = verity_fec_status_table(v, sz, result, maxlen); if (v->signature_key_desc) DMEMIT(" " DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY @@ -890,6 +969,9 @@ static void verity_dtr(struct dm_target *ti) kfree(v->signature_key_desc); + if (v->use_tasklet) + static_branch_dec(&use_tasklet_enabled); + kfree(v); } @@ -968,9 +1050,10 @@ static int verity_parse_verity_mode(struct dm_verity *v, const char *arg_name) } static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, - struct dm_verity_sig_opts *verify_args) + struct dm_verity_sig_opts *verify_args, + bool only_modifier_opts) { - int r; + int r = 0; unsigned argc; struct dm_target *ti = v->ti; const char *arg_name; @@ -991,6 +1074,8 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, argc--; if (verity_is_verity_mode(arg_name)) { + if (only_modifier_opts) + continue; r = verity_parse_verity_mode(v, arg_name); if (r) { ti->error = "Conflicting error handling parameters"; @@ -999,6 +1084,8 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, continue; } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) { + if (only_modifier_opts) + continue; r = verity_alloc_zero_digest(v); if (r) { ti->error = "Cannot allocate zero digest"; @@ -1007,17 +1094,29 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, continue; } else if (!strcasecmp(arg_name, DM_VERITY_OPT_AT_MOST_ONCE)) { + if (only_modifier_opts) + continue; r = verity_alloc_most_once(v); if (r) return r; continue; + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_TASKLET_VERIFY)) { + v->use_tasklet = true; + static_branch_inc(&use_tasklet_enabled); + continue; + } else if (verity_is_fec_opt_arg(arg_name)) { + if (only_modifier_opts) + continue; r = verity_fec_parse_opt_args(as, v, &argc, arg_name); if (r) return r; continue; + } else if (verity_verify_is_sig_opt_arg(arg_name)) { + if (only_modifier_opts) + continue; r = verity_verify_sig_parse_opt_args(as, v, verify_args, &argc, arg_name); @@ -1025,8 +1124,17 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, return r; continue; + } else if (only_modifier_opts) { + /* + * Ignore unrecognized opt, could easily be an extra + * argument to an option whose parsing was skipped. + * Normal parsing (@only_modifier_opts=false) will + * properly parse all options (and their extra args). + */ + continue; } + DMERR("Unrecognized verity feature request: %s", arg_name); ti->error = "Unrecognized verity feature request"; return -EINVAL; } while (argc && !r); @@ -1054,6 +1162,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) struct dm_verity_sig_opts verify_args = {0}; struct dm_arg_set as; unsigned int num; + unsigned int wq_flags; unsigned long long num_ll; int r; int i; @@ -1085,6 +1194,15 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + /* Parse optional parameters that modify primary args */ + if (argc > 10) { + as.argc = argc - 10; + as.argv = argv + 10; + r = verity_parse_opt_args(&as, v, &verify_args, true); + if (r < 0) + goto bad; + } + if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 || num > 1) { ti->error = "Invalid version"; @@ -1156,7 +1274,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - v->tfm = crypto_alloc_ahash(v->alg_name, 0, 0); + v->tfm = crypto_alloc_ahash(v->alg_name, 0, + v->use_tasklet ? CRYPTO_ALG_ASYNC : 0); if (IS_ERR(v->tfm)) { ti->error = "Cannot initialize hash function"; r = PTR_ERR(v->tfm); @@ -1218,8 +1337,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (argc) { as.argc = argc; as.argv = argv; - - r = verity_parse_opt_args(&as, v, &verify_args); + r = verity_parse_opt_args(&as, v, &verify_args, false); if (r < 0) goto bad; } @@ -1266,7 +1384,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) v->bufio = dm_bufio_client_create(v->hash_dev->bdev, 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux), - dm_bufio_alloc_callback, NULL); + dm_bufio_alloc_callback, NULL, + v->use_tasklet ? DM_BUFIO_CLIENT_NO_SLEEP : 0); if (IS_ERR(v->bufio)) { ti->error = "Cannot initialize dm-bufio"; r = PTR_ERR(v->bufio); @@ -1281,7 +1400,16 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } /* WQ_UNBOUND greatly improves performance when running on ramdisk */ - v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); + wq_flags = WQ_MEM_RECLAIM | WQ_UNBOUND; + if (v->use_tasklet) { + /* + * Allow verify_wq to preempt softirq since verification in + * tasklet will fall-back to using it for error handling + * (or if the bufio cache doesn't have required hashes). + */ + wq_flags |= WQ_HIGHPRI; + } + v->verify_wq = alloc_workqueue("kverityd", wq_flags, num_online_cpus()); if (!v->verify_wq) { ti->error = "Cannot allocate workqueue"; r = -ENOMEM; @@ -1310,10 +1438,40 @@ bad: return r; } +/* + * Check whether a DM target is a verity target. + */ +bool dm_is_verity_target(struct dm_target *ti) +{ + return ti->type->module == THIS_MODULE; +} + +/* + * Get the root digest of a verity target. + * + * Returns a copy of the root digest, the caller is responsible for + * freeing the memory of the digest. + */ +int dm_verity_get_root_digest(struct dm_target *ti, u8 **root_digest, unsigned int *digest_size) +{ + struct dm_verity *v = ti->private; + + if (!dm_is_verity_target(ti)) + return -EINVAL; + + *root_digest = kmemdup(v->root_digest, v->digest_size, GFP_KERNEL); + if (*root_digest == NULL) + return -ENOMEM; + + *digest_size = v->digest_size; + + return 0; +} + static struct target_type verity_target = { .name = "verity", .features = DM_TARGET_IMMUTABLE, - .version = {1, 8, 0}, + .version = {1, 9, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 4e769d13473a..45455de1b4bc 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -13,6 +13,7 @@ #include <linux/dm-bufio.h> #include <linux/device-mapper.h> +#include <linux/interrupt.h> #include <crypto/hash.h> #define DM_VERITY_MAX_LEVELS 63 @@ -51,9 +52,10 @@ struct dm_verity { unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ unsigned char levels; /* the number of tree levels */ unsigned char version; + bool hash_failed:1; /* set if hash of any block failed */ + bool use_tasklet:1; /* try to verify in tasklet before work-queue */ unsigned digest_size; /* digest size for the current hash algorithm */ unsigned int ahash_reqsize;/* the size of temporary space for crypto */ - int hash_failed; /* set to 1 if hash of any block failed */ enum verity_mode mode; /* mode for handling verification errors */ unsigned corrupted_errs;/* Number of errors for corrupted blocks */ @@ -76,10 +78,12 @@ struct dm_verity_io { sector_t block; unsigned n_blocks; + bool in_tasklet; struct bvec_iter iter; struct work_struct work; + struct tasklet_struct tasklet; /* * Three variably-size fields follow this struct: @@ -129,4 +133,8 @@ extern int verity_hash(struct dm_verity *v, struct ahash_request *req, extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, sector_t block, u8 *digest, bool *is_zero); +extern bool dm_is_verity_target(struct dm_target *ti); +extern int dm_verity_get_root_digest(struct dm_target *ti, u8 **root_digest, + unsigned int *digest_size); + #endif /* DM_VERITY_H */ diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index d74c5a7a0ab4..96a003eb7323 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -22,7 +22,7 @@ #define HIGH_WATERMARK 50 #define LOW_WATERMARK 45 -#define MAX_WRITEBACK_JOBS 0 +#define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16) #define ENDIO_LATENCY 16 #define WRITEBACK_LATENCY 64 #define AUTOCOMMIT_BLOCKS_SSD 65536 @@ -523,8 +523,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) region.sector += wc->start_sector; atomic_inc(&endio.count); - req.bi_op = REQ_OP_WRITE; - req.bi_op_flags = REQ_SYNC; + req.bi_opf = REQ_OP_WRITE | REQ_SYNC; req.mem.type = DM_IO_VMA; req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY; req.client = wc->dm_io; @@ -562,8 +561,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc) region.sector += wc->start_sector; - req.bi_op = REQ_OP_WRITE; - req.bi_op_flags = REQ_SYNC | REQ_FUA; + req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA; req.mem.type = DM_IO_VMA; req.mem.ptr.vma = (char *)wc->memory_map; req.client = wc->dm_io; @@ -592,8 +590,7 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) region.bdev = dev->bdev; region.sector = 0; region.count = 0; - req.bi_op = REQ_OP_WRITE; - req.bi_op_flags = REQ_PREFLUSH; + req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; req.mem.type = DM_IO_KMEM; req.mem.ptr.addr = NULL; req.client = wc->dm_io; @@ -981,8 +978,7 @@ static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors region.bdev = wc->ssd_dev->bdev; region.sector = wc->start_sector; region.count = n_sectors; - req.bi_op = REQ_OP_READ; - req.bi_op_flags = REQ_SYNC; + req.bi_opf = REQ_OP_READ | REQ_SYNC; req.mem.type = DM_IO_VMA; req.mem.ptr.vma = (char *)wc->memory_map; req.client = wc->dm_io; @@ -1329,8 +1325,8 @@ enum wc_map_op { WC_MAP_ERROR, }; -static enum wc_map_op writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio, - struct wc_entry *e) +static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio, + struct wc_entry *e) { if (e) { sector_t next_boundary = @@ -1338,8 +1334,6 @@ static enum wc_map_op writecache_map_remap_origin(struct dm_writecache *wc, stru if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) dm_accept_partial_bio(bio, next_boundary); } - - return WC_MAP_REMAP_ORIGIN; } static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio) @@ -1366,14 +1360,16 @@ read_next_block: map_op = WC_MAP_REMAP; } } else { - map_op = writecache_map_remap_origin(wc, bio, e); + writecache_map_remap_origin(wc, bio, e); + wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits; + map_op = WC_MAP_REMAP_ORIGIN; } return map_op; } -static enum wc_map_op writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio, - struct wc_entry *e, bool search_used) +static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio, + struct wc_entry *e, bool search_used) { unsigned bio_size = wc->block_size; sector_t start_cache_sec = cache_sector(wc, e); @@ -1413,14 +1409,15 @@ static enum wc_map_op writecache_bio_copy_ssd(struct dm_writecache *wc, struct b bio->bi_iter.bi_sector = start_cache_sec; dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT); + wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; + wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits; + if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { wc->uncommitted_blocks = 0; queue_work(wc->writeback_wq, &wc->flush_work); } else { writecache_schedule_autocommit(wc); } - - return WC_MAP_REMAP; } static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio) @@ -1430,9 +1427,10 @@ static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio do { bool found_entry = false; bool search_used = false; - wc->stats.writes++; - if (writecache_has_error(wc)) + if (writecache_has_error(wc)) { + wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; return WC_MAP_ERROR; + } e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); if (e) { if (!writecache_entry_is_committed(wc, e)) { @@ -1456,9 +1454,11 @@ static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio if (unlikely(!e)) { if (!WC_MODE_PMEM(wc) && !found_entry) { direct_write: - wc->stats.writes_around++; e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); - return writecache_map_remap_origin(wc, bio, e); + writecache_map_remap_origin(wc, bio, e); + wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits; + wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; + return WC_MAP_REMAP_ORIGIN; } wc->stats.writes_blocked_on_freelist++; writecache_wait_on_freelist(wc); @@ -1469,10 +1469,13 @@ direct_write: wc->uncommitted_blocks++; wc->stats.writes_allocate++; bio_copy: - if (WC_MODE_PMEM(wc)) + if (WC_MODE_PMEM(wc)) { bio_copy_block(wc, bio, memory_data(wc, e)); - else - return writecache_bio_copy_ssd(wc, bio, e, search_used); + wc->stats.writes++; + } else { + writecache_bio_copy_ssd(wc, bio, e, search_used); + return WC_MAP_REMAP; + } } while (bio->bi_iter.bi_size); if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks)) @@ -1507,7 +1510,7 @@ static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio) { - wc->stats.discards++; + wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits; if (writecache_has_error(wc)) return WC_MAP_ERROR; @@ -1591,7 +1594,8 @@ done: default: BUG(); - return -1; + wc_unlock(wc); + return DM_MAPIO_KILL; } } diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 3e7b1fe1580b..3dafc0e8b7a9 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -139,13 +139,11 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) void dm_cleanup_zoned_dev(struct mapped_device *md) { - struct request_queue *q = md->queue; - - if (q) { - kfree(q->conv_zones_bitmap); - q->conv_zones_bitmap = NULL; - kfree(q->seq_zones_wlock); - q->seq_zones_wlock = NULL; + if (md->disk) { + kfree(md->disk->conv_zones_bitmap); + md->disk->conv_zones_bitmap = NULL; + kfree(md->disk->seq_zones_wlock); + md->disk->seq_zones_wlock = NULL; } kvfree(md->zwp_offset); @@ -179,31 +177,31 @@ static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct mapped_device *md = data; - struct request_queue *q = md->queue; + struct gendisk *disk = md->disk; switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: - if (!q->conv_zones_bitmap) { - q->conv_zones_bitmap = - kcalloc(BITS_TO_LONGS(q->nr_zones), + if (!disk->conv_zones_bitmap) { + disk->conv_zones_bitmap = + kcalloc(BITS_TO_LONGS(disk->nr_zones), sizeof(unsigned long), GFP_NOIO); - if (!q->conv_zones_bitmap) + if (!disk->conv_zones_bitmap) return -ENOMEM; } - set_bit(idx, q->conv_zones_bitmap); + set_bit(idx, disk->conv_zones_bitmap); break; case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: - if (!q->seq_zones_wlock) { - q->seq_zones_wlock = - kcalloc(BITS_TO_LONGS(q->nr_zones), + if (!disk->seq_zones_wlock) { + disk->seq_zones_wlock = + kcalloc(BITS_TO_LONGS(disk->nr_zones), sizeof(unsigned long), GFP_NOIO); - if (!q->seq_zones_wlock) + if (!disk->seq_zones_wlock) return -ENOMEM; } if (!md->zwp_offset) { md->zwp_offset = - kvcalloc(q->nr_zones, sizeof(unsigned int), + kvcalloc(disk->nr_zones, sizeof(unsigned int), GFP_KERNEL); if (!md->zwp_offset) return -ENOMEM; @@ -228,7 +226,7 @@ static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, */ static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) { - struct request_queue *q = md->queue; + struct gendisk *disk = md->disk; unsigned int noio_flag; int ret; @@ -236,7 +234,7 @@ static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) * Check if something changed. If yes, cleanup the current resources * and reallocate everything. */ - if (!q->nr_zones || q->nr_zones != md->nr_zones) + if (!disk->nr_zones || disk->nr_zones != md->nr_zones) dm_cleanup_zoned_dev(md); if (md->nr_zones) return 0; @@ -246,17 +244,17 @@ static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) * operations in this context are done as if GFP_NOIO was specified. */ noio_flag = memalloc_noio_save(); - ret = dm_blk_do_report_zones(md, t, 0, q->nr_zones, + ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones, dm_zone_revalidate_cb, md); memalloc_noio_restore(noio_flag); if (ret < 0) goto err; - if (ret != q->nr_zones) { + if (ret != disk->nr_zones) { ret = -EIO; goto err; } - md->nr_zones = q->nr_zones; + md->nr_zones = disk->nr_zones; return 0; @@ -270,16 +268,13 @@ static int device_not_zone_append_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - return !blk_queue_is_zoned(bdev_get_queue(dev->bdev)); + return !bdev_is_zoned(dev->bdev); } static bool dm_table_supports_zone_append(struct dm_table *t) { - struct dm_target *ti; - unsigned int i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (ti->emulate_zone_append) return false; @@ -301,7 +296,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) * correct value to be exposed in sysfs queue/nr_zones. */ WARN_ON_ONCE(queue_is_mq(q)); - q->nr_zones = blkdev_nr_zones(md->disk); + md->disk->nr_zones = bdev_nr_zones(md->disk->part0); /* Check if zone append is natively supported */ if (dm_table_supports_zone_append(t)) { @@ -334,7 +329,7 @@ static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx, static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno, unsigned int *wp_ofst) { - sector_t sector = zno * blk_queue_zone_sectors(md->queue); + sector_t sector = zno * bdev_zone_sectors(md->disk->part0); unsigned int noio_flag; struct dm_table *t; int srcu_idx, ret; @@ -361,7 +356,7 @@ static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno, } struct orig_bio_details { - unsigned int op; + enum req_op op; unsigned int nr_sectors; }; @@ -373,7 +368,7 @@ struct orig_bio_details { static bool dm_zone_map_bio_begin(struct mapped_device *md, unsigned int zno, struct bio *clone) { - sector_t zsectors = blk_queue_zone_sectors(md->queue); + sector_t zsectors = bdev_zone_sectors(md->disk->part0); unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); /* @@ -443,7 +438,7 @@ static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int z return BLK_STS_OK; case REQ_OP_ZONE_FINISH: WRITE_ONCE(md->zwp_offset[zno], - blk_queue_zone_sectors(md->queue)); + bdev_zone_sectors(md->disk->part0)); return BLK_STS_OK; case REQ_OP_WRITE_ZEROES: case REQ_OP_WRITE: @@ -466,26 +461,26 @@ static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int z } } -static inline void dm_zone_lock(struct request_queue *q, - unsigned int zno, struct bio *clone) +static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno, + struct bio *clone) { if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))) return; - wait_on_bit_lock_io(q->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE); + wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE); bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED); } -static inline void dm_zone_unlock(struct request_queue *q, - unsigned int zno, struct bio *clone) +static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno, + struct bio *clone) { if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) return; - WARN_ON_ONCE(!test_bit(zno, q->seq_zones_wlock)); - clear_bit_unlock(zno, q->seq_zones_wlock); + WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock)); + clear_bit_unlock(zno, disk->seq_zones_wlock); smp_mb__after_atomic(); - wake_up_bit(q->seq_zones_wlock, zno); + wake_up_bit(disk->seq_zones_wlock, zno); bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED); } @@ -520,7 +515,6 @@ int dm_zone_map_bio(struct dm_target_io *tio) struct dm_io *io = tio->io; struct dm_target *ti = tio->ti; struct mapped_device *md = io->md; - struct request_queue *q = md->queue; struct bio *clone = &tio->clone; struct orig_bio_details orig_bio_details; unsigned int zno; @@ -536,7 +530,7 @@ int dm_zone_map_bio(struct dm_target_io *tio) /* Lock the target zone */ zno = bio_zone_no(clone); - dm_zone_lock(q, zno, clone); + dm_zone_lock(md->disk, zno, clone); orig_bio_details.nr_sectors = bio_sectors(clone); orig_bio_details.op = bio_op(clone); @@ -546,7 +540,7 @@ int dm_zone_map_bio(struct dm_target_io *tio) * both valid, and if the bio is a zone append, remap it to a write. */ if (!dm_zone_map_bio_begin(md, zno, clone)) { - dm_zone_unlock(q, zno, clone); + dm_zone_unlock(md->disk, zno, clone); return DM_MAPIO_KILL; } @@ -570,12 +564,12 @@ int dm_zone_map_bio(struct dm_target_io *tio) sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, *tio->len_ptr); if (sts != BLK_STS_OK) - dm_zone_unlock(q, zno, clone); + dm_zone_unlock(md->disk, zno, clone); break; case DM_MAPIO_REQUEUE: case DM_MAPIO_KILL: default: - dm_zone_unlock(q, zno, clone); + dm_zone_unlock(md->disk, zno, clone); sts = BLK_STS_IOERR; break; } @@ -592,7 +586,7 @@ int dm_zone_map_bio(struct dm_target_io *tio) void dm_zone_endio(struct dm_io *io, struct bio *clone) { struct mapped_device *md = io->md; - struct request_queue *q = md->queue; + struct gendisk *disk = md->disk; struct bio *orig_bio = io->orig_bio; unsigned int zwp_offset; unsigned int zno; @@ -608,7 +602,8 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) */ if (clone->bi_status == BLK_STS_OK && bio_op(clone) == REQ_OP_ZONE_APPEND) { - sector_t mask = (sector_t)blk_queue_zone_sectors(q) - 1; + sector_t mask = + (sector_t)bdev_zone_sectors(disk->part0) - 1; orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask; @@ -649,5 +644,5 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) zwp_offset - bio_sectors(orig_bio); } - dm_zone_unlock(q, zno, clone); + dm_zone_unlock(disk, zno, clone); } diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index d1ea66114d14..0278482fac94 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -737,7 +737,7 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, /* * Read/write a metadata block. */ -static int dmz_rdwr_block(struct dmz_dev *dev, int op, +static int dmz_rdwr_block(struct dmz_dev *dev, enum req_op op, sector_t block, struct page *page) { struct bio *bio; @@ -2045,7 +2045,8 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, * allocated and used to map the chunk. * The zone returned will be set to the active state. */ -struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op) +struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, + unsigned int chunk, enum req_op op) { struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT]; struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data; @@ -2944,7 +2945,9 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, zmd->mblk_shrinker.seeks = DEFAULT_SEEKS; /* Metadata cache shrinker */ - ret = register_shrinker(&zmd->mblk_shrinker); + ret = register_shrinker(&zmd->mblk_shrinker, "md-meta:(%u:%u)", + MAJOR(dev->bdev->bd_dev), + MINOR(dev->bdev->bd_dev)); if (ret) { dmz_zmd_err(zmd, "Register metadata cache shrinker failed"); goto err; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 0ec5d8b9b1a4..95b132b52f33 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -764,8 +764,7 @@ static void dmz_put_zoned_device(struct dm_target *ti) static int dmz_fixup_devices(struct dm_target *ti) { struct dmz_target *dmz = ti->private; - struct dmz_dev *reg_dev, *zoned_dev; - struct request_queue *q; + struct dmz_dev *reg_dev = NULL; sector_t zone_nr_sectors = 0; int i; @@ -780,32 +779,32 @@ static int dmz_fixup_devices(struct dm_target *ti) return -EINVAL; } for (i = 1; i < dmz->nr_ddevs; i++) { - zoned_dev = &dmz->dev[i]; + struct dmz_dev *zoned_dev = &dmz->dev[i]; + struct block_device *bdev = zoned_dev->bdev; + if (zoned_dev->flags & DMZ_BDEV_REGULAR) { ti->error = "Secondary disk is not a zoned device"; return -EINVAL; } - q = bdev_get_queue(zoned_dev->bdev); if (zone_nr_sectors && - zone_nr_sectors != blk_queue_zone_sectors(q)) { + zone_nr_sectors != bdev_zone_sectors(bdev)) { ti->error = "Zone nr sectors mismatch"; return -EINVAL; } - zone_nr_sectors = blk_queue_zone_sectors(q); + zone_nr_sectors = bdev_zone_sectors(bdev); zoned_dev->zone_nr_sectors = zone_nr_sectors; - zoned_dev->nr_zones = - blkdev_nr_zones(zoned_dev->bdev->bd_disk); + zoned_dev->nr_zones = bdev_nr_zones(bdev); } } else { - reg_dev = NULL; - zoned_dev = &dmz->dev[0]; + struct dmz_dev *zoned_dev = &dmz->dev[0]; + struct block_device *bdev = zoned_dev->bdev; + if (zoned_dev->flags & DMZ_BDEV_REGULAR) { ti->error = "Disk is not a zoned device"; return -EINVAL; } - q = bdev_get_queue(zoned_dev->bdev); - zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q); - zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk); + zoned_dev->zone_nr_sectors = bdev_zone_sectors(bdev); + zoned_dev->nr_zones = bdev_nr_zones(bdev); } if (reg_dev) { diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index a02744a0846c..265494d3f711 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -248,7 +248,7 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, unsigned int dev_idx, bool idle); struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, - unsigned int chunk, int op); + unsigned int chunk, enum req_op op); void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *zone); struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd, struct dm_zone *dzone); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2b75f1ef7386..60549b65c799 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -88,10 +88,6 @@ struct clone_info { bool submit_as_polled:1; }; -#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) -#define DM_IO_BIO_OFFSET \ - (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) - static inline struct dm_target_io *clone_to_tio(struct bio *clone) { return container_of(clone, struct dm_target_io, clone); @@ -415,7 +411,7 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, struct block_device **bdev) { - struct dm_target *tgt; + struct dm_target *ti; struct dm_table *map; int r; @@ -426,17 +422,17 @@ retry: return r; /* We only support devices that have a single target */ - if (dm_table_get_num_targets(map) != 1) + if (map->num_targets != 1) return r; - tgt = dm_table_get_target(map, 0); - if (!tgt->type->prepare_ioctl) + ti = dm_table_get_target(map, 0); + if (!ti->type->prepare_ioctl) return r; if (dm_suspended_md(md)) return -EAGAIN; - r = tgt->type->prepare_ioctl(tgt, bdev); + r = ti->type->prepare_ioctl(ti, bdev); if (r == -ENOTCONN && !fatal_signal_pending(current)) { dm_put_live_table(md, *srcu_idx); msleep(10); @@ -578,9 +574,6 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) struct bio *clone; clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs); - /* Set default bdev, but target must bio_set_dev() before issuing IO */ - clone->bi_bdev = md->disk->part0; - tio = clone_to_tio(clone); tio->flags = 0; dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO); @@ -594,7 +587,6 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) atomic_set(&io->io_count, 2); this_cpu_inc(*md->pending_io); io->orig_bio = bio; - io->split_bio = NULL; io->md = md; spin_lock_init(&io->lock); io->start_time = jiffies; @@ -614,6 +606,7 @@ static void free_io(struct dm_io *io) static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) { + struct mapped_device *md = ci->io->md; struct dm_target_io *tio; struct bio *clone; @@ -623,14 +616,10 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, /* alloc_io() already initialized embedded clone */ clone = &tio->clone; } else { - struct mapped_device *md = ci->io->md; - clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->mempools->bs); if (!clone) return NULL; - /* Set default bdev, but target must bio_set_dev() before issuing IO */ - clone->bi_bdev = md->disk->part0; /* REQ_DM_POLL_LIST shouldn't be inherited */ clone->bi_opf &= ~REQ_DM_POLL_LIST; @@ -646,6 +635,11 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, tio->len_ptr = len; tio->old_sector = 0; + /* Set default bdev, but target must bio_set_dev() before issuing IO */ + clone->bi_bdev = md->disk->part0; + if (unlikely(ti->needs_bio_set_dev)) + bio_set_dev(clone, md->disk->part0); + if (len) { clone->bi_iter.bi_size = to_bytes(*len); if (bio_integrity(clone)) @@ -716,7 +710,7 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) } static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md, - int *srcu_idx, unsigned bio_opf) + int *srcu_idx, blk_opf_t bio_opf) { if (bio_opf & REQ_NOWAIT) return dm_get_live_table_fast(md); @@ -725,7 +719,7 @@ static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md, } static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx, - unsigned bio_opf) + blk_opf_t bio_opf) { if (bio_opf & REQ_NOWAIT) dm_put_live_table_fast(md); @@ -758,7 +752,7 @@ static int open_table_device(struct table_device *td, dev_t dev, } td->dm_dev.bdev = bdev; - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off); + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL); return 0; } @@ -884,22 +878,63 @@ static int __noflush_suspending(struct mapped_device *md) return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); } -static void dm_io_complete(struct dm_io *io) +static void dm_requeue_add_io(struct dm_io *io, bool first_stage) { - blk_status_t io_error; struct mapped_device *md = io->md; - struct bio *bio = io->split_bio ? io->split_bio : io->orig_bio; - if (io->status == BLK_STS_DM_REQUEUE) { + if (first_stage) { + struct dm_io *next = md->requeue_list; + + md->requeue_list = io; + io->next = next; + } else { + bio_list_add_head(&md->deferred, io->orig_bio); + } +} + +static void dm_kick_requeue(struct mapped_device *md, bool first_stage) +{ + if (first_stage) + queue_work(md->wq, &md->requeue_work); + else + queue_work(md->wq, &md->work); +} + +/* + * Return true if the dm_io's original bio is requeued. + * io->status is updated with error if requeue disallowed. + */ +static bool dm_handle_requeue(struct dm_io *io, bool first_stage) +{ + struct bio *bio = io->orig_bio; + bool handle_requeue = (io->status == BLK_STS_DM_REQUEUE); + bool handle_polled_eagain = ((io->status == BLK_STS_AGAIN) && + (bio->bi_opf & REQ_POLLED)); + struct mapped_device *md = io->md; + bool requeued = false; + + if (handle_requeue || handle_polled_eagain) { unsigned long flags; + + if (bio->bi_opf & REQ_POLLED) { + /* + * Upper layer won't help us poll split bio + * (io->orig_bio may only reflect a subset of the + * pre-split original) so clear REQ_POLLED. + */ + bio_clear_polled(bio); + } + /* - * Target requested pushing back the I/O. + * Target requested pushing back the I/O or + * polled IO hit BLK_STS_AGAIN. */ spin_lock_irqsave(&md->deferred_lock, flags); - if (__noflush_suspending(md) && - !WARN_ON_ONCE(dm_is_zone_write(md, bio))) { - /* NOTE early return due to BLK_STS_DM_REQUEUE below */ - bio_list_add_head(&md->deferred, bio); + if ((__noflush_suspending(md) && + !WARN_ON_ONCE(dm_is_zone_write(md, bio))) || + handle_polled_eagain || first_stage) { + dm_requeue_add_io(io, first_stage); + requeued = true; } else { /* * noflush suspend was interrupted or this is @@ -910,6 +945,23 @@ static void dm_io_complete(struct dm_io *io) spin_unlock_irqrestore(&md->deferred_lock, flags); } + if (requeued) + dm_kick_requeue(md, first_stage); + + return requeued; +} + +static void __dm_io_complete(struct dm_io *io, bool first_stage) +{ + struct bio *bio = io->orig_bio; + struct mapped_device *md = io->md; + blk_status_t io_error; + bool requeued; + + requeued = dm_handle_requeue(io, first_stage); + if (requeued && first_stage) + return; + io_error = io->status; if (dm_io_flagged(io, DM_IO_ACCOUNTED)) dm_end_io_acct(io); @@ -929,23 +981,9 @@ static void dm_io_complete(struct dm_io *io) if (unlikely(wq_has_sleeper(&md->wait))) wake_up(&md->wait); - if (io_error == BLK_STS_DM_REQUEUE || io_error == BLK_STS_AGAIN) { - if (bio->bi_opf & REQ_POLLED) { - /* - * Upper layer won't help us poll split bio (io->orig_bio - * may only reflect a subset of the pre-split original) - * so clear REQ_POLLED in case of requeue. - */ - bio_clear_polled(bio); - if (io_error == BLK_STS_AGAIN) { - /* io_uring doesn't handle BLK_STS_AGAIN (yet) */ - queue_io(md, bio); - return; - } - } - if (io_error == BLK_STS_DM_REQUEUE) - return; - } + /* Return early if the original bio was requeued */ + if (requeued) + return; if (bio_is_flush_with_data(bio)) { /* @@ -962,6 +1000,58 @@ static void dm_io_complete(struct dm_io *io) } } +static void dm_wq_requeue_work(struct work_struct *work) +{ + struct mapped_device *md = container_of(work, struct mapped_device, + requeue_work); + unsigned long flags; + struct dm_io *io; + + /* reuse deferred lock to simplify dm_handle_requeue */ + spin_lock_irqsave(&md->deferred_lock, flags); + io = md->requeue_list; + md->requeue_list = NULL; + spin_unlock_irqrestore(&md->deferred_lock, flags); + + while (io) { + struct dm_io *next = io->next; + + dm_io_rewind(io, &md->disk->bio_split); + + io->next = NULL; + __dm_io_complete(io, false); + io = next; + } +} + +/* + * Two staged requeue: + * + * 1) io->orig_bio points to the real original bio, and the part mapped to + * this io must be requeued, instead of other parts of the original bio. + * + * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io. + */ +static void dm_io_complete(struct dm_io *io) +{ + bool first_requeue; + + /* + * Only dm_io that has been split needs two stage requeue, otherwise + * we may run into long bio clone chain during suspend and OOM could + * be triggered. + * + * Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they + * also aren't handled via the first stage requeue. + */ + if (dm_io_flagged(io, DM_IO_WAS_SPLIT)) + first_requeue = true; + else + first_requeue = false; + + __dm_io_complete(io, first_requeue); +} + /* * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. @@ -1033,7 +1123,7 @@ static void clone_endio(struct bio *bio) } if (static_branch_unlikely(&zoned_enabled) && - unlikely(blk_queue_is_zoned(bdev_get_queue(bio->bi_bdev)))) + unlikely(bdev_is_zoned(bio->bi_bdev))) dm_zone_endio(io, bio); if (endio) { @@ -1086,23 +1176,18 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector) { sector_t target_offset = dm_target_offset(ti, sector); sector_t len = max_io_len_target_boundary(ti, target_offset); - sector_t max_len; /* * Does the target need to split IO even further? * - varied (per target) IO splitting is a tenet of DM; this * explains why stacked chunk_sectors based splitting via - * blk_max_size_offset() isn't possible here. So pass in - * ti->max_io_len to override stacked chunk_sectors. + * bio_split_to_limits() isn't possible here. */ - if (ti->max_io_len) { - max_len = blk_max_size_offset(ti->table->md->queue, - target_offset, ti->max_io_len); - if (len > max_len) - len = max_len; - } - - return len; + if (!ti->max_io_len) + return len; + return min_t(sector_t, len, + min(queue_max_sectors(ti->table->md->queue), + blk_chunk_sectors_left(target_offset, ti->max_io_len))); } int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) @@ -1245,6 +1330,7 @@ out: void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) { struct dm_target_io *tio = clone_to_tio(bio); + struct dm_io *io = tio->io; unsigned bio_sectors = bio_sectors(bio); BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); @@ -1260,8 +1346,9 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) * __split_and_process_bio() may have already saved mapped part * for accounting but it is being reduced so update accordingly. */ - dm_io_set_flag(tio->io, DM_IO_WAS_SPLIT); - tio->io->sectors = n_sectors; + dm_io_set_flag(io, DM_IO_WAS_SPLIT); + io->sectors = n_sectors; + io->sector_offset = bio_sectors(io->orig_bio); } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); @@ -1384,17 +1471,7 @@ static void setup_split_accounting(struct clone_info *ci, unsigned len) */ dm_io_set_flag(io, DM_IO_WAS_SPLIT); io->sectors = len; - } - - if (static_branch_unlikely(&stats_enabled) && - unlikely(dm_stats_used(&io->md->stats))) { - /* - * Save bi_sector in terms of its offset from end of - * original bio, only needed for DM-stats' benefit. - * - saved regardless of whether split needed so that - * dm_accept_partial_bio() doesn't need to. - */ - io->sector_offset = bio_end_sector(ci->bio) - ci->sector; + io->sector_offset = bio_sectors(ci->bio); } } @@ -1428,11 +1505,11 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, } static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, - unsigned num_bios, unsigned *len) + unsigned int num_bios, unsigned *len) { struct bio_list blist = BIO_EMPTY_LIST; struct bio *clone; - int ret = 0; + unsigned int ret = 0; switch (num_bios) { case 0: @@ -1460,8 +1537,7 @@ static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, static void __send_empty_flush(struct clone_info *ci) { - unsigned target_nr = 0; - struct dm_target *ti; + struct dm_table *t = ci->map; struct bio flush_bio; /* @@ -1476,8 +1552,9 @@ static void __send_empty_flush(struct clone_info *ci) ci->sector_count = 0; ci->io->tio.clone.bi_iter.bi_size = 0; - while ((ti = dm_table_get_target(ci->map, target_nr++))) { - int bios; + for (unsigned int i = 0; i < t->num_targets; i++) { + unsigned int bios; + struct dm_target *ti = dm_table_get_target(t, i); atomic_add(ti->num_flush_bios, &ci->io->io_count); bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); @@ -1497,7 +1574,7 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target unsigned num_bios) { unsigned len; - int bios; + unsigned int bios; len = min_t(sector_t, ci->sector_count, max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); @@ -1516,7 +1593,7 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target static bool is_abnormal_io(struct bio *bio) { - unsigned int op = bio_op(bio); + enum req_op op = bio_op(bio); if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) { switch (op) { @@ -1547,6 +1624,8 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci, case REQ_OP_WRITE_ZEROES: num_bios = ti->num_write_zeroes_bios; break; + default: + break; } /* @@ -1628,7 +1707,7 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci) * Only support bio polling for normal IO, and the target io is * exactly inside the dm_io instance (verified in dm_poll_dm_io) */ - ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED; + ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED); len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); setup_split_accounting(ci, len); @@ -1672,10 +1751,10 @@ static void dm_split_and_process_bio(struct mapped_device *md, is_abnormal = is_abnormal_io(bio); if (unlikely(is_abnormal)) { /* - * Use blk_queue_split() for abnormal IO (e.g. discard, etc) + * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc) * otherwise associated queue_limits won't be imposed. */ - blk_queue_split(&bio); + bio = bio_split_to_limits(bio); } init_clone_info(&ci, md, map, bio, is_abnormal); @@ -1694,11 +1773,9 @@ static void dm_split_and_process_bio(struct mapped_device *md, * Remainder must be passed to submit_bio_noacct() so it gets handled * *after* bios already submitted have been completely processed. */ - WARN_ON_ONCE(!dm_io_flagged(io, DM_IO_WAS_SPLIT)); - io->split_bio = bio_split(bio, io->sectors, GFP_NOIO, - &md->queue->bio_split); - bio_chain(io->split_bio, bio); - trace_block_split(io->split_bio, bio->bi_iter.bi_sector); + bio_trim(bio, io->sectors, ci.sector_count); + trace_block_split(bio, bio->bi_iter.bi_sector); + bio_inc_remaining(bio); submit_bio_noacct(bio); out: /* @@ -1725,7 +1802,7 @@ static void dm_submit_bio(struct bio *bio) struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; int srcu_idx; struct dm_table *map; - unsigned bio_opf = bio->bi_opf; + blk_opf_t bio_opf = bio->bi_opf; map = dm_get_live_table_bio(md, &srcu_idx, bio_opf); @@ -1899,7 +1976,7 @@ static void cleanup_mapped_device(struct mapped_device *md) del_gendisk(md->disk); } dm_queue_destroy_crypto_profile(md->queue); - blk_cleanup_disk(md->disk); + put_disk(md->disk); } if (md->pending_io) { @@ -1974,9 +2051,11 @@ static struct mapped_device *alloc_dev(int minor) init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); + INIT_WORK(&md->requeue_work, dm_wq_requeue_work); init_waitqueue_head(&md->eventq); init_completion(&md->kobj_holder.completion); + md->requeue_list = NULL; md->swap_bios = get_swap_bios(); sema_init(&md->swap_bios_semaphore, md->swap_bios); mutex_init(&md->swap_bios_lock); @@ -2983,54 +3062,6 @@ int dm_noflush_suspending(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_noflush_suspending); -struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned per_io_data_size, unsigned min_pool_size, - bool integrity, bool poll) -{ - struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); - unsigned int pool_size = 0; - unsigned int front_pad, io_front_pad; - int ret; - - if (!pools) - return NULL; - - switch (type) { - case DM_TYPE_BIO_BASED: - case DM_TYPE_DAX_BIO_BASED: - pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); - front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; - io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; - ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, poll ? BIOSET_PERCPU_CACHE : 0); - if (ret) - goto out; - if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) - goto out; - break; - case DM_TYPE_REQUEST_BASED: - pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size); - front_pad = offsetof(struct dm_rq_clone_bio_info, clone); - /* per_io_data_size is used for blk-mq pdu at queue allocation */ - break; - default: - BUG(); - } - - ret = bioset_init(&pools->bs, pool_size, front_pad, 0); - if (ret) - goto out; - - if (integrity && bioset_integrity_create(&pools->bs, pool_size)) - goto out; - - return pools; - -out: - dm_free_md_mempools(pools); - - return NULL; -} - void dm_free_md_mempools(struct dm_md_mempools *pools) { if (!pools) @@ -3046,11 +3077,14 @@ struct dm_pr { u64 old_key; u64 new_key; u32 flags; + bool abort; bool fail_early; + int ret; + enum pr_type type; }; static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, - void *data) + struct dm_pr *pr) { struct mapped_device *md = bdev->bd_disk->private_data; struct dm_table *table; @@ -3062,15 +3096,21 @@ static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, goto out; /* We only support devices that have a single target */ - if (dm_table_get_num_targets(table) != 1) + if (table->num_targets != 1) goto out; ti = dm_table_get_target(table, 0); + if (dm_suspended_md(md)) { + ret = -EAGAIN; + goto out; + } + ret = -EINVAL; if (!ti->type->iterate_devices) goto out; - ret = ti->type->iterate_devices(ti, fn, data); + ti->type->iterate_devices(ti, fn, pr); + ret = 0; out: dm_put_live_table(md, srcu_idx); return ret; @@ -3084,10 +3124,24 @@ static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev, { struct dm_pr *pr = data; const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; + int ret; - if (!ops || !ops->pr_register) - return -EOPNOTSUPP; - return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); + if (!ops || !ops->pr_register) { + pr->ret = -EOPNOTSUPP; + return -1; + } + + ret = ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); + if (!ret) + return 0; + + if (!pr->ret) + pr->ret = ret; + + if (pr->fail_early) + return -1; + + return 0; } static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, @@ -3098,82 +3152,145 @@ static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, .new_key = new_key, .flags = flags, .fail_early = true, + .ret = 0, }; int ret; ret = dm_call_pr(bdev, __dm_pr_register, &pr); - if (ret && new_key) { - /* unregister all paths if we failed to register any path */ - pr.old_key = new_key; - pr.new_key = 0; - pr.flags = 0; - pr.fail_early = false; - dm_call_pr(bdev, __dm_pr_register, &pr); + if (ret) { + /* Didn't even get to register a path */ + return ret; } + if (!pr.ret) + return 0; + ret = pr.ret; + + if (!new_key) + return ret; + + /* unregister all paths if we failed to register any path */ + pr.old_key = new_key; + pr.new_key = 0; + pr.flags = 0; + pr.fail_early = false; + (void) dm_call_pr(bdev, __dm_pr_register, &pr); return ret; } + +static int __dm_pr_reserve(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_pr *pr = data; + const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; + + if (!ops || !ops->pr_reserve) { + pr->ret = -EOPNOTSUPP; + return -1; + } + + pr->ret = ops->pr_reserve(dev->bdev, pr->old_key, pr->type, pr->flags); + if (!pr->ret) + return -1; + + return 0; +} + static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, u32 flags) { - struct mapped_device *md = bdev->bd_disk->private_data; - const struct pr_ops *ops; - int r, srcu_idx; + struct dm_pr pr = { + .old_key = key, + .flags = flags, + .type = type, + .fail_early = false, + .ret = 0, + }; + int ret; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); - if (r < 0) - goto out; + ret = dm_call_pr(bdev, __dm_pr_reserve, &pr); + if (ret) + return ret; - ops = bdev->bd_disk->fops->pr_ops; - if (ops && ops->pr_reserve) - r = ops->pr_reserve(bdev, key, type, flags); - else - r = -EOPNOTSUPP; -out: - dm_unprepare_ioctl(md, srcu_idx); - return r; + return pr.ret; +} + +/* + * If there is a non-All Registrants type of reservation, the release must be + * sent down the holding path. For the cases where there is no reservation or + * the path is not the holder the device will also return success, so we must + * try each path to make sure we got the correct path. + */ +static int __dm_pr_release(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_pr *pr = data; + const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; + + if (!ops || !ops->pr_release) { + pr->ret = -EOPNOTSUPP; + return -1; + } + + pr->ret = ops->pr_release(dev->bdev, pr->old_key, pr->type); + if (pr->ret) + return -1; + + return 0; } static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { - struct mapped_device *md = bdev->bd_disk->private_data; - const struct pr_ops *ops; - int r, srcu_idx; + struct dm_pr pr = { + .old_key = key, + .type = type, + .fail_early = false, + }; + int ret; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); - if (r < 0) - goto out; + ret = dm_call_pr(bdev, __dm_pr_release, &pr); + if (ret) + return ret; - ops = bdev->bd_disk->fops->pr_ops; - if (ops && ops->pr_release) - r = ops->pr_release(bdev, key, type); - else - r = -EOPNOTSUPP; -out: - dm_unprepare_ioctl(md, srcu_idx); - return r; + return pr.ret; +} + +static int __dm_pr_preempt(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_pr *pr = data; + const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; + + if (!ops || !ops->pr_preempt) { + pr->ret = -EOPNOTSUPP; + return -1; + } + + pr->ret = ops->pr_preempt(dev->bdev, pr->old_key, pr->new_key, pr->type, + pr->abort); + if (!pr->ret) + return -1; + + return 0; } static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, enum pr_type type, bool abort) { - struct mapped_device *md = bdev->bd_disk->private_data; - const struct pr_ops *ops; - int r, srcu_idx; + struct dm_pr pr = { + .new_key = new_key, + .old_key = old_key, + .type = type, + .fail_early = false, + }; + int ret; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); - if (r < 0) - goto out; + ret = dm_call_pr(bdev, __dm_pr_preempt, &pr); + if (ret) + return ret; - ops = bdev->bd_disk->fops->pr_ops; - if (ops && ops->pr_preempt) - r = ops->pr_preempt(bdev, old_key, new_key, type, abort); - else - r = -EOPNOTSUPP; -out: - dm_unprepare_ioctl(md, srcu_idx); - return r; + return pr.ret; } static int dm_pr_clear(struct block_device *bdev, u64 key) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a8405ce305a9..5201df03ce40 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -53,7 +53,6 @@ struct dm_io; *---------------------------------------------------------------*/ void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); -struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); bool dm_table_has_no_data_devices(struct dm_table *table); int dm_calculate_queue_limits(struct dm_table *table, @@ -218,9 +217,6 @@ void dm_kcopyd_exit(void); /* * Mempool operations */ -struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned per_io_data_size, unsigned min_pool_size, - bool integrity, bool poll); void dm_free_md_mempools(struct dm_md_mempools *pools); /* diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index 2cf973722f59..91836e6de326 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -125,7 +125,6 @@ static void __init md_setup_drive(struct md_setup_args *args) char *devname = args->device_names; dev_t devices[MD_SB_DISKS + 1], mdev; struct mdu_array_info_s ainfo = { }; - struct block_device *bdev; struct mddev *mddev; int err = 0, i; char name[16]; @@ -169,24 +168,16 @@ static void __init md_setup_drive(struct md_setup_args *args) pr_info("md: Loading %s: %s\n", name, args->device_names); - bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { - pr_err("md: open failed - cannot start array %s\n", name); + mddev = md_alloc(mdev, name); + if (IS_ERR(mddev)) { + pr_err("md: md_alloc failed - cannot start array %s\n", name); return; } - err = -EIO; - if (WARN(bdev->bd_disk->fops != &md_fops, - "Opening block device %x resulted in non-md device\n", - mdev)) - goto out_blkdev_put; - - mddev = bdev->bd_disk->private_data; - err = mddev_lock(mddev); if (err) { pr_err("md: failed to lock array %s\n", name); - goto out_blkdev_put; + goto out_mddev_put; } if (!list_empty(&mddev->disks) || mddev->raid_disks) { @@ -230,8 +221,8 @@ static void __init md_setup_drive(struct md_setup_args *args) pr_warn("md: starting %s failed\n", name); out_unlock: mddev_unlock(mddev); -out_blkdev_put: - blkdev_put(bdev, FMODE_READ); +out_mddev_put: + mddev_put(mddev); } static int __init raid_setup(char *str) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index d87f674ab762..bf6dffadbe6f 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -165,7 +165,7 @@ static int read_sb_page(struct mddev *mddev, loff_t offset, if (sync_page_io(rdev, target, roundup(size, bdev_logical_block_size(rdev->bdev)), - page, REQ_OP_READ, 0, true)) { + page, REQ_OP_READ, true)) { page->index = index; return 0; } @@ -302,7 +302,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) atomic_inc(&bitmap->pending_writes); set_buffer_locked(bh); set_buffer_mapped(bh); - submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); bh = bh->b_this_page; } @@ -394,7 +394,7 @@ static int read_page(struct file *file, unsigned long index, atomic_inc(&bitmap->pending_writes); set_buffer_locked(bh); set_buffer_mapped(bh); - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); } blk_cur++; bh = bh->b_this_page; diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 37cbcce3cc66..742b2349fea3 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -40,7 +40,7 @@ struct resync_info { /* Lock the send communication. This is done through * bit manipulation as opposed to a mutex in order to - * accomodate lock and hold. See next comment. + * accommodate lock and hold. See next comment. */ #define MD_CLUSTER_SEND_LOCK 4 /* If cluster operations (such as adding a disk) must lock the @@ -689,7 +689,7 @@ static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) /* * If resync thread run after raid1d thread, then process_metadata_update * could not continue if raid1d held reconfig_mutex (and raid1d is blocked - * since another node already got EX on Token and waitting the EX of Ack), + * since another node already got EX on Token and waiting the EX of Ack), * so let resync wake up thread in case flag is set. */ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, diff --git a/drivers/md/md.c b/drivers/md/md.c index c7ecb0bffda0..afaf36b2f6ab 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -368,28 +368,6 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); -/* - * iterates through all used mddevs in the system. - * We take care to grab the all_mddevs_lock whenever navigating - * the list, and to always hold a refcount when unlocked. - * Any code which breaks out of this loop while own - * a reference to the current mddev and must mddev_put it. - */ -#define for_each_mddev(_mddev,_tmp) \ - \ - for (({ spin_lock(&all_mddevs_lock); \ - _tmp = all_mddevs.next; \ - _mddev = NULL;}); \ - ({ if (_tmp != &all_mddevs) \ - mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ - spin_unlock(&all_mddevs_lock); \ - if (_mddev) mddev_put(_mddev); \ - _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ - _tmp != &all_mddevs;}); \ - ({ spin_lock(&all_mddevs_lock); \ - _tmp = _tmp->next;}) \ - ) - /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -464,7 +442,7 @@ static void md_submit_bio(struct bio *bio) return; } - blk_queue_split(&bio); + bio = bio_split_to_limits(bio); if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) @@ -647,13 +625,17 @@ EXPORT_SYMBOL(md_flush_request); static inline struct mddev *mddev_get(struct mddev *mddev) { + lockdep_assert_held(&all_mddevs_lock); + + if (test_bit(MD_DELETED, &mddev->flags)) + return NULL; atomic_inc(&mddev->active); return mddev; } static void mddev_delayed_delete(struct work_struct *ws); -static void mddev_put(struct mddev *mddev) +void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; @@ -661,7 +643,7 @@ static void mddev_put(struct mddev *mddev) mddev->ctime == 0 && !mddev->hold_active) { /* Array is not configured at all, and not held active, * so destroy it */ - list_del_init(&mddev->all_mddevs); + set_bit(MD_DELETED, &mddev->flags); /* * Call queue_work inside the spinlock so that @@ -678,7 +660,6 @@ static void md_safemode_timeout(struct timer_list *t); void mddev_init(struct mddev *mddev) { - kobject_init(&mddev->kobj, &md_ktype); mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->bitmap_info.mutex); @@ -733,22 +714,6 @@ static dev_t mddev_alloc_unit(void) return dev; } -static struct mddev *mddev_find(dev_t unit) -{ - struct mddev *mddev; - - if (MAJOR(unit) != MD_MAJOR) - unit &= ~((1 << MdpMinorShift) - 1); - - spin_lock(&all_mddevs_lock); - mddev = mddev_find_locked(unit); - if (mddev) - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - - return mddev; -} - static struct mddev *mddev_alloc(dev_t unit) { struct mddev *new; @@ -791,6 +756,15 @@ out_free_new: return ERR_PTR(error); } +static void mddev_free(struct mddev *mddev) +{ + spin_lock(&all_mddevs_lock); + list_del(&mddev->all_mddevs); + spin_unlock(&all_mddevs_lock); + + kfree(mddev); +} + static const struct attribute_group md_redundancy_group; void mddev_unlock(struct mddev *mddev) @@ -993,15 +967,15 @@ int md_super_wait(struct mddev *mddev) } int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, - struct page *page, int op, int op_flags, bool metadata_op) + struct page *page, blk_opf_t opf, bool metadata_op) { struct bio bio; struct bio_vec bvec; if (metadata_op && rdev->meta_bdev) - bio_init(&bio, rdev->meta_bdev, &bvec, 1, op | op_flags); + bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); else - bio_init(&bio, rdev->bdev, &bvec, 1, op | op_flags); + bio_init(&bio, rdev->bdev, &bvec, 1, opf); if (metadata_op) bio.bi_iter.bi_sector = sector + rdev->sb_start; @@ -1024,7 +998,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size) if (rdev->sb_loaded) return 0; - if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) + if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) goto fail; rdev->sb_loaded = 1; return 0; @@ -1722,7 +1696,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ return -EINVAL; bb_sector = (long long)offset; if (!sync_page_io(rdev, bb_sector, sectors << 9, - rdev->bb_page, REQ_OP_READ, 0, true)) + rdev->bb_page, REQ_OP_READ, true)) return -EIO; bbp = (__le64 *)page_address(rdev->bb_page); rdev->badblocks.shift = sb->bblog_shift; @@ -2438,7 +2412,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) mdname(mddev), mddev->max_disks); return -EBUSY; } - bdevname(rdev->bdev,b); + snprintf(b, sizeof(b), "%pg", rdev->bdev); strreplace(b, '/', '!'); rdev->mddev = mddev; @@ -3335,14 +3309,35 @@ rdev_size_show(struct md_rdev *rdev, char *page) return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } -static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) +static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) { /* check if two start/length pairs overlap */ - if (s1+l1 <= s2) - return 0; - if (s2+l2 <= s1) - return 0; - return 1; + if (a->data_offset + a->sectors <= b->data_offset) + return false; + if (b->data_offset + b->sectors <= a->data_offset) + return false; + return true; +} + +static bool md_rdev_overlaps(struct md_rdev *rdev) +{ + struct mddev *mddev; + struct md_rdev *rdev2; + + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { + if (test_bit(MD_DELETED, &mddev->flags)) + continue; + rdev_for_each(rdev2, mddev) { + if (rdev != rdev2 && rdev->bdev == rdev2->bdev && + md_rdevs_overlap(rdev, rdev2)) { + spin_unlock(&all_mddevs_lock); + return true; + } + } + } + spin_unlock(&all_mddevs_lock); + return false; } static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) @@ -3394,46 +3389,21 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) return -EINVAL; /* component must fit device */ rdev->sectors = sectors; - if (sectors > oldsectors && my_mddev->external) { - /* Need to check that all other rdevs with the same - * ->bdev do not overlap. 'rcu' is sufficient to walk - * the rdev lists safely. - * This check does not provide a hard guarantee, it - * just helps avoid dangerous mistakes. - */ - struct mddev *mddev; - int overlap = 0; - struct list_head *tmp; - - rcu_read_lock(); - for_each_mddev(mddev, tmp) { - struct md_rdev *rdev2; - rdev_for_each(rdev2, mddev) - if (rdev->bdev == rdev2->bdev && - rdev != rdev2 && - overlaps(rdev->data_offset, rdev->sectors, - rdev2->data_offset, - rdev2->sectors)) { - overlap = 1; - break; - } - if (overlap) { - mddev_put(mddev); - break; - } - } - rcu_read_unlock(); - if (overlap) { - /* Someone else could have slipped in a size - * change here, but doing so is just silly. - * We put oldsectors back because we *know* it is - * safe, and trust userspace not to race with - * itself - */ - rdev->sectors = oldsectors; - return -EBUSY; - } + /* + * Check that all other rdevs with the same bdev do not overlap. This + * check does not provide a hard guarantee, it just helps avoid + * dangerous mistakes. + */ + if (sectors > oldsectors && my_mddev->external && + md_rdev_overlaps(rdev)) { + /* + * Someone else could have slipped in a size change here, but + * doing so is just silly. We put oldsectors back because we + * know it is safe, and trust userspace not to race with itself. + */ + rdev->sectors = oldsectors; + return -EBUSY; } return len; } @@ -4830,6 +4800,19 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { + sector_t save_rp = mddev->reshape_position; + + mddev_unlock(mddev); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); + mddev_lock_nointr(mddev); + /* + * set RECOVERY_INTR again and restore reshape + * position in case others changed them after + * got lock, eg, reshape_position_store and + * md_check_recovery. + */ + mddev->reshape_position = save_rp; set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); } @@ -5001,7 +4984,7 @@ static ssize_t sync_speed_show(struct mddev *mddev, char *page) { unsigned long resync, dt, db; - if (mddev->curr_resync == 0) + if (mddev->curr_resync == MD_RESYNC_NONE) return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; @@ -5020,8 +5003,8 @@ sync_completed_show(struct mddev *mddev, char *page) if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); - if (mddev->curr_resync == 1 || - mddev->curr_resync == 2) + if (mddev->curr_resync == MD_RESYNC_YIELDED || + mddev->curr_resync == MD_RESYNC_DELAYED) return sprintf(page, "delayed\n"); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || @@ -5532,11 +5515,10 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; spin_lock(&all_mddevs_lock); - if (list_empty(&mddev->all_mddevs)) { + if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } - mddev_get(mddev); spin_unlock(&all_mddevs_lock); rv = entry->show(mddev, page); @@ -5557,18 +5539,17 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, if (!capable(CAP_SYS_ADMIN)) return -EACCES; spin_lock(&all_mddevs_lock); - if (list_empty(&mddev->all_mddevs)) { + if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } - mddev_get(mddev); spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); mddev_put(mddev); return rv; } -static void md_free(struct kobject *ko) +static void md_kobj_release(struct kobject *ko) { struct mddev *mddev = container_of(ko, struct mddev, kobj); @@ -5577,15 +5558,8 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_level) sysfs_put(mddev->sysfs_level); - if (mddev->gendisk) { - del_gendisk(mddev->gendisk); - blk_cleanup_disk(mddev->gendisk); - } - percpu_ref_exit(&mddev->writes_pending); - - bioset_exit(&mddev->bio_set); - bioset_exit(&mddev->sync_set); - kfree(mddev); + del_gendisk(mddev->gendisk); + put_disk(mddev->gendisk); } static const struct sysfs_ops md_sysfs_ops = { @@ -5593,7 +5567,7 @@ static const struct sysfs_ops md_sysfs_ops = { .store = md_attr_store, }; static struct kobj_type md_ktype = { - .release = md_free, + .release = md_kobj_release, .sysfs_ops = &md_sysfs_ops, .default_groups = md_attr_groups, }; @@ -5604,7 +5578,6 @@ static void mddev_delayed_delete(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - kobject_del(&mddev->kobj); kobject_put(&mddev->kobj); } @@ -5623,7 +5596,7 @@ int mddev_init_writes_pending(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_init_writes_pending); -static int md_alloc(dev_t dev, char *name) +struct mddev *md_alloc(dev_t dev, char *name) { /* * If dev is zero, name is the name of a device to allocate with @@ -5651,8 +5624,8 @@ static int md_alloc(dev_t dev, char *name) mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); if (IS_ERR(mddev)) { - mutex_unlock(&disks_mutex); - return PTR_ERR(mddev); + error = PTR_ERR(mddev); + goto out_unlock; } partitioned = (MAJOR(mddev->unit) != MD_MAJOR); @@ -5670,7 +5643,7 @@ static int md_alloc(dev_t dev, char *name) strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); error = -EEXIST; - goto out_unlock_disks_mutex; + goto out_free_mddev; } spin_unlock(&all_mddevs_lock); } @@ -5683,7 +5656,7 @@ static int md_alloc(dev_t dev, char *name) error = -ENOMEM; disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) - goto out_unlock_disks_mutex; + goto out_free_mddev; disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; @@ -5704,25 +5677,45 @@ static int md_alloc(dev_t dev, char *name) mddev->gendisk = disk; error = add_disk(disk); if (error) - goto out_cleanup_disk; + goto out_put_disk; + kobject_init(&mddev->kobj, &md_ktype); error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); - if (error) - goto out_del_gendisk; + if (error) { + /* + * The disk is already live at this point. Clear the hold flag + * and let mddev_put take care of the deletion, as it isn't any + * different from a normal close on last release now. + */ + mddev->hold_active = 0; + mutex_unlock(&disks_mutex); + mddev_put(mddev); + return ERR_PTR(error); + } kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); - goto out_unlock_disks_mutex; + mutex_unlock(&disks_mutex); + return mddev; -out_del_gendisk: - del_gendisk(disk); -out_cleanup_disk: - blk_cleanup_disk(disk); -out_unlock_disks_mutex: +out_put_disk: + put_disk(disk); +out_free_mddev: + mddev_free(mddev); +out_unlock: mutex_unlock(&disks_mutex); + return ERR_PTR(error); +} + +static int md_alloc_and_put(dev_t dev, char *name) +{ + struct mddev *mddev = md_alloc(dev, name); + + if (IS_ERR(mddev)) + return PTR_ERR(mddev); mddev_put(mddev); - return error; + return 0; } static void md_probe(dev_t dev) @@ -5730,7 +5723,7 @@ static void md_probe(dev_t dev) if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) return; if (create_on_open) - md_alloc(dev, NULL); + md_alloc_and_put(dev, NULL); } static int add_named_array(const char *val, const struct kernel_param *kp) @@ -5752,12 +5745,12 @@ static int add_named_array(const char *val, const struct kernel_param *kp) return -E2BIG; strscpy(buf, val, len+1); if (strncmp(buf, "md_", 3) == 0) - return md_alloc(0, buf); + return md_alloc_and_put(0, buf); if (strncmp(buf, "md", 2) == 0 && isdigit(buf[2]) && kstrtoul(buf+2, 10, &devnum) == 0 && devnum <= MINORMASK) - return md_alloc(MKDEV(MD_MAJOR, devnum), NULL); + return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); return -EINVAL; } @@ -6197,6 +6190,7 @@ static void __md_stop_writes(struct mddev *mddev) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } @@ -6244,11 +6238,11 @@ static void mddev_detach(struct mddev *mddev) static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - md_bitmap_destroy(mddev); mddev_detach(mddev); /* Ensure ->event_work is done */ if (mddev->event_work.func) flush_workqueue(md_misc_wq); + md_bitmap_destroy(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; spin_unlock(&mddev->lock); @@ -6497,9 +6491,8 @@ static void autorun_devices(int part) break; } - md_probe(dev); - mddev = mddev_find(dev); - if (!mddev) + mddev = md_alloc(dev, NULL); + if (IS_ERR(mddev)) break; if (mddev_lock(mddev)) @@ -7782,45 +7775,33 @@ out_unlock: static int md_open(struct block_device *bdev, fmode_t mode) { - /* - * Succeed if we can lock the mddev, which confirms that - * it isn't being stopped right now. - */ - struct mddev *mddev = mddev_find(bdev->bd_dev); + struct mddev *mddev; int err; + spin_lock(&all_mddevs_lock); + mddev = mddev_get(bdev->bd_disk->private_data); + spin_unlock(&all_mddevs_lock); if (!mddev) return -ENODEV; - if (mddev->gendisk != bdev->bd_disk) { - /* we are racing with mddev_put which is discarding this - * bd_disk. - */ - mddev_put(mddev); - /* Wait until bdev->bd_disk is definitely gone */ - if (work_pending(&mddev->del_work)) - flush_workqueue(md_misc_wq); - return -EBUSY; - } - BUG_ON(mddev != bdev->bd_disk->private_data); - - if ((err = mutex_lock_interruptible(&mddev->open_mutex))) + err = mutex_lock_interruptible(&mddev->open_mutex); + if (err) goto out; - if (test_bit(MD_CLOSING, &mddev->flags)) { - mutex_unlock(&mddev->open_mutex); - err = -ENODEV; - goto out; - } + err = -ENODEV; + if (test_bit(MD_CLOSING, &mddev->flags)) + goto out_unlock; - err = 0; atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); bdev_check_media_change(bdev); - out: - if (err) - mddev_put(mddev); + return 0; + +out_unlock: + mutex_unlock(&mddev->open_mutex); +out: + mddev_put(mddev); return err; } @@ -7844,6 +7825,17 @@ static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) return ret; } +static void md_free_disk(struct gendisk *disk) +{ + struct mddev *mddev = disk->private_data; + + percpu_ref_exit(&mddev->writes_pending); + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); + + mddev_free(mddev); +} + const struct block_device_operations md_fops = { .owner = THIS_MODULE, @@ -7857,6 +7849,7 @@ const struct block_device_operations md_fops = .getgeo = md_getgeo, .check_events = md_check_events, .set_read_only = md_set_read_only, + .free_disk = md_free_disk, }; static int md_thread(void *arg) @@ -8018,16 +8011,26 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) max_sectors = mddev->dev_sectors; resync = mddev->curr_resync; - if (resync <= 3) { + if (resync < MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) /* Still cleaning up */ resync = max_sectors; - } else if (resync > max_sectors) + } else if (resync > max_sectors) { resync = max_sectors; - else + } else { resync -= atomic_read(&mddev->recovery_active); + if (resync < MD_RESYNC_ACTIVE) { + /* + * Resync has started, but the subtraction has + * yielded one of the special values. Force it + * to active to ensure the status reports an + * active resync. + */ + resync = MD_RESYNC_ACTIVE; + } + } - if (resync == 0) { + if (resync == MD_RESYNC_NONE) { if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { struct md_rdev *rdev; @@ -8051,7 +8054,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) } return 0; } - if (resync < 3) { + if (resync < MD_RESYNC_ACTIVE) { seq_printf(seq, "\tresync=DELAYED"); return 1; } @@ -8152,6 +8155,8 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) if (!l--) { mddev = list_entry(tmp, struct mddev, all_mddevs); mddev_get(mddev); + if (!mddev_get(mddev)) + continue; spin_unlock(&all_mddevs_lock); return mddev; } @@ -8165,25 +8170,35 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct list_head *tmp; struct mddev *next_mddev, *mddev = v; + struct mddev *to_put = NULL; ++*pos; if (v == (void*)2) return NULL; spin_lock(&all_mddevs_lock); - if (v == (void*)1) + if (v == (void*)1) { tmp = all_mddevs.next; - else + } else { + to_put = mddev; + tmp = mddev->all_mddevs.next; + } + + for (;;) { + if (tmp == &all_mddevs) { + next_mddev = (void*)2; + *pos = 0x10000; + break; + } + next_mddev = list_entry(tmp, struct mddev, all_mddevs); + if (mddev_get(next_mddev)) + break; + mddev = next_mddev; tmp = mddev->all_mddevs.next; - if (tmp != &all_mddevs) - next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); - else { - next_mddev = (void*)2; - *pos = 0x10000; } spin_unlock(&all_mddevs_lock); - if (v != (void*)1) + if (to_put) mddev_put(mddev); return next_mddev; @@ -8682,7 +8697,6 @@ void md_do_sync(struct md_thread *thread) unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; - struct list_head *tmp; sector_t last_check; int skipped = 0; struct md_rdev *rdev; @@ -8729,13 +8743,7 @@ void md_do_sync(struct md_thread *thread) mddev->last_sync_action = action ?: desc; - /* we overload curr_resync somewhat here. - * 0 == not engaged in resync at all - * 2 == checking that there is no conflict with another sync - * 1 == like 2, but have yielded to allow conflicting resync to - * commence - * other == active in resync - this many blocks - * + /* * Before starting a resync we must have set curr_resync to * 2, and then checked that every "conflicting" array has curr_resync * less than ours. When we find one that is the same or higher @@ -8747,24 +8755,29 @@ void md_do_sync(struct md_thread *thread) do { int mddev2_minor = -1; - mddev->curr_resync = 2; + mddev->curr_resync = MD_RESYNC_DELAYED; try_again: if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; - for_each_mddev(mddev2, tmp) { + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { + if (test_bit(MD_DELETED, &mddev2->flags)) + continue; if (mddev2 == mddev) continue; if (!mddev->parallel_resync && mddev2->curr_resync && match_mddev_units(mddev, mddev2)) { DEFINE_WAIT(wq); - if (mddev < mddev2 && mddev->curr_resync == 2) { + if (mddev < mddev2 && + mddev->curr_resync == MD_RESYNC_DELAYED) { /* arbitrarily yield */ - mddev->curr_resync = 1; + mddev->curr_resync = MD_RESYNC_YIELDED; wake_up(&resync_wait); } - if (mddev > mddev2 && mddev->curr_resync == 1) + if (mddev > mddev2 && + mddev->curr_resync == MD_RESYNC_YIELDED) /* no need to wait here, we can wait the next * time 'round when curr_resync == 2 */ @@ -8782,7 +8795,8 @@ void md_do_sync(struct md_thread *thread) desc, mdname(mddev), mdname(mddev2)); } - mddev_put(mddev2); + spin_unlock(&all_mddevs_lock); + if (signal_pending(current)) flush_signals(current); schedule(); @@ -8792,7 +8806,8 @@ void md_do_sync(struct md_thread *thread) finish_wait(&resync_wait, &wq); } } - } while (mddev->curr_resync < 2); + spin_unlock(&all_mddevs_lock); + } while (mddev->curr_resync < MD_RESYNC_DELAYED); j = 0; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { @@ -8876,7 +8891,7 @@ void md_do_sync(struct md_thread *thread) desc, mdname(mddev)); mddev->curr_resync = j; } else - mddev->curr_resync = 3; /* no longer delayed */ + mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ mddev->curr_resync_completed = j; sysfs_notify_dirent_safe(mddev->sysfs_completed); md_new_event(); @@ -9011,14 +9026,14 @@ void md_do_sync(struct md_thread *thread) if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - mddev->curr_resync > 3) { + mddev->curr_resync >= MD_RESYNC_ACTIVE) { mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && - mddev->curr_resync > 3) { + mddev->curr_resync >= MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->recovery_cp) { @@ -9082,7 +9097,7 @@ void md_do_sync(struct md_thread *thread) } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; set_bit(MD_RECOVERY_DONE, &mddev->recovery); - mddev->curr_resync = 0; + mddev->curr_resync = MD_RESYNC_NONE; spin_unlock(&mddev->lock); wake_up(&resync_wait); @@ -9303,6 +9318,7 @@ void md_check_recovery(struct mddev *mddev) * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -9338,6 +9354,7 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } if (mddev->sync_thread) { + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); goto unlock; } @@ -9417,8 +9434,7 @@ void md_reap_sync_thread(struct mddev *mddev) sector_t old_dev_sectors = mddev->dev_sectors; bool is_reshaped = false; - /* resync has finished, collect result */ - md_unregister_thread(&mddev->sync_thread); + /* sync_thread should be unregistered, collect result */ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { @@ -9466,6 +9482,7 @@ void md_reap_sync_thread(struct mddev *mddev) wake_up(&resync_wait); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_completed); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(); if (mddev->event_work.func) @@ -9544,11 +9561,14 @@ EXPORT_SYMBOL_GPL(rdev_clear_badblocks); static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { - struct list_head *tmp; - struct mddev *mddev; + struct mddev *mddev, *n; int need_delay = 0; - for_each_mddev(mddev, tmp) { + spin_lock(&all_mddevs_lock); + list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + if (!mddev_get(mddev)) + continue; + spin_unlock(&all_mddevs_lock); if (mddev_trylock(mddev)) { if (mddev->pers) __md_stop_writes(mddev); @@ -9557,7 +9577,11 @@ static int md_notify_reboot(struct notifier_block *this, mddev_unlock(mddev); } need_delay = 1; + mddev_put(mddev); + spin_lock(&all_mddevs_lock); } + spin_unlock(&all_mddevs_lock); + /* * certain more exotic SCSI devices are known to be * volatile wrt too early system reboots. While the @@ -9876,8 +9900,7 @@ void md_autostart_arrays(int part) static __exit void md_exit(void) { - struct mddev *mddev; - struct list_head *tmp; + struct mddev *mddev, *n; int delay = 1; unregister_blkdev(MD_MAJOR,"md"); @@ -9897,17 +9920,24 @@ static __exit void md_exit(void) } remove_proc_entry("mdstat", NULL); - for_each_mddev(mddev, tmp) { + spin_lock(&all_mddevs_lock); + list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + if (!mddev_get(mddev)) + continue; + spin_unlock(&all_mddevs_lock); export_array(mddev); mddev->ctime = 0; mddev->hold_active = 0; /* - * for_each_mddev() will call mddev_put() at the end of each - * iteration. As the mddev is now fully clear, this will - * schedule the mddev for destruction by a workqueue, and the + * As the mddev is now fully clear, mddev_put will schedule + * the mddev for destruction by a workqueue, and the * destroy_workqueue() below will wait for that to complete. */ + mddev_put(mddev); + spin_lock(&all_mddevs_lock); } + spin_unlock(&all_mddevs_lock); + destroy_workqueue(md_rdev_misc_wq); destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); diff --git a/drivers/md/md.h b/drivers/md/md.h index cf2cbb17acbd..b4e2d8b87b61 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -254,6 +254,7 @@ struct md_cluster_info; * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that * array is ready yet. * @MD_BROKEN: This is used to stop writes and mark array as failed. + * @MD_DELETED: This device is being deleted * * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ @@ -270,6 +271,7 @@ enum mddev_flags { MD_UPDATING_SB, MD_NOT_READY, MD_BROKEN, + MD_DELETED, }; enum mddev_sb_flags { @@ -288,6 +290,21 @@ struct serial_info { sector_t _subtree_last; /* highest sector in subtree of rb node */ }; +/* + * mddev->curr_resync stores the current sector of the resync but + * also has some overloaded values. + */ +enum { + /* No resync in progress */ + MD_RESYNC_NONE = 0, + /* Yielded to allow another conflicting resync to commence */ + MD_RESYNC_YIELDED = 1, + /* Delayed to check that there is no conflict with another sync */ + MD_RESYNC_DELAYED = 2, + /* Any value greater than or equal to this is in an active resync */ + MD_RESYNC_ACTIVE = 3, +}; + struct mddev { void *private; struct md_personality *pers; @@ -738,8 +755,7 @@ extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page); extern int md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, - struct page *page, int op, int op_flags, - bool metadata_op); + struct page *page, blk_opf_t opf, bool metadata_op); extern void md_do_sync(struct md_thread *thread); extern void md_new_event(void); extern void md_allow_write(struct mddev *mddev); @@ -751,6 +767,8 @@ extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void mddev_init(struct mddev *mddev); +struct mddev *md_alloc(dev_t dev, char *name); +void mddev_put(struct mddev *mddev); extern int md_run(struct mddev *mddev); extern int md_start(struct mddev *mddev); extern void md_stop(struct mddev *mddev); diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 54c089a50b15..11935864f50f 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -391,7 +391,8 @@ struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread, sizeof(struct buffer_aux), dm_block_manager_alloc_callback, - dm_block_manager_write_callback); + dm_block_manager_write_callback, + 0); if (IS_ERR(bm->bufio)) { r = PTR_ERR(bm->bufio); kfree(bm); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 258d4eb2d63c..05d8438cfec8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1220,8 +1220,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct raid1_info *mirror; struct bio *read_bio; struct bitmap *bitmap = mddev->bitmap; - const int op = bio_op(bio); - const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); + const enum req_op op = bio_op(bio); + const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; int max_sectors; int rdisk; bool r1bio_existed = !!r1_bio; @@ -1240,7 +1240,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, rcu_read_lock(); rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev); if (rdev) - bdevname(rdev->bdev, b); + snprintf(b, sizeof(b), "%pg", rdev->bdev); else strcpy(b, "???"); rcu_read_unlock(); @@ -1988,9 +1988,9 @@ static void end_sync_write(struct bio *bio) } static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, - int sectors, struct page *page, int rw) + int sectors, struct page *page, int rw) { - if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false)) + if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) /* success */ return 1; if (rw == WRITE) { @@ -2057,7 +2057,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev, sect, s<<9, pages[idx], - REQ_OP_READ, 0, false)) { + REQ_OP_READ, false)) { success = 1; break; } @@ -2305,7 +2305,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, atomic_inc(&rdev->nr_pending); rcu_read_unlock(); if (sync_page_io(rdev, sect, s<<9, - conf->tmppage, REQ_OP_READ, 0, false)) + conf->tmppage, REQ_OP_READ, false)) success = 1; rdev_dec_pending(rdev, mddev); if (success) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d589f823feb1..9117fcdee1be 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1136,8 +1136,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, { struct r10conf *conf = mddev->private; struct bio *read_bio; - const int op = bio_op(bio); - const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); + const enum req_op op = bio_op(bio); + const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; int max_sectors; struct md_rdev *rdev; char b[BDEVNAME_SIZE]; @@ -1164,7 +1164,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, disk = r10_bio->devs[slot].devnum; err_rdev = rcu_dereference(conf->mirrors[disk].rdev); if (err_rdev) - bdevname(err_rdev->bdev, b); + snprintf(b, sizeof(b), "%pg", err_rdev->bdev); else { strcpy(b, "???"); /* This never gets dereferenced */ @@ -1230,9 +1230,9 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, struct bio *bio, bool replacement, int n_copy) { - const int op = bio_op(bio); - const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); - const unsigned long do_fua = (bio->bi_opf & REQ_FUA); + const enum req_op op = bio_op(bio); + const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; + const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; unsigned long flags; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; @@ -2167,9 +2167,12 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) int err = 0; int number = rdev->raid_disk; struct md_rdev **rdevp; - struct raid10_info *p = conf->mirrors + number; + struct raid10_info *p; print_conf(conf); + if (unlikely(number >= mddev->raid_disks)) + return 0; + p = conf->mirrors + number; if (rdev == p->rdev) rdevp = &p->rdev; else if (rdev == p->replacement) @@ -2512,7 +2515,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) addr, s << 9, pages[idx], - REQ_OP_READ, 0, false); + REQ_OP_READ, false); if (ok) { rdev = conf->mirrors[dw].rdev; addr = r10_bio->devs[1].addr + sect; @@ -2520,7 +2523,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) addr, s << 9, pages[idx], - REQ_OP_WRITE, 0, false); + REQ_OP_WRITE, false); if (!ok) { set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, @@ -2644,7 +2647,7 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) return -1; - if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false)) + if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) /* success */ return 1; if (rw == WRITE) { @@ -2726,7 +2729,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 sect, s<<9, conf->tmppage, - REQ_OP_READ, 0, false); + REQ_OP_READ, false); rdev_dec_pending(rdev, mddev); rcu_read_lock(); if (success) @@ -5107,7 +5110,7 @@ static int handle_reshape_read_error(struct mddev *mddev, addr, s << 9, pages[idx], - REQ_OP_READ, 0, false); + REQ_OP_READ, false); rdev_dec_pending(rdev, mddev); rcu_read_lock(); if (success) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 83c184eddbda..f4e1cc1ece43 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1590,18 +1590,13 @@ void r5l_quiesce(struct r5l_log *log, int quiesce) bool r5l_log_disk_error(struct r5conf *conf) { - struct r5l_log *log; - bool ret; - /* don't allow write if journal disk is missing */ - rcu_read_lock(); - log = rcu_dereference(conf->log); + struct r5l_log *log = conf->log; + /* don't allow write if journal disk is missing */ if (!log) - ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); else - ret = test_bit(Faulty, &log->rdev->flags); - rcu_read_unlock(); - return ret; + return test_bit(Faulty, &log->rdev->flags); } #define R5L_RECOVERY_PAGE_POOL_SIZE 256 @@ -1788,7 +1783,7 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, mb = page_address(page); mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, mb, PAGE_SIZE)); - if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, + if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false)) { __free_page(page); return -EIO; @@ -1898,7 +1893,7 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf, atomic_inc(&rdev->nr_pending); rcu_read_unlock(); sync_page_io(rdev, sh->sector, PAGE_SIZE, - sh->dev[disk_index].page, REQ_OP_WRITE, 0, + sh->dev[disk_index].page, REQ_OP_WRITE, false); rdev_dec_pending(rdev, rdev->mddev); rcu_read_lock(); @@ -1908,7 +1903,7 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf, atomic_inc(&rrdev->nr_pending); rcu_read_unlock(); sync_page_io(rrdev, sh->sector, PAGE_SIZE, - sh->dev[disk_index].page, REQ_OP_WRITE, 0, + sh->dev[disk_index].page, REQ_OP_WRITE, false); rdev_dec_pending(rrdev, rrdev->mddev); rcu_read_lock(); @@ -2394,7 +2389,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, PAGE_SIZE)); kunmap_atomic(addr); sync_page_io(log->rdev, write_pos, PAGE_SIZE, - dev->page, REQ_OP_WRITE, 0, false); + dev->page, REQ_OP_WRITE, false); write_pos = r5l_ring_add(log, write_pos, BLOCK_SECTORS); offset += sizeof(__le32) + @@ -2406,7 +2401,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, mb, PAGE_SIZE)); sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, - REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false); + REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false); sh->log_start = ctx->pos; list_add_tail(&sh->r5c, &log->stripe_in_journal_list); atomic_inc(&log->stripe_in_journal_count); @@ -2534,12 +2529,13 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) struct r5conf *conf; int ret; - spin_lock(&mddev->lock); + ret = mddev_lock(mddev); + if (ret) + return ret; + conf = mddev->private; - if (!conf || !conf->log) { - spin_unlock(&mddev->lock); - return 0; - } + if (!conf || !conf->log) + goto out_unlock; switch (conf->log->r5c_journal_mode) { case R5C_JOURNAL_MODE_WRITE_THROUGH: @@ -2557,7 +2553,9 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) default: ret = 0; } - spin_unlock(&mddev->lock); + +out_unlock: + mddev_unlock(mddev); return ret; } @@ -2639,7 +2637,7 @@ int r5c_try_caching_write(struct r5conf *conf, int i; struct r5dev *dev; int to_cache = 0; - void **pslot; + void __rcu **pslot; sector_t tree_index; int ret; uintptr_t refcount; @@ -2806,7 +2804,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, int i; int do_wakeup = 0; sector_t tree_index; - void **pslot; + void __rcu **pslot; uintptr_t refcount; if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) @@ -2971,7 +2969,7 @@ static int r5l_load_log(struct r5l_log *log) if (!page) return -ENOMEM; - if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) { + if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, false)) { ret = -EIO; goto ioerr; } @@ -3145,7 +3143,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->stripe_in_journal_lock); atomic_set(&log->stripe_in_journal_count, 0); - rcu_assign_pointer(conf->log, log); + conf->log = log; set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; @@ -3167,13 +3165,13 @@ void r5l_exit_log(struct r5conf *conf) { struct r5l_log *log = conf->log; - conf->log = NULL; - synchronize_rcu(); - /* Ensure disable_writeback_work wakes up and exits */ wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); + + conf->log = NULL; + mempool_exit(&log->meta_pool); bioset_exit(&log->bs); mempool_exit(&log->io_pool); diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 43c714a8798c..c8332502669e 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -2,49 +2,46 @@ #ifndef _RAID5_LOG_H #define _RAID5_LOG_H -extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); -extern void r5l_exit_log(struct r5conf *conf); -extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); -extern void r5l_write_stripe_run(struct r5l_log *log); -extern void r5l_flush_stripe_to_raid(struct r5l_log *log); -extern void r5l_stripe_write_finished(struct stripe_head *sh); -extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); -extern void r5l_quiesce(struct r5l_log *log, int quiesce); -extern bool r5l_log_disk_error(struct r5conf *conf); -extern bool r5c_is_writeback(struct r5l_log *log); -extern int -r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s, int disks); -extern void -r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s); -extern void r5c_release_extra_page(struct stripe_head *sh); -extern void r5c_use_extra_page(struct stripe_head *sh); -extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space); -extern void r5c_handle_cached_data_endio(struct r5conf *conf, - struct stripe_head *sh, int disks); -extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh); -extern void r5c_make_stripe_write_out(struct stripe_head *sh); -extern void r5c_flush_cache(struct r5conf *conf, int num); -extern void r5c_check_stripe_cache_usage(struct r5conf *conf); -extern void r5c_check_cached_full_stripe(struct r5conf *conf); +int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); +void r5l_exit_log(struct r5conf *conf); +int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); +void r5l_write_stripe_run(struct r5l_log *log); +void r5l_flush_stripe_to_raid(struct r5l_log *log); +void r5l_stripe_write_finished(struct stripe_head *sh); +int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); +void r5l_quiesce(struct r5l_log *log, int quiesce); +bool r5l_log_disk_error(struct r5conf *conf); +bool r5c_is_writeback(struct r5l_log *log); +int r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks); +void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s); +void r5c_release_extra_page(struct stripe_head *sh); +void r5c_use_extra_page(struct stripe_head *sh); +void r5l_wake_reclaim(struct r5l_log *log, sector_t space); +void r5c_handle_cached_data_endio(struct r5conf *conf, + struct stripe_head *sh, int disks); +int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh); +void r5c_make_stripe_write_out(struct stripe_head *sh); +void r5c_flush_cache(struct r5conf *conf, int num); +void r5c_check_stripe_cache_usage(struct r5conf *conf); +void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; -extern void r5c_update_on_rdev_error(struct mddev *mddev, - struct md_rdev *rdev); -extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); -extern int r5l_start(struct r5l_log *log); +void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev); +bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); +int r5l_start(struct r5l_log *log); -extern struct dma_async_tx_descriptor * +struct dma_async_tx_descriptor * ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx); -extern int ppl_init_log(struct r5conf *conf); -extern void ppl_exit_log(struct r5conf *conf); -extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); -extern void ppl_write_stripe_run(struct r5conf *conf); -extern void ppl_stripe_write_finished(struct stripe_head *sh); -extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); -extern void ppl_quiesce(struct r5conf *conf, int quiesce); -extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio); +int ppl_init_log(struct r5conf *conf); +void ppl_exit_log(struct r5conf *conf); +int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); +void ppl_write_stripe_run(struct r5conf *conf); +void ppl_stripe_write_finished(struct stripe_head *sh); +int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); +void ppl_quiesce(struct r5conf *conf, int quiesce); +int ppl_handle_flush_request(struct bio *bio); extern struct md_sysfs_entry ppl_write_hint; static inline bool raid5_has_log(struct r5conf *conf) @@ -111,7 +108,7 @@ static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio) if (conf->log) ret = r5l_handle_flush_request(conf->log, bio); else if (raid5_has_ppl(conf)) - ret = ppl_handle_flush_request(conf->log, bio); + ret = ppl_handle_flush_request(bio); return ret; } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 0a2e4806b1ec..31b9157bc9ae 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -679,7 +679,7 @@ void ppl_quiesce(struct r5conf *conf, int quiesce) } } -int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio) +int ppl_handle_flush_request(struct bio *bio) { if (bio->bi_iter.bi_size == 0) { bio_endio(bio); @@ -897,7 +897,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, __func__, indent, "", rdev->bdev, (unsigned long long)sector); if (!sync_page_io(rdev, sector, block_size, page2, - REQ_OP_READ, 0, false)) { + REQ_OP_READ, false)) { md_error(mddev, rdev); pr_debug("%s:%*s read failed!\n", __func__, indent, ""); @@ -919,7 +919,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, (unsigned long long)(ppl_sector + i)); if (!sync_page_io(log->rdev, ppl_sector - log->rdev->data_offset + i, - block_size, page2, REQ_OP_READ, 0, + block_size, page2, REQ_OP_READ, false)) { pr_debug("%s:%*s read failed!\n", __func__, indent, ""); @@ -946,7 +946,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, (unsigned long long)parity_sector, parity_rdev->bdev); if (!sync_page_io(parity_rdev, parity_sector, block_size, - page1, REQ_OP_WRITE, 0, false)) { + page1, REQ_OP_WRITE, false)) { pr_debug("%s:%*s parity write error!\n", __func__, indent, ""); md_error(mddev, parity_rdev); @@ -998,7 +998,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size; if (!sync_page_io(rdev, sector - rdev->data_offset, - s, page, REQ_OP_READ, 0, false)) { + s, page, REQ_OP_READ, false)) { md_error(mddev, rdev); ret = -EIO; goto out; @@ -1062,7 +1062,7 @@ static int ppl_write_empty_header(struct ppl_log *log) if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset, PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_SYNC | - REQ_FUA, 0, false)) { + REQ_FUA, false)) { md_error(rdev->mddev, rdev); ret = -EIO; } @@ -1100,7 +1100,7 @@ static int ppl_load_distributed(struct ppl_log *log) if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset + pplhdr_offset, PAGE_SIZE, page, REQ_OP_READ, - 0, false)) { + false)) { md_error(mddev, rdev); ret = -EIO; /* if not able to read - don't recover any PPL */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 20e53b167f81..31a0cbf63384 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -61,6 +61,8 @@ #define cpu_to_group(cpu) cpu_to_node(cpu) #define ANY_GROUP NUMA_NO_NODE +#define RAID5_MAX_REQ_STRIPES 256 + static bool devices_handle_discard_safely = false; module_param(devices_handle_discard_safely, bool, 0644); MODULE_PARM_DESC(devices_handle_discard_safely, @@ -624,6 +626,49 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, return NULL; } +static struct stripe_head *find_get_stripe(struct r5conf *conf, + sector_t sector, short generation, int hash) +{ + int inc_empty_inactive_list_flag; + struct stripe_head *sh; + + sh = __find_stripe(conf, sector, generation); + if (!sh) + return NULL; + + if (atomic_inc_not_zero(&sh->count)) + return sh; + + /* + * Slow path. The reference count is zero which means the stripe must + * be on a list (sh->lru). Must remove the stripe from the list that + * references it with the device_lock held. + */ + + spin_lock(&conf->device_lock); + if (!atomic_read(&sh->count)) { + if (!test_bit(STRIPE_HANDLE, &sh->state)) + atomic_inc(&conf->active_stripes); + BUG_ON(list_empty(&sh->lru) && + !test_bit(STRIPE_EXPANDING, &sh->state)); + inc_empty_inactive_list_flag = 0; + if (!list_empty(conf->inactive_list + hash)) + inc_empty_inactive_list_flag = 1; + list_del_init(&sh->lru); + if (list_empty(conf->inactive_list + hash) && + inc_empty_inactive_list_flag) + atomic_inc(&conf->empty_inactive_list_nr); + if (sh->group) { + sh->group->stripes_cnt--; + sh->group = NULL; + } + } + atomic_inc(&sh->count); + spin_unlock(&conf->device_lock); + + return sh; +} + /* * Need to check if array has failed when deciding whether to: * - start an array @@ -710,80 +755,121 @@ static bool has_failed(struct r5conf *conf) return degraded > conf->max_degraded; } -struct stripe_head * -raid5_get_active_stripe(struct r5conf *conf, sector_t sector, - int previous, int noblock, int noquiesce) +enum stripe_result { + STRIPE_SUCCESS = 0, + STRIPE_RETRY, + STRIPE_SCHEDULE_AND_RETRY, + STRIPE_FAIL, +}; + +struct stripe_request_ctx { + /* a reference to the last stripe_head for batching */ + struct stripe_head *batch_last; + + /* first sector in the request */ + sector_t first_sector; + + /* last sector in the request */ + sector_t last_sector; + + /* + * bitmap to track stripe sectors that have been added to stripes + * add one to account for unaligned requests + */ + DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1); + + /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ + bool do_flush; +}; + +/* + * Block until another thread clears R5_INACTIVE_BLOCKED or + * there are fewer than 3/4 the maximum number of active stripes + * and there is an inactive stripe available. + */ +static bool is_inactive_blocked(struct r5conf *conf, int hash) +{ + int active = atomic_read(&conf->active_stripes); + + if (list_empty(conf->inactive_list + hash)) + return false; + + if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) + return true; + + return active < (conf->max_nr_stripes * 3 / 4); +} + +static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf, + struct stripe_request_ctx *ctx, sector_t sector, + bool previous, bool noblock, bool noquiesce) { struct stripe_head *sh; int hash = stripe_hash_locks_hash(conf, sector); - int inc_empty_inactive_list_flag; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); spin_lock_irq(conf->hash_locks + hash); - do { - wait_event_lock_irq(conf->wait_for_quiescent, - conf->quiesce == 0 || noquiesce, +retry: + if (!noquiesce && conf->quiesce) { + /* + * Must release the reference to batch_last before waiting, + * on quiesce, otherwise the batch_last will hold a reference + * to a stripe and raid5_quiesce() will deadlock waiting for + * active_stripes to go to zero. + */ + if (ctx && ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last = NULL; + } + + wait_event_lock_irq(conf->wait_for_quiescent, !conf->quiesce, *(conf->hash_locks + hash)); - sh = __find_stripe(conf, sector, conf->generation - previous); - if (!sh) { - if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { - sh = get_free_stripe(conf, hash); - if (!sh && !test_bit(R5_DID_ALLOC, - &conf->cache_state)) - set_bit(R5_ALLOC_MORE, - &conf->cache_state); - } - if (noblock && sh == NULL) - break; + } - r5c_check_stripe_cache_usage(conf); - if (!sh) { - set_bit(R5_INACTIVE_BLOCKED, - &conf->cache_state); - r5l_wake_reclaim(conf->log, 0); - wait_event_lock_irq( - conf->wait_for_stripe, - !list_empty(conf->inactive_list + hash) && - (atomic_read(&conf->active_stripes) - < (conf->max_nr_stripes * 3 / 4) - || !test_bit(R5_INACTIVE_BLOCKED, - &conf->cache_state)), - *(conf->hash_locks + hash)); - clear_bit(R5_INACTIVE_BLOCKED, - &conf->cache_state); - } else { - init_stripe(sh, sector, previous); - atomic_inc(&sh->count); - } - } else if (!atomic_inc_not_zero(&sh->count)) { - spin_lock(&conf->device_lock); - if (!atomic_read(&sh->count)) { - if (!test_bit(STRIPE_HANDLE, &sh->state)) - atomic_inc(&conf->active_stripes); - BUG_ON(list_empty(&sh->lru) && - !test_bit(STRIPE_EXPANDING, &sh->state)); - inc_empty_inactive_list_flag = 0; - if (!list_empty(conf->inactive_list + hash)) - inc_empty_inactive_list_flag = 1; - list_del_init(&sh->lru); - if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) - atomic_inc(&conf->empty_inactive_list_nr); - if (sh->group) { - sh->group->stripes_cnt--; - sh->group = NULL; - } - } - atomic_inc(&sh->count); - spin_unlock(&conf->device_lock); - } - } while (sh == NULL); + sh = find_get_stripe(conf, sector, conf->generation - previous, hash); + if (sh) + goto out; + + if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) + goto wait_for_stripe; + + sh = get_free_stripe(conf, hash); + if (sh) { + r5c_check_stripe_cache_usage(conf); + init_stripe(sh, sector, previous); + atomic_inc(&sh->count); + goto out; + } + + if (!test_bit(R5_DID_ALLOC, &conf->cache_state)) + set_bit(R5_ALLOC_MORE, &conf->cache_state); + +wait_for_stripe: + if (noblock) + goto out; + set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + r5l_wake_reclaim(conf->log, 0); + wait_event_lock_irq(conf->wait_for_stripe, + is_inactive_blocked(conf, hash), + *(conf->hash_locks + hash)); + clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + goto retry; + +out: spin_unlock_irq(conf->hash_locks + hash); return sh; } +struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, + sector_t sector, bool previous, bool noblock, bool noquiesce) +{ + return __raid5_get_active_stripe(conf, NULL, sector, previous, noblock, + noquiesce); +} + static bool is_full_stripe_write(struct stripe_head *sh) { BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); @@ -824,13 +910,13 @@ static bool stripe_can_batch(struct stripe_head *sh) } /* we only do back search */ -static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) +static void stripe_add_to_batch_list(struct r5conf *conf, + struct stripe_head *sh, struct stripe_head *last_sh) { struct stripe_head *head; sector_t head_sector, tmp_sec; int hash; int dd_idx; - int inc_empty_inactive_list_flag; /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ tmp_sec = sh->sector; @@ -838,36 +924,20 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh return; head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf); - hash = stripe_hash_locks_hash(conf, head_sector); - spin_lock_irq(conf->hash_locks + hash); - head = __find_stripe(conf, head_sector, conf->generation); - if (head && !atomic_inc_not_zero(&head->count)) { - spin_lock(&conf->device_lock); - if (!atomic_read(&head->count)) { - if (!test_bit(STRIPE_HANDLE, &head->state)) - atomic_inc(&conf->active_stripes); - BUG_ON(list_empty(&head->lru) && - !test_bit(STRIPE_EXPANDING, &head->state)); - inc_empty_inactive_list_flag = 0; - if (!list_empty(conf->inactive_list + hash)) - inc_empty_inactive_list_flag = 1; - list_del_init(&head->lru); - if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) - atomic_inc(&conf->empty_inactive_list_nr); - if (head->group) { - head->group->stripes_cnt--; - head->group = NULL; - } - } + if (last_sh && head_sector == last_sh->sector) { + head = last_sh; atomic_inc(&head->count); - spin_unlock(&conf->device_lock); + } else { + hash = stripe_hash_locks_hash(conf, head_sector); + spin_lock_irq(conf->hash_locks + hash); + head = find_get_stripe(conf, head_sector, conf->generation, + hash); + spin_unlock_irq(conf->hash_locks + hash); + if (!head) + return; + if (!stripe_can_batch(head)) + goto out; } - spin_unlock_irq(conf->hash_locks + hash); - - if (!head) - return; - if (!stripe_can_batch(head)) - goto out; lock_two_stripes(head, sh); /* clear_batch_ready clear the flag */ @@ -1082,7 +1152,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) should_defer = conf->batch_bio_dispatch && conf->group_cnt; for (i = disks; i--; ) { - int op, op_flags = 0; + enum req_op op; + blk_opf_t op_flags = 0; int replace_only = 0; struct bio *bi, *rbi; struct md_rdev *rdev, *rrdev = NULL; @@ -2881,10 +2952,10 @@ static void raid5_end_write_request(struct bio *bi) if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - raid5_release_stripe(sh); if (sh->batch_head && sh != sh->batch_head) raid5_release_stripe(sh->batch_head); + raid5_release_stripe(sh); } static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) @@ -3412,39 +3483,32 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked, s->ops_request); } -/* - * Each stripe/dev can have one or more bion attached. - * toread/towrite point to the first in a chain. - * The bi_next chain must be in order. - */ -static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, - int forwrite, int previous) +static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite) { - struct bio **bip; struct r5conf *conf = sh->raid_conf; - int firstwrite=0; + struct bio **bip; - pr_debug("adding bi b#%llu to stripe s#%llu\n", - (unsigned long long)bi->bi_iter.bi_sector, - (unsigned long long)sh->sector); + pr_debug("checking bi b#%llu to stripe s#%llu\n", + bi->bi_iter.bi_sector, sh->sector); - spin_lock_irq(&sh->stripe_lock); /* Don't allow new IO added to stripes in batch list */ if (sh->batch_head) - goto overlap; - if (forwrite) { + return true; + + if (forwrite) bip = &sh->dev[dd_idx].towrite; - if (*bip == NULL) - firstwrite = 1; - } else + else bip = &sh->dev[dd_idx].toread; + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) - goto overlap; - bip = & (*bip)->bi_next; + return true; + bip = &(*bip)->bi_next; } + if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) - goto overlap; + return true; if (forwrite && raid5_has_ppl(conf)) { /* @@ -3473,9 +3537,30 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, } if (first + conf->chunk_sectors * (count - 1) != last) - goto overlap; + return true; + } + + return false; +} + +static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite, int previous) +{ + struct r5conf *conf = sh->raid_conf; + struct bio **bip; + int firstwrite = 0; + + if (forwrite) { + bip = &sh->dev[dd_idx].towrite; + if (!*bip) + firstwrite = 1; + } else { + bip = &sh->dev[dd_idx].toread; } + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) + bip = &(*bip)->bi_next; + if (!forwrite || previous) clear_bit(STRIPE_BATCH_READY, &sh->state); @@ -3501,9 +3586,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, sh->overwrite_disks++; } - pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", - (unsigned long long)(*bip)->bi_iter.bi_sector, - (unsigned long long)sh->sector, dd_idx); + pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n", + (*bip)->bi_iter.bi_sector, sh->sector, dd_idx, + sh->dev[dd_idx].sector); if (conf->mddev->bitmap && firstwrite) { /* Cannot hold spinlock over bitmap_startwrite, @@ -3511,7 +3596,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, * we have added to the bitmap and set bm_seq. * So set STRIPE_BITMAP_PENDING to prevent * batching. - * If multiple add_stripe_bio() calls race here they + * If multiple __add_stripe_bio() calls race here they * much all set STRIPE_BITMAP_PENDING. So only the first one * to complete "bitmap_startwrite" gets to set * STRIPE_BIT_DELAY. This is important as once a stripe @@ -3529,16 +3614,27 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, set_bit(STRIPE_BIT_DELAY, &sh->state); } } - spin_unlock_irq(&sh->stripe_lock); +} - if (stripe_can_batch(sh)) - stripe_add_to_batch_list(conf, sh); - return 1; +/* + * Each stripe/dev can have one or more bios attached. + * toread/towrite point to the first in a chain. + * The bi_next chain must be in order. + */ +static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite, int previous) +{ + spin_lock_irq(&sh->stripe_lock); + + if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { + set_bit(R5_Overlap, &sh->dev[dd_idx].flags); + spin_unlock_irq(&sh->stripe_lock); + return false; + } - overlap: - set_bit(R5_Overlap, &sh->dev[dd_idx].flags); + __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); spin_unlock_irq(&sh->stripe_lock); - return 0; + return true; } static void end_reshape(struct r5conf *conf); @@ -5784,17 +5880,215 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) bio_endio(bi); } -static bool raid5_make_request(struct mddev *mddev, struct bio * bi) +static bool ahead_of_reshape(struct mddev *mddev, sector_t sector, + sector_t reshape_sector) { - struct r5conf *conf = mddev->private; + return mddev->reshape_backwards ? sector < reshape_sector : + sector >= reshape_sector; +} + +static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min, + sector_t max, sector_t reshape_sector) +{ + return mddev->reshape_backwards ? max < reshape_sector : + min >= reshape_sector; +} + +static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf, + struct stripe_head *sh) +{ + sector_t max_sector = 0, min_sector = MaxSector; + bool ret = false; int dd_idx; - sector_t new_sector; - sector_t logical_sector, last_sector; + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + if (dd_idx == sh->pd_idx) + continue; + + min_sector = min(min_sector, sh->dev[dd_idx].sector); + max_sector = min(max_sector, sh->dev[dd_idx].sector); + } + + spin_lock_irq(&conf->device_lock); + + if (!range_ahead_of_reshape(mddev, min_sector, max_sector, + conf->reshape_progress)) + /* mismatch, need to try again */ + ret = true; + + spin_unlock_irq(&conf->device_lock); + + return ret; +} + +static int add_all_stripe_bios(struct r5conf *conf, + struct stripe_request_ctx *ctx, struct stripe_head *sh, + struct bio *bi, int forwrite, int previous) +{ + int dd_idx; + int ret = 1; + + spin_lock_irq(&sh->stripe_lock); + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + struct r5dev *dev = &sh->dev[dd_idx]; + + if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + continue; + + if (dev->sector < ctx->first_sector || + dev->sector >= ctx->last_sector) + continue; + + if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { + set_bit(R5_Overlap, &dev->flags); + ret = 0; + continue; + } + } + + if (!ret) + goto out; + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + struct r5dev *dev = &sh->dev[dd_idx]; + + if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + continue; + + if (dev->sector < ctx->first_sector || + dev->sector >= ctx->last_sector) + continue; + + __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); + clear_bit((dev->sector - ctx->first_sector) >> + RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do); + } + +out: + spin_unlock_irq(&sh->stripe_lock); + return ret; +} + +static enum stripe_result make_stripe_request(struct mddev *mddev, + struct r5conf *conf, struct stripe_request_ctx *ctx, + sector_t logical_sector, struct bio *bi) +{ + const int rw = bio_data_dir(bi); + enum stripe_result ret; struct stripe_head *sh; + sector_t new_sector; + int previous = 0; + int seq, dd_idx; + + seq = read_seqcount_begin(&conf->gen_lock); + + if (unlikely(conf->reshape_progress != MaxSector)) { + /* + * Spinlock is needed as reshape_progress may be + * 64bit on a 32bit platform, and so it might be + * possible to see a half-updated value + * Of course reshape_progress could change after + * the lock is dropped, so once we get a reference + * to the stripe that we think it is, we will have + * to check again. + */ + spin_lock_irq(&conf->device_lock); + if (ahead_of_reshape(mddev, logical_sector, + conf->reshape_progress)) { + previous = 1; + } else { + if (ahead_of_reshape(mddev, logical_sector, + conf->reshape_safe)) { + spin_unlock_irq(&conf->device_lock); + return STRIPE_SCHEDULE_AND_RETRY; + } + } + spin_unlock_irq(&conf->device_lock); + } + + new_sector = raid5_compute_sector(conf, logical_sector, previous, + &dd_idx, NULL); + pr_debug("raid456: %s, sector %llu logical %llu\n", __func__, + new_sector, logical_sector); + + sh = __raid5_get_active_stripe(conf, ctx, new_sector, previous, + (bi->bi_opf & REQ_RAHEAD), 0); + if (unlikely(!sh)) { + /* cannot get stripe, just give-up */ + bi->bi_status = BLK_STS_IOERR; + return STRIPE_FAIL; + } + + if (unlikely(previous) && + stripe_ahead_of_reshape(mddev, conf, sh)) { + /* + * Expansion moved on while waiting for a stripe. + * Expansion could still move past after this + * test, but as we are holding a reference to + * 'sh', we know that if that happens, + * STRIPE_EXPANDING will get set and the expansion + * won't proceed until we finish with the stripe. + */ + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out_release; + } + + if (read_seqcount_retry(&conf->gen_lock, seq)) { + /* Might have got the wrong stripe_head by accident */ + ret = STRIPE_RETRY; + goto out_release; + } + + if (test_bit(STRIPE_EXPANDING, &sh->state) || + !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { + /* + * Stripe is busy expanding or add failed due to + * overlap. Flush everything and wait a while. + */ + md_wakeup_thread(mddev->thread); + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out_release; + } + + if (stripe_can_batch(sh)) { + stripe_add_to_batch_list(conf, sh, ctx->batch_last); + if (ctx->batch_last) + raid5_release_stripe(ctx->batch_last); + atomic_inc(&sh->count); + ctx->batch_last = sh; + } + + if (ctx->do_flush) { + set_bit(STRIPE_R5C_PREFLUSH, &sh->state); + /* we only need flush for one stripe */ + ctx->do_flush = false; + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if ((!sh->batch_head || sh == sh->batch_head) && + (bi->bi_opf & REQ_SYNC) && + !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + + release_stripe_plug(mddev, sh); + return STRIPE_SUCCESS; + +out_release: + raid5_release_stripe(sh); + return ret; +} + +static bool raid5_make_request(struct mddev *mddev, struct bio * bi) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct r5conf *conf = mddev->private; + sector_t logical_sector; + struct stripe_request_ctx ctx = {}; const int rw = bio_data_dir(bi); - DEFINE_WAIT(w); - bool do_prepare; - bool do_flush = false; + enum stripe_result res; + int s, stripe_cnt; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -5810,7 +6104,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, * we need to flush journal device */ - do_flush = bi->bi_opf & REQ_PREFLUSH; + ctx.do_flush = bi->bi_opf & REQ_PREFLUSH; } if (!md_write_start(mddev, bi)) @@ -5834,134 +6128,68 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) } logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); - last_sector = bio_end_sector(bi); + ctx.first_sector = logical_sector; + ctx.last_sector = bio_end_sector(bi); bi->bi_next = NULL; + stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector, + RAID5_STRIPE_SECTORS(conf)); + bitmap_set(ctx.sectors_to_do, 0, stripe_cnt); + + pr_debug("raid456: %s, logical %llu to %llu\n", __func__, + bi->bi_iter.bi_sector, ctx.last_sector); + /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && (conf->reshape_progress != MaxSector) && - (mddev->reshape_backwards - ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe) - : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) { + !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) && + ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { bio_wouldblock_error(bi); if (rw == WRITE) md_write_end(mddev); return true; } md_account_bio(mddev, &bi); - prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { - int previous; - int seq; - - do_prepare = false; - retry: - seq = read_seqcount_begin(&conf->gen_lock); - previous = 0; - if (do_prepare) - prepare_to_wait(&conf->wait_for_overlap, &w, - TASK_UNINTERRUPTIBLE); - if (unlikely(conf->reshape_progress != MaxSector)) { - /* spinlock is needed as reshape_progress may be - * 64bit on a 32bit platform, and so it might be - * possible to see a half-updated value - * Of course reshape_progress could change after - * the lock is dropped, so once we get a reference - * to the stripe that we think it is, we will have - * to check again. - */ - spin_lock_irq(&conf->device_lock); - if (mddev->reshape_backwards - ? logical_sector < conf->reshape_progress - : logical_sector >= conf->reshape_progress) { - previous = 1; - } else { - if (mddev->reshape_backwards - ? logical_sector < conf->reshape_safe - : logical_sector >= conf->reshape_safe) { - spin_unlock_irq(&conf->device_lock); - schedule(); - do_prepare = true; - goto retry; - } - } - spin_unlock_irq(&conf->device_lock); - } - new_sector = raid5_compute_sector(conf, logical_sector, - previous, - &dd_idx, NULL); - pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", - (unsigned long long)new_sector, - (unsigned long long)logical_sector); + add_wait_queue(&conf->wait_for_overlap, &wait); + while (1) { + res = make_stripe_request(mddev, conf, &ctx, logical_sector, + bi); + if (res == STRIPE_FAIL) + break; - sh = raid5_get_active_stripe(conf, new_sector, previous, - (bi->bi_opf & REQ_RAHEAD), 0); - if (sh) { - if (unlikely(previous)) { - /* expansion might have moved on while waiting for a - * stripe, so we must do the range check again. - * Expansion could still move past after this - * test, but as we are holding a reference to - * 'sh', we know that if that happens, - * STRIPE_EXPANDING will get set and the expansion - * won't proceed until we finish with the stripe. - */ - int must_retry = 0; - spin_lock_irq(&conf->device_lock); - if (mddev->reshape_backwards - ? logical_sector >= conf->reshape_progress - : logical_sector < conf->reshape_progress) - /* mismatch, need to try again */ - must_retry = 1; - spin_unlock_irq(&conf->device_lock); - if (must_retry) { - raid5_release_stripe(sh); - schedule(); - do_prepare = true; - goto retry; - } - } - if (read_seqcount_retry(&conf->gen_lock, seq)) { - /* Might have got the wrong stripe_head - * by accident - */ - raid5_release_stripe(sh); - goto retry; - } + if (res == STRIPE_RETRY) + continue; - if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { - /* Stripe is busy expanding or - * add failed due to overlap. Flush everything - * and wait a while - */ - md_wakeup_thread(mddev->thread); - raid5_release_stripe(sh); - schedule(); - do_prepare = true; - goto retry; - } - if (do_flush) { - set_bit(STRIPE_R5C_PREFLUSH, &sh->state); - /* we only need flush for one stripe */ - do_flush = false; + if (res == STRIPE_SCHEDULE_AND_RETRY) { + /* + * Must release the reference to batch_last before + * scheduling and waiting for work to be done, + * otherwise the batch_last stripe head could prevent + * raid5_activate_delayed() from making progress + * and thus deadlocking. + */ + if (ctx.batch_last) { + raid5_release_stripe(ctx.batch_last); + ctx.batch_last = NULL; } - set_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); - if ((!sh->batch_head || sh == sh->batch_head) && - (bi->bi_opf & REQ_SYNC) && - !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - atomic_inc(&conf->preread_active_stripes); - release_stripe_plug(mddev, sh); - } else { - /* cannot get stripe for read-ahead, just give-up */ - bi->bi_status = BLK_STS_IOERR; - break; + wait_woken(&wait, TASK_UNINTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); + continue; } + + s = find_first_bit(ctx.sectors_to_do, stripe_cnt); + if (s == stripe_cnt) + break; + + logical_sector = ctx.first_sector + + (s << RAID5_STRIPE_SHIFT(conf)); } - finish_wait(&conf->wait_for_overlap, &w); + remove_wait_queue(&conf->wait_for_overlap, &wait); + + if (ctx.batch_last) + raid5_release_stripe(ctx.batch_last); if (rw == WRITE) md_write_end(mddev); @@ -7304,7 +7532,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; conf->mddev = mddev; - if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) + ret = -ENOMEM; + conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!conf->stripe_hashtbl) goto abort; /* We init hash_locks[0] separately to that it can be used @@ -7414,7 +7644,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->shrinker.count_objects = raid5_cache_count; conf->shrinker.batch = 128; conf->shrinker.flags = 0; - ret = register_shrinker(&conf->shrinker); + ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev)); if (ret) { pr_warn("md/raid:%s: couldn't register shrinker.\n", mdname(mddev)); @@ -7812,7 +8042,15 @@ static int raid5_run(struct mddev *mddev) mddev->queue->limits.discard_granularity < stripe) blk_queue_max_discard_sectors(mddev->queue, 0); - blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); + /* + * Requests require having a bitmap for each stripe. + * Limit the max sectors based on this. + */ + blk_queue_max_hw_sectors(mddev->queue, + RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf)); + + /* No restrictions on the number of segments in the request */ + blk_queue_max_segments(mddev->queue, USHRT_MAX); } if (log_init(conf, journal_dev, raid5_has_ppl(conf))) @@ -8063,8 +8301,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) * find the disk ... but prefer rdev->saved_raid_disk * if possible. */ - if (rdev->saved_raid_disk >= 0 && - rdev->saved_raid_disk >= first && + if (rdev->saved_raid_disk >= first && rdev->saved_raid_disk <= last && conf->disks[rdev->saved_raid_disk].rdev == NULL) first = rdev->saved_raid_disk; @@ -8701,8 +8938,11 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) err = log_init(conf, NULL, true); if (!err) { err = resize_stripes(conf, conf->pool_size); - if (err) + if (err) { + mddev_suspend(mddev); log_exit(conf); + mddev_resume(mddev); + } } } else err = -EINVAL; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 638d29863503..a5082bed83c8 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -812,7 +812,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, struct stripe_head *sh); extern struct stripe_head * raid5_get_active_stripe(struct r5conf *conf, sector_t sector, - int previous, int noblock, int noquiesce); + bool previous, bool noblock, bool noquiesce); extern int raid5_calc_degraded(struct r5conf *conf); extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); #endif |