Diffstat (limited to 'drivers/md')
39 files changed, 824 insertions, 283 deletions
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 098bf526136c..23e0b71b991e 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -529,9 +529,6 @@ static struct dm_buffer *list_to_buffer(struct list_head *l) { struct lru_entry *le = list_entry(l, struct lru_entry, list); - if (!le) - return NULL; - return le_to_buffer(le); } @@ -2474,7 +2471,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign int r; unsigned int num_locks; struct dm_bufio_client *c; - char slab_name[27]; + char slab_name[64]; + static atomic_t seqno = ATOMIC_INIT(0); if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) { DMERR("%s: block size not specified or is not multiple of 512b", __func__); @@ -2525,7 +2523,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign (block_size < PAGE_SIZE || !is_power_of_2(block_size))) { unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE); - snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u", block_size); + snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u-%u", + block_size, atomic_inc_return(&seqno)); c->slab_cache = kmem_cache_create(slab_name, block_size, align, SLAB_RECLAIM_ACCOUNT, NULL); if (!c->slab_cache) { @@ -2534,9 +2533,11 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign } } if (aux_size) - snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u", aux_size); + snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u-%u", + aux_size, atomic_inc_return(&seqno)); else - snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer"); + snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u", + atomic_inc_return(&seqno)); c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size, 0, SLAB_RECLAIM_ACCOUNT, NULL); if (!c->slab_buffer) { diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c index 9c5308298cf1..f3051bd7d2df 100644 --- a/drivers/md/dm-cache-background-tracker.c +++ b/drivers/md/dm-cache-background-tracker.c @@ -11,12 +11,6 @@ #define DM_MSG_PREFIX "dm-background-tracker" -struct bt_work { - struct list_head list; - struct rb_node node; - struct policy_work work; -}; - struct background_tracker { unsigned int max_work; atomic_t pending_promotes; @@ -26,10 +20,10 @@ struct background_tracker { struct list_head issued; struct list_head queued; struct rb_root pending; - - struct kmem_cache *work_cache; }; +struct kmem_cache *btracker_work_cache = NULL; + struct background_tracker *btracker_create(unsigned int max_work) { struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL); @@ -48,12 +42,6 @@ struct background_tracker *btracker_create(unsigned int max_work) INIT_LIST_HEAD(&b->queued); b->pending = RB_ROOT; - b->work_cache = KMEM_CACHE(bt_work, 0); - if (!b->work_cache) { - DMERR("couldn't create mempool for background work items"); - kfree(b); - b = NULL; - } return b; } @@ -66,10 +54,9 @@ void btracker_destroy(struct background_tracker *b) BUG_ON(!list_empty(&b->issued)); list_for_each_entry_safe (w, tmp, &b->queued, list) { list_del(&w->list); - kmem_cache_free(b->work_cache, w); + kmem_cache_free(btracker_work_cache, w); } - kmem_cache_destroy(b->work_cache); kfree(b); } EXPORT_SYMBOL_GPL(btracker_destroy); @@ -180,7 +167,7 @@ static struct bt_work *alloc_work(struct background_tracker *b) if (max_work_reached(b)) return NULL; - return kmem_cache_alloc(b->work_cache, GFP_NOWAIT); + return 
kmem_cache_alloc(btracker_work_cache, GFP_NOWAIT); } int btracker_queue(struct background_tracker *b, @@ -203,7 +190,7 @@ int btracker_queue(struct background_tracker *b, * There was a race, we'll just ignore this second * bit of work for the same oblock. */ - kmem_cache_free(b->work_cache, w); + kmem_cache_free(btracker_work_cache, w); return -EINVAL; } @@ -244,7 +231,7 @@ void btracker_complete(struct background_tracker *b, update_stats(b, &w->work, -1); rb_erase(&w->node, &b->pending); list_del(&w->list); - kmem_cache_free(b->work_cache, w); + kmem_cache_free(btracker_work_cache, w); } EXPORT_SYMBOL_GPL(btracker_complete); diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h index 5b8f5c667b81..09c8fc59f7bb 100644 --- a/drivers/md/dm-cache-background-tracker.h +++ b/drivers/md/dm-cache-background-tracker.h @@ -26,6 +26,14 @@ * protected with a spinlock. */ +struct bt_work { + struct list_head list; + struct rb_node node; + struct policy_work work; +}; + +extern struct kmem_cache *btracker_work_cache; + struct background_work; struct background_tracker; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 17f0fab1e254..9cb797a561d6 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -10,6 +10,7 @@ #include "dm-bio-record.h" #include "dm-cache-metadata.h" #include "dm-io-tracker.h" +#include "dm-cache-background-tracker.h" #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> @@ -1368,7 +1369,7 @@ static void mg_copy(struct work_struct *ws) */ bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio); - BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */ + BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */ mg->overwrite_bio = NULL; inc_io_migrations(mg->cache); mg_full_copy(ws); @@ -1905,16 +1906,13 @@ static void check_migrations(struct work_struct *ws) * This function gets called on the error paths of the constructor, so we * have to cope with a partially initialised struct. 
*/ -static void destroy(struct cache *cache) +static void __destroy(struct cache *cache) { - unsigned int i; - mempool_exit(&cache->migration_pool); if (cache->prison) dm_bio_prison_destroy_v2(cache->prison); - cancel_delayed_work_sync(&cache->waker); if (cache->wq) destroy_workqueue(cache->wq); @@ -1942,13 +1940,22 @@ static void destroy(struct cache *cache) if (cache->policy) dm_cache_policy_destroy(cache->policy); + bioset_exit(&cache->bs); + + kfree(cache); +} + +static void destroy(struct cache *cache) +{ + unsigned int i; + + cancel_delayed_work_sync(&cache->waker); + for (i = 0; i < cache->nr_ctr_args ; i++) kfree(cache->ctr_args[i]); kfree(cache->ctr_args); - bioset_exit(&cache->bs); - - kfree(cache); + __destroy(cache); } static void cache_dtr(struct dm_target *ti) @@ -2003,7 +2010,6 @@ struct cache_args { sector_t cache_sectors; struct dm_dev *origin_dev; - sector_t origin_sectors; uint32_t block_size; @@ -2084,6 +2090,7 @@ static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, char **error) { + sector_t origin_sectors; int r; if (!at_least_one_arg(as, error)) @@ -2096,8 +2103,8 @@ static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, return r; } - ca->origin_sectors = get_dev_size(ca->origin_dev); - if (ca->ti->len > ca->origin_sectors) { + origin_sectors = get_dev_size(ca->origin_dev); + if (ca->ti->len > origin_sectors) { *error = "Device size larger than cached device"; return -EINVAL; } @@ -2257,7 +2264,7 @@ static int parse_cache_args(struct cache_args *ca, int argc, char **argv, /*----------------------------------------------------------------*/ -static struct kmem_cache *migration_cache; +static struct kmem_cache *migration_cache = NULL; #define NOT_CORE_OPTION 1 @@ -2407,7 +2414,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; - origin_blocks = cache->origin_sectors = ca->origin_sectors; + origin_blocks = cache->origin_sectors = ti->len; origin_blocks = block_div(origin_blocks, ca->block_size); cache->origin_blocks = to_oblock(origin_blocks); @@ -2561,7 +2568,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) *result = cache; return 0; bad: - destroy(cache); + __destroy(cache); return r; } @@ -2612,7 +2619,7 @@ static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv) r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); if (r) { - destroy(cache); + __destroy(cache); goto out; } @@ -2895,19 +2902,19 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache) static bool can_resize(struct cache *cache, dm_cblock_t new_size) { if (from_cblock(new_size) > from_cblock(cache->cache_size)) { - if (cache->sized) { - DMERR("%s: unable to extend cache due to missing cache table reload", - cache_device_name(cache)); - return false; - } + DMERR("%s: unable to extend cache due to missing cache table reload", + cache_device_name(cache)); + return false; } /* * We can't drop a dirty block when shrinking the cache. 
*/ - while (from_cblock(new_size) < from_cblock(cache->cache_size)) { - new_size = to_cblock(from_cblock(new_size) + 1); - if (is_dirty(cache, new_size)) { + if (cache->loaded_mappings) { + new_size = to_cblock(find_next_bit(cache->dirty_bitset, + from_cblock(cache->cache_size), + from_cblock(new_size))); + if (new_size != cache->cache_size) { DMERR("%s: unable to shrink cache; cache block %llu is dirty", cache_device_name(cache), (unsigned long long) from_cblock(new_size)); @@ -2943,20 +2950,15 @@ static int cache_preresume(struct dm_target *ti) /* * Check to see if the cache has resized. */ - if (!cache->sized) { - r = resize_cache_dev(cache, csize); - if (r) - return r; - - cache->sized = true; - - } else if (csize != cache->cache_size) { + if (!cache->sized || csize != cache->cache_size) { if (!can_resize(cache, csize)) return -EINVAL; r = resize_cache_dev(cache, csize); if (r) return r; + + cache->sized = true; } if (!cache->loaded_mappings) { @@ -3200,8 +3202,6 @@ static int parse_cblock_range(struct cache *cache, const char *str, * Try and parse form (ii) first. */ r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); - if (r < 0) - return r; if (r == 2) { result->begin = to_cblock(b); @@ -3213,8 +3213,6 @@ static int parse_cblock_range(struct cache *cache, const char *str, * That didn't work, try form (i). */ r = sscanf(str, "%llu%c", &b, &dummy); - if (r < 0) - return r; if (r == 1) { result->begin = to_cblock(b); @@ -3364,7 +3362,7 @@ static int cache_iterate_devices(struct dm_target *ti, static void disable_passdown_if_not_supported(struct cache *cache) { struct block_device *origin_bdev = cache->origin_dev->bdev; - struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; + struct queue_limits *origin_limits = bdev_limits(origin_bdev); const char *reason = NULL; if (!cache->features.discard_passdown) @@ -3386,7 +3384,7 @@ static void disable_passdown_if_not_supported(struct cache *cache) static void set_discard_limits(struct cache *cache, struct queue_limits *limits) { struct block_device *origin_bdev = cache->origin_dev->bdev; - struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; + struct queue_limits *origin_limits = bdev_limits(origin_bdev); if (!cache->features.discard_passdown) { /* No passdown is done so setting own virtual limits */ @@ -3448,22 +3446,36 @@ static int __init dm_cache_init(void) int r; migration_cache = KMEM_CACHE(dm_cache_migration, 0); - if (!migration_cache) - return -ENOMEM; + if (!migration_cache) { + r = -ENOMEM; + goto err; + } + + btracker_work_cache = kmem_cache_create("dm_cache_bt_work", + sizeof(struct bt_work), __alignof__(struct bt_work), 0, NULL); + if (!btracker_work_cache) { + r = -ENOMEM; + goto err; + } r = dm_register_target(&cache_target); if (r) { - kmem_cache_destroy(migration_cache); - return r; + goto err; } return 0; + +err: + kmem_cache_destroy(migration_cache); + kmem_cache_destroy(btracker_work_cache); + return r; } static void __exit dm_cache_exit(void) { dm_unregister_target(&cache_target); kmem_cache_destroy(migration_cache); + kmem_cache_destroy(btracker_work_cache); } module_init(dm_cache_init); diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c index 2db84cd2202b..14c5c28d938b 100644 --- a/drivers/md/dm-clone-metadata.c +++ b/drivers/md/dm-clone-metadata.c @@ -530,10 +530,7 @@ static int __load_bitset_in_core(struct dm_clone_metadata *cmd) return r; for (i = 0; ; i++) { - if (dm_bitset_cursor_get_value(&c)) - __set_bit(i, cmd->region_map); - else - __clear_bit(i, 
cmd->region_map); + __assign_bit(i, cmd->region_map, dm_bitset_cursor_get_value(&c)); if (i >= (cmd->nr_regions - 1)) break; diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 12bbe487a4c8..e956d980672c 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -2020,7 +2020,7 @@ static void clone_resume(struct dm_target *ti) static void disable_passdown_if_not_supported(struct clone *clone) { struct block_device *dest_dev = clone->dest_dev->bdev; - struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits; + struct queue_limits *dest_limits = bdev_limits(dest_dev); const char *reason = NULL; if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) @@ -2041,7 +2041,7 @@ static void disable_passdown_if_not_supported(struct clone *clone) static void set_discard_limits(struct clone *clone, struct queue_limits *limits) { struct block_device *dest_bdev = clone->dest_dev->bdev; - struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits; + struct queue_limits *dest_limits = bdev_limits(dest_bdev); if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) { /* No passdown is done so we set our own virtual limits */ diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 348b4b26c272..1ae2c71bb383 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -28,7 +28,7 @@ #include <linux/rbtree.h> #include <linux/ctype.h> #include <asm/page.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/hash.h> #include <crypto/md5.h> #include <crypto/skcipher.h> @@ -147,6 +147,7 @@ enum cipher_flags { CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */ CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */ CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */ + CRYPT_KEY_MAC_SIZE_SET, /* The integrity_key_size option was used */ }; /* @@ -2613,35 +2614,31 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string key = request_key(type, key_desc + 1, NULL); if (IS_ERR(key)) { - kfree_sensitive(new_key_string); - return PTR_ERR(key); + ret = PTR_ERR(key); + goto free_new_key_string; } down_read(&key->sem); - ret = set_key(cc, key); - if (ret < 0) { - up_read(&key->sem); - key_put(key); - kfree_sensitive(new_key_string); - return ret; - } - up_read(&key->sem); key_put(key); + if (ret < 0) + goto free_new_key_string; /* clear the flag since following operations may invalidate previously valid key */ clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); ret = crypt_setkey(cc); + if (ret) + goto free_new_key_string; - if (!ret) { - set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - kfree_sensitive(cc->key_string); - cc->key_string = new_key_string; - } else - kfree_sensitive(new_key_string); + set_bit(DM_CRYPT_KEY_VALID, &cc->flags); + kfree_sensitive(cc->key_string); + cc->key_string = new_key_string; + return 0; +free_new_key_string: + kfree_sensitive(new_key_string); return ret; } @@ -2937,7 +2934,8 @@ static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api) if (IS_ERR(mac)) return PTR_ERR(mac); - cc->key_mac_size = crypto_ahash_digestsize(mac); + if (!test_bit(CRYPT_KEY_MAC_SIZE_SET, &cc->cipher_flags)) + cc->key_mac_size = crypto_ahash_digestsize(mac); crypto_free_ahash(mac); cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL); @@ -3219,6 +3217,13 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar cc->cipher_auth = kstrdup(sval, GFP_KERNEL); if (!cc->cipher_auth) 
return -ENOMEM; + } else if (sscanf(opt_string, "integrity_key_size:%u%c", &val, &dummy) == 1) { + if (!val) { + ti->error = "Invalid integrity_key_size argument"; + return -EINVAL; + } + cc->key_mac_size = val; + set_bit(CRYPT_KEY_MAC_SIZE_SET, &cc->cipher_flags); } else if (sscanf(opt_string, "sector_size:%hu%c", &cc->sector_size, &dummy) == 1) { if (cc->sector_size < (1 << SECTOR_SHIFT) || cc->sector_size > 4096 || @@ -3607,10 +3612,10 @@ static void crypt_status(struct dm_target *ti, status_type_t type, num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); num_feature_args += test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); num_feature_args += test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); + num_feature_args += !!cc->used_tag_size; num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT); num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); - if (cc->used_tag_size) - num_feature_args++; + num_feature_args += test_bit(CRYPT_KEY_MAC_SIZE_SET, &cc->cipher_flags); if (num_feature_args) { DMEMIT(" %d", num_feature_args); if (ti->num_discard_bios) @@ -3631,6 +3636,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" sector_size:%d", cc->sector_size); if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) DMEMIT(" iv_large_sectors"); + if (test_bit(CRYPT_KEY_MAC_SIZE_SET, &cc->cipher_flags)) + DMEMIT(" integrity_key_size:%u", cc->key_mac_size); } break; @@ -3758,7 +3765,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 27, 0}, + .version = {1, 28, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index acff2f64f251..ee9f7cecd78e 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -284,6 +284,7 @@ struct dm_integrity_c { mempool_t recheck_pool; struct bio_set recheck_bios; + struct bio_set recalc_bios; struct notifier_block reboot_notifier; }; @@ -321,7 +322,9 @@ struct dm_integrity_io { struct dm_bio_details bio_details; char *integrity_payload; + unsigned payload_len; bool integrity_payload_from_mempool; + bool integrity_range_locked; }; struct journal_completion { @@ -359,7 +362,7 @@ static struct kmem_cache *journal_io_cache; #endif static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); -static int dm_integrity_map_inline(struct dm_integrity_io *dio); +static int dm_integrity_map_inline(struct dm_integrity_io *dio, bool from_map); static void integrity_bio_wait(struct work_struct *w); static void dm_integrity_dtr(struct dm_target *ti); @@ -491,7 +494,8 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr) __u8 *sb = (__u8 *)ic->sb; __u8 *mac = sb + (1 << SECTOR_SHIFT) - mac_size; - if (sizeof(struct superblock) + mac_size > 1 << SECTOR_SHIFT) { + if (sizeof(struct superblock) + mac_size > 1 << SECTOR_SHIFT || + mac_size > HASH_MAX_DIGESTSIZE) { dm_integrity_io_error(ic, "digest is too long", -EINVAL); return -EINVAL; } @@ -1500,15 +1504,15 @@ static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_dat if (!ic->meta_dev) flush_data = false; if (flush_data) { - fr.io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC, - fr.io_req.mem.type = DM_IO_KMEM, - fr.io_req.mem.ptr.addr = NULL, - fr.io_req.notify.fn = flush_notify, + fr.io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + fr.io_req.mem.type = DM_IO_KMEM; + fr.io_req.mem.ptr.addr = NULL; + 
fr.io_req.notify.fn = flush_notify; fr.io_req.notify.context = &fr; - fr.io_req.client = dm_bufio_get_dm_io_client(ic->bufio), - fr.io_reg.bdev = ic->dev->bdev, - fr.io_reg.sector = 0, - fr.io_reg.count = 0, + fr.io_req.client = dm_bufio_get_dm_io_client(ic->bufio); + fr.io_reg.bdev = ic->dev->bdev; + fr.io_reg.sector = 0; + fr.io_reg.count = 0; fr.ic = ic; init_completion(&fr.comp); r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL, IOPRIO_DEFAULT); @@ -1946,8 +1950,13 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) dio->bi_status = 0; dio->op = bio_op(bio); - if (ic->mode == 'I') - return dm_integrity_map_inline(dio); + if (ic->mode == 'I') { + bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector); + dio->integrity_payload = NULL; + dio->integrity_payload_from_mempool = false; + dio->integrity_range_locked = false; + return dm_integrity_map_inline(dio, true); + } if (unlikely(dio->op == REQ_OP_DISCARD)) { if (ti->max_io_len) { @@ -2397,15 +2406,13 @@ journal_read_write: do_endio_flush(ic, dio); } -static int dm_integrity_map_inline(struct dm_integrity_io *dio) +static int dm_integrity_map_inline(struct dm_integrity_io *dio, bool from_map) { struct dm_integrity_c *ic = dio->ic; struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); struct bio_integrity_payload *bip; - unsigned payload_len, digest_size, extra_size, ret; - - dio->integrity_payload = NULL; - dio->integrity_payload_from_mempool = false; + unsigned ret; + sector_t recalc_sector; if (unlikely(bio_integrity(bio))) { bio->bi_status = BLK_STS_NOTSUPP; @@ -2418,28 +2425,67 @@ static int dm_integrity_map_inline(struct dm_integrity_io *dio) return DM_MAPIO_REMAPPED; retry: - payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block); - digest_size = crypto_shash_digestsize(ic->internal_hash); - extra_size = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0; - payload_len += extra_size; - dio->integrity_payload = kmalloc(payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (unlikely(!dio->integrity_payload)) { - const unsigned x_size = PAGE_SIZE << 1; - if (payload_len > x_size) { - unsigned sectors = ((x_size - extra_size) / ic->tuple_size) << ic->sb->log2_sectors_per_block; - if (WARN_ON(!sectors || sectors >= bio_sectors(bio))) { - bio->bi_status = BLK_STS_NOTSUPP; - bio_endio(bio); - return DM_MAPIO_SUBMITTED; + if (!dio->integrity_payload) { + unsigned digest_size, extra_size; + dio->payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block); + digest_size = crypto_shash_digestsize(ic->internal_hash); + extra_size = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; + dio->payload_len += extra_size; + dio->integrity_payload = kmalloc(dio->payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (unlikely(!dio->integrity_payload)) { + const unsigned x_size = PAGE_SIZE << 1; + if (dio->payload_len > x_size) { + unsigned sectors = ((x_size - extra_size) / ic->tuple_size) << ic->sb->log2_sectors_per_block; + if (WARN_ON(!sectors || sectors >= bio_sectors(bio))) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + dm_accept_partial_bio(bio, sectors); + goto retry; } - dm_accept_partial_bio(bio, sectors); - goto retry; } + } + + dio->range.logical_sector = bio->bi_iter.bi_sector; + dio->range.n_sectors = bio_sectors(bio); + + if (!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) + goto skip_spinlock; +#ifdef CONFIG_64BIT + /* + * On 64-bit CPUs we can optimize the lock away (so that it won't cause + * cache line bouncing) and use acquire/release barriers instead. + * + * Paired with smp_store_release in integrity_recalc_inline. + */ + recalc_sector = le64_to_cpu(smp_load_acquire(&ic->sb->recalc_sector)); + if (likely(dio->range.logical_sector + dio->range.n_sectors <= recalc_sector)) + goto skip_spinlock; +#endif + spin_lock_irq(&ic->endio_wait.lock); + recalc_sector = le64_to_cpu(ic->sb->recalc_sector); + if (dio->range.logical_sector + dio->range.n_sectors <= recalc_sector) + goto skip_unlock; + if (unlikely(!add_new_range(ic, &dio->range, true))) { + if (from_map) { + spin_unlock_irq(&ic->endio_wait.lock); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + return DM_MAPIO_SUBMITTED; + } + wait_and_add_new_range(ic, &dio->range); + } + dio->integrity_range_locked = true; +skip_unlock: + spin_unlock_irq(&ic->endio_wait.lock); +skip_spinlock: + + if (unlikely(!dio->integrity_payload)) { dio->integrity_payload = page_to_virt((struct page *)mempool_alloc(&ic->recheck_pool, GFP_NOIO)); dio->integrity_payload_from_mempool = true; } - bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector); dio->bio_details.bi_iter = bio->bi_iter; if (unlikely(!dm_integrity_check_limits(ic, bio->bi_iter.bi_sector, bio))) { @@ -2449,7 +2495,7 @@ retry: bio->bi_iter.bi_sector += ic->start + SB_SECTORS; bip = bio_integrity_alloc(bio, GFP_NOIO, 1); - if (unlikely(IS_ERR(bip))) { + if (IS_ERR(bip)) { bio->bi_status = errno_to_blk_status(PTR_ERR(bip)); bio_endio(bio); return DM_MAPIO_SUBMITTED; @@ -2470,8 +2516,8 @@ retry: } ret = bio_integrity_add_page(bio, virt_to_page(dio->integrity_payload), - payload_len, offset_in_page(dio->integrity_payload)); - if (unlikely(ret != payload_len)) { + dio->payload_len, offset_in_page(dio->integrity_payload)); + if (unlikely(ret != dio->payload_len)) { bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return DM_MAPIO_SUBMITTED; @@ -2522,7 +2568,7 @@ static void dm_integrity_inline_recheck(struct work_struct *w) } bip = bio_integrity_alloc(outgoing_bio, GFP_NOIO, 1); - if (unlikely(IS_ERR(bip))) { + if (IS_ERR(bip)) { bio_put(outgoing_bio); bio->bi_status = errno_to_blk_status(PTR_ERR(bip)); bio_endio(bio); @@ -2579,6 +2625,9 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK)) { unsigned pos = 0; + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + unlikely(dio->integrity_range_locked)) + goto skip_check; while 
(dio->bio_details.bi_iter.bi_size) { char digest[HASH_MAX_DIGESTSIZE]; struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); @@ -2598,9 +2647,10 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); } } - if (likely(dio->op == REQ_OP_READ) || likely(dio->op == REQ_OP_WRITE)) { - dm_integrity_free_payload(dio); - } +skip_check: + dm_integrity_free_payload(dio); + if (unlikely(dio->integrity_range_locked)) + remove_range(ic, &dio->range); } return DM_ENDIO_DONE; } @@ -2608,8 +2658,26 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status static void integrity_bio_wait(struct work_struct *w) { struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + struct dm_integrity_c *ic = dio->ic; - dm_integrity_map_continue(dio, false); + if (ic->mode == 'I') { + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + int r = dm_integrity_map_inline(dio, false); + switch (r) { + case DM_MAPIO_KILL: + bio->bi_status = BLK_STS_IOERR; + fallthrough; + case DM_MAPIO_REMAPPED: + submit_bio_noacct(bio); + fallthrough; + case DM_MAPIO_SUBMITTED: + return; + default: + BUG(); + } + } else { + dm_integrity_map_continue(dio, false); + } } static void pad_uncommitted(struct dm_integrity_c *ic) @@ -3081,6 +3149,133 @@ free_ret: kvfree(recalc_tags); } +static void integrity_recalc_inline(struct work_struct *w) +{ + struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work); + size_t recalc_tags_size; + u8 *recalc_buffer = NULL; + u8 *recalc_tags = NULL; + struct dm_integrity_range range; + struct bio *bio; + struct bio_integrity_payload *bip; + __u8 *t; + unsigned int i; + int r; + unsigned ret; + unsigned int super_counter = 0; + unsigned recalc_sectors = RECALC_SECTORS; + +retry: + recalc_buffer = kmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO | __GFP_NOWARN); + if (!recalc_buffer) { +oom: + recalc_sectors >>= 1; + if (recalc_sectors >= 1U << ic->sb->log2_sectors_per_block) + goto retry; + DMCRIT("out of memory for recalculate buffer - recalculation disabled"); + goto free_ret; + } + + recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tuple_size; + if (crypto_shash_digestsize(ic->internal_hash) > ic->tuple_size) + recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tuple_size; + recalc_tags = kmalloc(recalc_tags_size, GFP_NOIO | __GFP_NOWARN); + if (!recalc_tags) { + kfree(recalc_buffer); + recalc_buffer = NULL; + goto oom; + } + + spin_lock_irq(&ic->endio_wait.lock); + +next_chunk: + if (unlikely(dm_post_suspending(ic->ti))) + goto unlock_ret; + + range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); + if (unlikely(range.logical_sector >= ic->provided_data_sectors)) + goto unlock_ret; + range.n_sectors = min((sector_t)recalc_sectors, ic->provided_data_sectors - range.logical_sector); + + add_new_range_and_wait(ic, &range); + spin_unlock_irq(&ic->endio_wait.lock); + + if (unlikely(++super_counter == RECALC_WRITE_SUPER)) { + recalc_write_super(ic); + super_counter = 0; + } + + if (unlikely(dm_integrity_failed(ic))) + goto err; + + DEBUG_print("recalculating: %llx - %llx\n", range.logical_sector, range.n_sectors); + + bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios); + bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector; + __bio_add_page(bio, virt_to_page(recalc_buffer), 
range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer)); + r = submit_bio_wait(bio); + bio_put(bio); + if (unlikely(r)) { + dm_integrity_io_error(ic, "reading data", r); + goto err; + } + + t = recalc_tags; + for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) { + memset(t, 0, ic->tuple_size); + integrity_sector_checksum(ic, range.logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t); + t += ic->tuple_size; + } + + bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios); + bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector; + __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer)); + + bip = bio_integrity_alloc(bio, GFP_NOIO, 1); + if (unlikely(IS_ERR(bip))) { + bio_put(bio); + DMCRIT("out of memory for bio integrity payload - recalculation disabled"); + goto err; + } + ret = bio_integrity_add_page(bio, virt_to_page(recalc_tags), t - recalc_tags, offset_in_page(recalc_tags)); + if (unlikely(ret != t - recalc_tags)) { + bio_put(bio); + dm_integrity_io_error(ic, "attaching integrity tags", -ENOMEM); + goto err; + } + + r = submit_bio_wait(bio); + bio_put(bio); + if (unlikely(r)) { + dm_integrity_io_error(ic, "writing data", r); + goto err; + } + + cond_resched(); + spin_lock_irq(&ic->endio_wait.lock); + remove_range_unlocked(ic, &range); +#ifdef CONFIG_64BIT + /* Paired with smp_load_acquire in dm_integrity_map_inline. */ + smp_store_release(&ic->sb->recalc_sector, cpu_to_le64(range.logical_sector + range.n_sectors)); +#else + ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors); +#endif + goto next_chunk; + +err: + remove_range(ic, &range); + goto free_ret; + +unlock_ret: + spin_unlock_irq(&ic->endio_wait.lock); + + recalc_write_super(ic); + +free_ret: + kfree(recalc_buffer); + kfree(recalc_tags); +} + static void bitmap_block_work(struct work_struct *w) { struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work); @@ -4619,6 +4814,17 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv r = -ENOMEM; goto bad; } + r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS); + if (r) { + ti->error = "Cannot allocate bio set"; + goto bad; + } + r = bioset_integrity_create(&ic->recalc_bios, 1); + if (r) { + ti->error = "Cannot allocate bio integrity set"; + r = -ENOMEM; + goto bad; + } } ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", @@ -4717,13 +4923,18 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv ti->error = "Block size doesn't match the information in superblock"; goto bad; } - if (!le32_to_cpu(ic->sb->journal_sections) != (ic->mode == 'I')) { - r = -EINVAL; - if (ic->mode != 'I') + if (ic->mode != 'I') { + if (!le32_to_cpu(ic->sb->journal_sections)) { + r = -EINVAL; ti->error = "Corrupted superblock, journal_sections is 0"; - else + goto bad; + } + } else { + if (le32_to_cpu(ic->sb->journal_sections)) { + r = -EINVAL; ti->error = "Corrupted superblock, journal_sections is not 0"; - goto bad; + goto bad; + } } /* make sure that ti->max_io_len doesn't overflow */ if (!ic->meta_dev) { @@ -4830,7 +5041,7 @@ try_smaller_buffer: r = -ENOMEM; goto bad; } - INIT_WORK(&ic->recalc_work, integrity_recalc); + INIT_WORK(&ic->recalc_work, ic->mode == 'I' ? 
integrity_recalc_inline : integrity_recalc); } else { if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { ti->error = "Recalculate can only be specified with internal_hash"; @@ -4847,17 +5058,15 @@ try_smaller_buffer: goto bad; } - if (ic->mode != 'I') { - ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev, - 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0); - if (IS_ERR(ic->bufio)) { - r = PTR_ERR(ic->bufio); - ti->error = "Cannot initialize dm-bufio"; - ic->bufio = NULL; - goto bad; - } - dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); + ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev, + 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0); + if (IS_ERR(ic->bufio)) { + r = PTR_ERR(ic->bufio); + ti->error = "Cannot initialize dm-bufio"; + ic->bufio = NULL; + goto bad; } + dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); if (ic->mode != 'R' && ic->mode != 'I') { r = create_journal(ic, &ti->error); @@ -4979,6 +5188,7 @@ static void dm_integrity_dtr(struct dm_target *ti) kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); + bioset_exit(&ic->recalc_bios); bioset_exit(&ic->recheck_bios); mempool_exit(&ic->recheck_pool); mempool_exit(&ic->journal_io_mempool); @@ -5033,7 +5243,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 12, 0}, + .version = {1, 13, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 63682d27fc8d..1e0d3b9b75d6 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2519,7 +2519,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) rdev->saved_raid_disk = rdev->raid_disk; } - /* Reshape support -> restore repective data offsets */ + /* Reshape support -> restore respective data offsets */ rdev->data_offset = le64_to_cpu(sb->data_offset); rdev->new_data_offset = le64_to_cpu(sb->new_data_offset); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index f7e9a3632eb3..499f8cc8a39f 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -496,8 +496,10 @@ static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, map = dm_get_live_table(md, &srcu_idx); if (unlikely(!map)) { + DMERR_LIMIT("%s: mapping table unavailable, erroring io", + dm_device_name(md)); dm_put_live_table(md, srcu_idx); - return BLK_STS_RESOURCE; + return BLK_STS_IOERR; } ti = dm_table_find_target(map, 0); dm_put_live_table(md, srcu_idx); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index a0c1620e90c8..9095f19a84f3 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2842,7 +2842,7 @@ static void disable_discard_passdown_if_not_supported(struct pool_c *pt) { struct pool *pool = pt->pool; struct block_device *data_bdev = pt->data_dev->bdev; - struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits; + struct queue_limits *data_limits = bdev_limits(data_bdev); const char *reason = NULL; if (!pt->adjusted_pf.discard_passdown) @@ -2948,7 +2948,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device); if (IS_ERR(pmd)) { *error = "Error creating metadata object"; - return (struct pool *)pmd; + return ERR_CAST(pmd); } pool = kzalloc(sizeof(*pool), GFP_KERNEL); diff --git 
a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c index 48587c16c445..e8a9432057dc 100644 --- a/drivers/md/dm-unstripe.c +++ b/drivers/md/dm-unstripe.c @@ -85,8 +85,8 @@ static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) } uc->physical_start = start; - uc->unstripe_offset = uc->unstripe * uc->chunk_size; - uc->unstripe_width = (uc->stripes - 1) * uc->chunk_size; + uc->unstripe_offset = (sector_t)uc->unstripe * uc->chunk_size; + uc->unstripe_width = (sector_t)(uc->stripes - 1) * uc->chunk_size; uc->chunk_shift = is_power_of_2(uc->chunk_size) ? fls(uc->chunk_size) - 1 : 0; tmp_len = ti->len; diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index ab3ea8337809..0d502f6a86ad 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -501,6 +501,7 @@ static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lb memset(&data_vio->record_name, 0, sizeof(data_vio->record_name)); memset(&data_vio->duplicate, 0, sizeof(data_vio->duplicate)); + vdo_reset_completion(&data_vio->decrement_completion); vdo_reset_completion(completion); completion->error_handler = handle_data_vio_error; set_data_vio_logical_callback(data_vio, attempt_logical_block_lock); @@ -1273,12 +1274,14 @@ static void clean_hash_lock(struct vdo_completion *completion) static void finish_cleanup(struct data_vio *data_vio) { struct vdo_completion *completion = &data_vio->vio.completion; + u32 discard_size = min_t(u32, data_vio->remaining_discard, + VDO_BLOCK_SIZE - data_vio->offset); VDO_ASSERT_LOG_ONLY(data_vio->allocation.lock == NULL, "complete data_vio has no allocation lock"); VDO_ASSERT_LOG_ONLY(data_vio->hash_lock == NULL, "complete data_vio has no hash lock"); - if ((data_vio->remaining_discard <= VDO_BLOCK_SIZE) || + if ((data_vio->remaining_discard <= discard_size) || (completion->result != VDO_SUCCESS)) { struct data_vio_pool *pool = completion->vdo->data_vio_pool; @@ -1287,12 +1290,12 @@ static void finish_cleanup(struct data_vio *data_vio) return; } - data_vio->remaining_discard -= min_t(u32, data_vio->remaining_discard, - VDO_BLOCK_SIZE - data_vio->offset); + data_vio->remaining_discard -= discard_size; data_vio->is_partial = (data_vio->remaining_discard < VDO_BLOCK_SIZE); data_vio->read = data_vio->is_partial; data_vio->offset = 0; completion->requeue = true; + data_vio->first_reference_operation_complete = false; launch_data_vio(data_vio, data_vio->logical.lbn + 1); } @@ -1965,7 +1968,8 @@ static void allocate_block(struct vdo_completion *completion) .state = VDO_MAPPING_STATE_UNCOMPRESSED, }; - if (data_vio->fua) { + if (data_vio->fua || + data_vio->remaining_discard > (u32) (VDO_BLOCK_SIZE - data_vio->offset)) { prepare_for_dedupe(data_vio); return; } @@ -2042,7 +2046,6 @@ void continue_data_vio_with_block_map_slot(struct vdo_completion *completion) return; } - /* * We don't need to write any data, so skip allocation and just update the block map and * reference counts (via the journal). @@ -2051,7 +2054,7 @@ void continue_data_vio_with_block_map_slot(struct vdo_completion *completion) if (data_vio->is_zero) data_vio->new_mapped.state = VDO_MAPPING_STATE_UNCOMPRESSED; - if (data_vio->remaining_discard > VDO_BLOCK_SIZE) { + if (data_vio->remaining_discard > (u32) (VDO_BLOCK_SIZE - data_vio->offset)) { /* This is not the final block of a discard so we can't acknowledge it yet. 
*/ update_metadata_for_data_vio_write(data_vio, NULL); return; diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 39ac68614419..80628ae93fba 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -729,6 +729,7 @@ static void process_update_result(struct data_vio *agent) !change_context_state(context, DEDUPE_CONTEXT_COMPLETE, DEDUPE_CONTEXT_IDLE)) return; + agent->dedupe_context = NULL; release_context(context); } @@ -1648,6 +1649,7 @@ static void process_query_result(struct data_vio *agent) if (change_context_state(context, DEDUPE_CONTEXT_COMPLETE, DEDUPE_CONTEXT_IDLE)) { agent->is_duplicate = decode_uds_advice(context); + agent->dedupe_context = NULL; release_context(context); } } @@ -2321,6 +2323,7 @@ static void timeout_index_operations_callback(struct vdo_completion *completion) * send its requestor on its way. */ list_del_init(&context->list_entry); + context->requestor->dedupe_context = NULL; continue_data_vio(context->requestor); timed_out++; } diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index dd05691e4097..0e04c2021682 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -1105,6 +1105,9 @@ static int vdo_message(struct dm_target *ti, unsigned int argc, char **argv, if ((argc == 1) && (strcasecmp(argv[0], "stats") == 0)) { vdo_write_stats(vdo, result_buffer, maxlen); result = 1; + } else if ((argc == 1) && (strcasecmp(argv[0], "config") == 0)) { + vdo_write_config(vdo, &result_buffer, &maxlen); + result = 1; } else { result = vdo_status_to_errno(process_vdo_message(vdo, argc, argv)); } @@ -2293,6 +2296,14 @@ static void handle_load_error(struct vdo_completion *completion) return; } + if ((completion->result == VDO_UNSUPPORTED_VERSION) && + (vdo->admin.phase == LOAD_PHASE_MAKE_DIRTY)) { + vdo_log_error("Aborting load due to unsupported version"); + vdo->admin.phase = LOAD_PHASE_FINISHED; + load_callback(completion); + return; + } + vdo_log_error_strerror(completion->result, "Entering read-only mode due to load error"); vdo->admin.phase = LOAD_PHASE_WAIT_FOR_READ_ONLY; @@ -2737,6 +2748,19 @@ static int vdo_preresume_registered(struct dm_target *ti, struct vdo *vdo) vdo_log_info("starting device '%s'", device_name); result = perform_admin_operation(vdo, LOAD_PHASE_START, load_callback, handle_load_error, "load"); + if (result == VDO_UNSUPPORTED_VERSION) { + /* + * A component version is not supported. This can happen when the + * recovery journal metadata is in an old version format. Abort the + * load without saving the state. + */ + vdo->suspend_type = VDO_ADMIN_STATE_SUSPENDING; + perform_admin_operation(vdo, SUSPEND_PHASE_START, + suspend_callback, suspend_callback, + "suspend"); + return result; + } + if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { /* * Something has gone very wrong. 
Make sure everything has drained and @@ -2808,7 +2832,8 @@ static int vdo_preresume(struct dm_target *ti) vdo_register_thread_device_id(&instance_thread, &vdo->instance); result = vdo_preresume_registered(ti, vdo); - if ((result == VDO_PARAMETER_MISMATCH) || (result == VDO_INVALID_ADMIN_STATE)) + if ((result == VDO_PARAMETER_MISMATCH) || (result == VDO_INVALID_ADMIN_STATE) || + (result == VDO_UNSUPPORTED_VERSION)) result = -EINVAL; vdo_unregister_thread_device_id(); return vdo_status_to_errno(result); @@ -2832,7 +2857,7 @@ static void vdo_resume(struct dm_target *ti) static struct target_type vdo_target_bio = { .features = DM_TARGET_SINGLETON, .name = "vdo", - .version = { 9, 0, 0 }, + .version = { 9, 1, 0 }, .module = THIS_MODULE, .ctr = vdo_ctr, .dtr = vdo_dtr, diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c index 7e32a25d3f2f..fb1db41c794b 100644 --- a/drivers/md/dm-vdo/indexer/chapter-index.c +++ b/drivers/md/dm-vdo/indexer/chapter-index.c @@ -177,7 +177,7 @@ int uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index, if (list_number < 0) return UDS_OVERFLOW; - next_list = first_list + list_number--, + next_list = first_list + list_number--; result = uds_start_delta_index_search(delta_index, next_list, 0, &entry); if (result != UDS_SUCCESS) diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index 9a3716bb3c05..ab62abe18827 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -346,7 +346,6 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, VDO_ASSERT_LOG_ONLY(!code->quiescent, "I/O not allowed in state %s", code->name); - VDO_ASSERT_LOG_ONLY(vio->bio->bi_next == NULL, "metadata bio has no next bio"); vdo_reset_completion(completion); completion->error_handler = error_handler; diff --git a/drivers/md/dm-vdo/message-stats.c b/drivers/md/dm-vdo/message-stats.c index 2802cf92922b..75dfcd7c5f63 100644 --- a/drivers/md/dm-vdo/message-stats.c +++ b/drivers/md/dm-vdo/message-stats.c @@ -4,6 +4,7 @@ */ #include "dedupe.h" +#include "indexer.h" #include "logger.h" #include "memory-alloc.h" #include "message-stats.h" @@ -430,3 +431,50 @@ int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen) vdo_free(stats); return VDO_SUCCESS; } + +static void write_index_memory(u32 mem, char **buf, unsigned int *maxlen) +{ + char *prefix = "memorySize : "; + + /* Convert index memory to fractional value */ + if (mem == (u32)UDS_MEMORY_CONFIG_256MB) + write_string(prefix, "0.25, ", NULL, buf, maxlen); + else if (mem == (u32)UDS_MEMORY_CONFIG_512MB) + write_string(prefix, "0.50, ", NULL, buf, maxlen); + else if (mem == (u32)UDS_MEMORY_CONFIG_768MB) + write_string(prefix, "0.75, ", NULL, buf, maxlen); + else + write_u32(prefix, mem, ", ", buf, maxlen); +} + +static void write_index_config(struct index_config *config, char **buf, + unsigned int *maxlen) +{ + write_string("index : ", "{ ", NULL, buf, maxlen); + /* index mem size */ + write_index_memory(config->mem, buf, maxlen); + /* whether the index is sparse or not */ + write_bool("isSparse : ", config->sparse, ", ", buf, maxlen); + write_string(NULL, "}", ", ", buf, maxlen); +} + +int vdo_write_config(struct vdo *vdo, char **buf, unsigned int *maxlen) +{ + struct vdo_config *config = &vdo->states.vdo.config; + + write_string(NULL, "{ ", NULL, buf, maxlen); + /* version */ + write_u32("version : ", 1, ", ", buf, maxlen); + /* physical size */ + write_block_count_t("physicalSize : ", 
config->physical_blocks * VDO_BLOCK_SIZE, ", ", + buf, maxlen); + /* logical size */ + write_block_count_t("logicalSize : ", config->logical_blocks * VDO_BLOCK_SIZE, ", ", + buf, maxlen); + /* slab size */ + write_block_count_t("slabSize : ", config->slab_size, ", ", buf, maxlen); + /* index config */ + write_index_config(&vdo->geometry.index_config, buf, maxlen); + write_string(NULL, "}", NULL, buf, maxlen); + return VDO_SUCCESS; +} diff --git a/drivers/md/dm-vdo/message-stats.h b/drivers/md/dm-vdo/message-stats.h index f7fceca9acab..f9c95eff569d 100644 --- a/drivers/md/dm-vdo/message-stats.h +++ b/drivers/md/dm-vdo/message-stats.h @@ -8,6 +8,7 @@ #include "types.h" +int vdo_write_config(struct vdo *vdo, char **buf, unsigned int *maxlen); int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen); #endif /* VDO_MESSAGE_STATS_H */ diff --git a/drivers/md/dm-vdo/murmurhash3.c b/drivers/md/dm-vdo/murmurhash3.c index 3a989efae142..13008b089206 100644 --- a/drivers/md/dm-vdo/murmurhash3.c +++ b/drivers/md/dm-vdo/murmurhash3.c @@ -8,7 +8,7 @@ #include "murmurhash3.h" -#include <asm/unaligned.h> +#include <linux/unaligned.h> static inline u64 rotl64(u64 x, s8 r) { diff --git a/drivers/md/dm-vdo/numeric.h b/drivers/md/dm-vdo/numeric.h index dc8c400b21d2..f568dc59e6f1 100644 --- a/drivers/md/dm-vdo/numeric.h +++ b/drivers/md/dm-vdo/numeric.h @@ -6,7 +6,7 @@ #ifndef UDS_NUMERIC_H #define UDS_NUMERIC_H -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/kernel.h> #include <linux/types.h> diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c index 7e0009d2f67d..ffff2c999518 100644 --- a/drivers/md/dm-vdo/repair.c +++ b/drivers/md/dm-vdo/repair.c @@ -1202,17 +1202,14 @@ static bool __must_check is_valid_recovery_journal_block(const struct recovery_j * @journal: The journal to use. * @header: The unpacked block header to check. * @sequence: The expected sequence number. - * @type: The expected metadata type. * * Return: True if the block matches. */ static bool __must_check is_exact_recovery_journal_block(const struct recovery_journal *journal, const struct recovery_block_header *header, - sequence_number_t sequence, - enum vdo_metadata_type type) + sequence_number_t sequence) { - return ((header->metadata_type == type) && - (header->sequence_number == sequence) && + return ((header->sequence_number == sequence) && (is_valid_recovery_journal_block(journal, header, true))); } @@ -1371,7 +1368,8 @@ static void extract_entries_from_block(struct repair_completion *repair, get_recovery_journal_block_header(journal, repair->journal_data, sequence); - if (!is_exact_recovery_journal_block(journal, &header, sequence, format)) { + if (!is_exact_recovery_journal_block(journal, &header, sequence) || + (header.metadata_type != format)) { /* This block is invalid, so skip it. 
*/ return; } @@ -1557,10 +1555,13 @@ static int parse_journal_for_recovery(struct repair_completion *repair) sequence_number_t i, head; bool found_entries = false; struct recovery_journal *journal = repair->completion.vdo->recovery_journal; + struct recovery_block_header header; + enum vdo_metadata_type expected_format; head = min(repair->block_map_head, repair->slab_journal_head); + header = get_recovery_journal_block_header(journal, repair->journal_data, head); + expected_format = header.metadata_type; for (i = head; i <= repair->highest_tail; i++) { - struct recovery_block_header header; journal_entry_count_t block_entries; u8 j; @@ -1572,19 +1573,15 @@ static int parse_journal_for_recovery(struct repair_completion *repair) }; header = get_recovery_journal_block_header(journal, repair->journal_data, i); - if (header.metadata_type == VDO_METADATA_RECOVERY_JOURNAL) { - /* This is an old format block, so we need to upgrade */ - vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION, - "Recovery journal is in the old format, a read-only rebuild is required."); - vdo_enter_read_only_mode(repair->completion.vdo, - VDO_UNSUPPORTED_VERSION); - return VDO_UNSUPPORTED_VERSION; - } - - if (!is_exact_recovery_journal_block(journal, &header, i, - VDO_METADATA_RECOVERY_JOURNAL_2)) { + if (!is_exact_recovery_journal_block(journal, &header, i)) { /* A bad block header was found so this must be the end of the journal. */ break; + } else if (header.metadata_type != expected_format) { + /* There is a mix of old and new format blocks, so we need to rebuild. */ + vdo_log_error_strerror(VDO_CORRUPT_JOURNAL, + "Recovery journal is in an invalid format, a read-only rebuild is required."); + vdo_enter_read_only_mode(repair->completion.vdo, VDO_CORRUPT_JOURNAL); + return VDO_CORRUPT_JOURNAL; } block_entries = header.entry_count; @@ -1620,8 +1617,14 @@ static int parse_journal_for_recovery(struct repair_completion *repair) break; } - if (!found_entries) + if (!found_entries) { return validate_heads(repair); + } else if (expected_format == VDO_METADATA_RECOVERY_JOURNAL) { + /* All journal blocks have the old format, so we need to upgrade. */ + vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION, + "Recovery journal is in the old format. Downgrade and complete recovery, then upgrade with a clean volume"); + return VDO_UNSUPPORTED_VERSION; + } /* Set the tail to the last valid tail block, if there is one. 
*/ if (repair->tail_recovery_point.sector_count == 0) diff --git a/drivers/md/dm-vdo/status-codes.c b/drivers/md/dm-vdo/status-codes.c index d3493450b169..dd252d660b6d 100644 --- a/drivers/md/dm-vdo/status-codes.c +++ b/drivers/md/dm-vdo/status-codes.c @@ -28,7 +28,7 @@ const struct error_info vdo_status_list[] = { { "VDO_LOCK_ERROR", "A lock is held incorrectly" }, { "VDO_READ_ONLY", "The device is in read-only mode" }, { "VDO_SHUTTING_DOWN", "The device is shutting down" }, - { "VDO_CORRUPT_JOURNAL", "Recovery journal entries corrupted" }, + { "VDO_CORRUPT_JOURNAL", "Recovery journal corrupted" }, { "VDO_TOO_MANY_SLABS", "Exceeds maximum number of slabs supported" }, { "VDO_INVALID_FRAGMENT", "Compressed block fragment is invalid" }, { "VDO_RETRY_AFTER_REBUILD", "Retry operation after rebuilding finishes" }, diff --git a/drivers/md/dm-vdo/status-codes.h b/drivers/md/dm-vdo/status-codes.h index 72da04159f88..426dc8e2ca5d 100644 --- a/drivers/md/dm-vdo/status-codes.h +++ b/drivers/md/dm-vdo/status-codes.h @@ -52,7 +52,7 @@ enum vdo_status_codes { VDO_READ_ONLY, /* the VDO is shutting down */ VDO_SHUTTING_DOWN, - /* the recovery journal has corrupt entries */ + /* the recovery journal has corrupt entries or corrupt metadata */ VDO_CORRUPT_JOURNAL, /* exceeds maximum number of slabs supported */ VDO_TOO_MANY_SLABS, diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 24ba9a10444c..c142ec5458b7 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -36,11 +36,13 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" #define DM_VERITY_OPT_PANIC "panic_on_corruption" +#define DM_VERITY_OPT_ERROR_RESTART "restart_on_error" +#define DM_VERITY_OPT_ERROR_PANIC "panic_on_error" #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" #define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" #define DM_VERITY_OPT_TASKLET_VERIFY "try_verify_in_tasklet" -#define DM_VERITY_OPTS_MAX (4 + DM_VERITY_OPTS_FEC + \ +#define DM_VERITY_OPTS_MAX (5 + DM_VERITY_OPTS_FEC + \ DM_VERITY_ROOT_HASH_VERIFICATION_OPTS) static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; @@ -354,9 +356,9 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, hash_block)) { - struct bio *bio = - dm_bio_from_per_bio_data(io, - v->ti->per_io_data_size); + struct bio *bio; + io->had_mismatch = true; + bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); dm_audit_log_bio(DM_MSG_PREFIX, "verify-metadata", bio, block, 0); r = -EIO; @@ -480,6 +482,7 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, return -EIO; /* Error correction failed; Just return error */ if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, blkno)) { + io->had_mismatch = true; dm_audit_log_bio(DM_MSG_PREFIX, "verify-data", bio, blkno, 0); return -EIO; } @@ -583,6 +586,11 @@ static inline bool verity_is_system_shutting_down(void) || system_state == SYSTEM_RESTART; } +static void restart_io_error(struct work_struct *w) +{ + kernel_restart("dm-verity device has I/O error"); +} + /* * End one "io" structure with a given error. 
*/ @@ -597,6 +605,24 @@ static void verity_finish_io(struct dm_verity_io *io, blk_status_t status) if (!static_branch_unlikely(&use_bh_wq_enabled) || !io->in_bh) verity_fec_finish_io(io); + if (unlikely(status != BLK_STS_OK) && + unlikely(!(bio->bi_opf & REQ_RAHEAD)) && + !io->had_mismatch && + !verity_is_system_shutting_down()) { + if (v->error_mode == DM_VERITY_MODE_PANIC) { + panic("dm-verity device has I/O error"); + } + if (v->error_mode == DM_VERITY_MODE_RESTART) { + static DECLARE_WORK(restart_work, restart_io_error); + queue_work(v->verify_wq, &restart_work); + /* + * We deliberately don't call bio_endio here, because + * the machine will be restarted anyway. + */ + return; + } + } + bio_endio(bio); } @@ -755,6 +781,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) io->orig_bi_end_io = bio->bi_end_io; io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits; + io->had_mismatch = false; bio->bi_end_io = verity_end_io; bio->bi_private = io; @@ -805,6 +832,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, DMEMIT("%02x", v->salt[x]); if (v->mode != DM_VERITY_MODE_EIO) args++; + if (v->error_mode != DM_VERITY_MODE_EIO) + args++; if (verity_fec_is_enabled(v)) args += DM_VERITY_OPTS_FEC; if (v->zero_digest) @@ -834,6 +863,19 @@ static void verity_status(struct dm_target *ti, status_type_t type, BUG(); } } + if (v->error_mode != DM_VERITY_MODE_EIO) { + DMEMIT(" "); + switch (v->error_mode) { + case DM_VERITY_MODE_RESTART: + DMEMIT(DM_VERITY_OPT_ERROR_RESTART); + break; + case DM_VERITY_MODE_PANIC: + DMEMIT(DM_VERITY_OPT_ERROR_PANIC); + break; + default: + BUG(); + } + } if (v->zero_digest) DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES); if (v->validated_blocks) @@ -886,6 +928,19 @@ static void verity_status(struct dm_target *ti, status_type_t type, DMEMIT("invalid"); } } + if (v->error_mode != DM_VERITY_MODE_EIO) { + DMEMIT(",verity_error_mode="); + switch (v->error_mode) { + case DM_VERITY_MODE_RESTART: + DMEMIT(DM_VERITY_OPT_ERROR_RESTART); + break; + case DM_VERITY_MODE_PANIC: + DMEMIT(DM_VERITY_OPT_ERROR_PANIC); + break; + default: + DMEMIT("invalid"); + } + } DMEMIT(";"); break; } @@ -1088,6 +1143,25 @@ static int verity_parse_verity_mode(struct dm_verity *v, const char *arg_name) return 0; } +static inline bool verity_is_verity_error_mode(const char *arg_name) +{ + return (!strcasecmp(arg_name, DM_VERITY_OPT_ERROR_RESTART) || + !strcasecmp(arg_name, DM_VERITY_OPT_ERROR_PANIC)); +} + +static int verity_parse_verity_error_mode(struct dm_verity *v, const char *arg_name) +{ + if (v->error_mode) + return -EINVAL; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_ERROR_RESTART)) + v->error_mode = DM_VERITY_MODE_RESTART; + else if (!strcasecmp(arg_name, DM_VERITY_OPT_ERROR_PANIC)) + v->error_mode = DM_VERITY_MODE_PANIC; + + return 0; +} + static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, struct dm_verity_sig_opts *verify_args, bool only_modifier_opts) @@ -1122,6 +1196,16 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, } continue; + } else if (verity_is_verity_error_mode(arg_name)) { + if (only_modifier_opts) + continue; + r = verity_parse_verity_error_mode(v, arg_name); + if (r) { + ti->error = "Conflicting error handling parameters"; + return r; + } + continue; + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) { if (only_modifier_opts) continue; diff --git a/drivers/md/dm-verity-verify-sig.c 
b/drivers/md/dm-verity-verify-sig.c index d351d7d39c60..a9e2c6c0a33c 100644 --- a/drivers/md/dm-verity-verify-sig.c +++ b/drivers/md/dm-verity-verify-sig.c @@ -127,7 +127,7 @@ int verity_verify_root_hash(const void *root_hash, size_t root_hash_len, #endif VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); #ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG_PLATFORM_KEYRING - if (ret == -ENOKEY) + if (ret == -ENOKEY || ret == -EKEYREJECTED) ret = verify_pkcs7_signature(root_hash, root_hash_len, sig_data, sig_len, VERIFY_USE_PLATFORM_KEYRING, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 754e70bb5fe0..c996140bda94 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -64,6 +64,7 @@ struct dm_verity { unsigned int digest_size; /* digest size for the current hash algorithm */ unsigned int hash_reqsize; /* the size of temporary space for crypto */ enum verity_mode mode; /* mode for handling verification errors */ + enum verity_mode error_mode;/* mode for handling I/O errors */ unsigned int corrupted_errs;/* Number of errors for corrupted blocks */ struct workqueue_struct *verify_wq; @@ -91,6 +92,7 @@ struct dm_verity_io { sector_t block; unsigned int n_blocks; bool in_bh; + bool had_mismatch; struct work_struct work; struct work_struct bh_work; diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index c0d41c36e06e..20edd3fabbab 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -344,7 +344,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); } else { set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - lim->max_zone_append_sectors = 0; + lim->max_hw_zone_append_sectors = 0; } /* @@ -379,7 +379,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, if (!zlim.mapped_nr_seq_zones) { lim->max_open_zones = 0; lim->max_active_zones = 0; - lim->max_zone_append_sectors = 0; + lim->max_hw_zone_append_sectors = 0; lim->zone_write_granularity = 0; lim->chunk_sectors = 0; lim->features &= ~BLK_FEAT_ZONED; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 87bb90303435..19230404d8c2 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2030,10 +2030,15 @@ static void dm_submit_bio(struct bio *bio) struct dm_table *map; map = dm_get_live_table(md, &srcu_idx); + if (unlikely(!map)) { + DMERR_LIMIT("%s: mapping table unavailable, erroring io", + dm_device_name(md)); + bio_io_error(bio); + goto out; + } - /* If suspended, or map not yet available, queue this IO for later */ - if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) || - unlikely(!map)) { + /* If suspended, queue this IO for later */ + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); else if (bio->bi_opf & REQ_RAHEAD) @@ -2285,8 +2290,10 @@ static struct mapped_device *alloc_dev(int minor) * override accordingly. 
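Taken together, the dm-verity hunks above (dm-verity-target.c and dm-verity.h) add two optional arguments, restart_on_error and panic_on_error, plus a per-io had_mismatch flag, so blocks that fail verification keep the existing corruption handling while only genuine lower-level I/O errors trigger the new behaviour. The following is a minimal user-space sketch of the decision made by the new branch in verity_finish_io(); the enum and function names are invented for illustration, and the REQ_RAHEAD and shutdown checks are reduced to booleans.

#include <stdbool.h>
#include <stdio.h>

/* Invented names modelling the new I/O-error modes; not the kernel's values. */
enum model_error_mode { MODE_EIO, MODE_RESTART, MODE_PANIC };
enum model_action { COMPLETE_OK, COMPLETE_WITH_ERROR, RESTART_MACHINE, PANIC_MACHINE };

/* Mirrors the branch added to verity_finish_io(): only a real I/O error on a
 * non-readahead bio, with no prior verification mismatch and no shutdown in
 * progress, escalates to restart or panic. */
static enum model_action finish_io(bool io_error, bool readahead,
				   bool had_mismatch, bool shutting_down,
				   enum model_error_mode error_mode)
{
	if (io_error && !readahead && !had_mismatch && !shutting_down) {
		if (error_mode == MODE_PANIC)
			return PANIC_MACHINE;
		if (error_mode == MODE_RESTART)
			return RESTART_MACHINE;
	}
	return io_error ? COMPLETE_WITH_ERROR : COMPLETE_OK;
}

int main(void)
{
	/* Read-ahead failures never escalate. */
	printf("%d\n", finish_io(true, true, false, false, MODE_RESTART));
	/* A verification mismatch keeps the old corruption handling. */
	printf("%d\n", finish_io(true, false, true, false, MODE_PANIC));
	/* A genuine device error with restart_on_error configured. */
	printf("%d\n", finish_io(true, false, false, false, MODE_RESTART));
	return 0;
}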
*/ md->disk = blk_alloc_disk(NULL, md->numa_node_id); - if (IS_ERR(md->disk)) + if (IS_ERR(md->disk)) { + md->disk = NULL; goto bad; + } md->queue = md->disk->queue; init_waitqueue_head(&md->wait); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index cc466ad5cb1d..8ad782249af8 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -109,7 +109,6 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone); int dm_blk_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); -int dm_zone_map_bio(struct dm_target_io *io); int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, sector_t sector, unsigned int nr_zones, unsigned long *need_reset); @@ -119,10 +118,6 @@ static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) { return false; } -static inline int dm_zone_map_bio(struct dm_target_io *tio) -{ - return DM_MAPIO_KILL; -} #endif /* diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 29da10e6f703..c3a42dd66ce5 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1285,6 +1285,7 @@ static void bitmap_unplug_async(struct bitmap *bitmap) queue_work(md_bitmap_wq, &unplug_work.work); wait_for_completion(&done); + destroy_work_on_stack(&unplug_work.work); } static void bitmap_unplug(struct mddev *mddev, bool sync) diff --git a/drivers/md/md.c b/drivers/md/md.c index 179ee4afe937..aebe12b0ee27 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -546,6 +546,26 @@ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_n return 0; } +/* + * The only difference from bio_chain_endio() is that the current + * bi_status of bio does not affect the bi_status of parent. + */ +static void md_end_flush(struct bio *bio) +{ + struct bio *parent = bio->bi_private; + + /* + * If any flush io error before the power failure, + * disk data may be lost. + */ + if (bio->bi_status) + pr_err("md: %pg flush io error %d\n", bio->bi_bdev, + blk_status_to_errno(bio->bi_status)); + + bio_put(bio); + bio_endio(parent); +} + bool md_flush_request(struct mddev *mddev, struct bio *bio) { struct md_rdev *rdev; @@ -565,7 +585,9 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio) new = bio_alloc_bioset(rdev->bdev, 0, REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, &mddev->bio_set); - bio_chain(new, bio); + new->bi_private = bio; + new->bi_end_io = md_end_flush; + bio_inc_remaining(bio); submit_bio(new); } @@ -9762,9 +9784,7 @@ EXPORT_SYMBOL(md_reap_sync_thread); void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); - wait_event_timeout(rdev->blocked_wait, - !test_bit(Blocked, &rdev->flags) && - !test_bit(BlockedBadBlocks, &rdev->flags), + wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev), msecs_to_jiffies(5000)); rdev_dec_pending(rdev, mddev); } @@ -9793,6 +9813,17 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, { struct mddev *mddev = rdev->mddev; int rv; + + /* + * Recording new badblocks for faulty rdev will force unnecessary + * super block updating. This is fragile for external management because + * userspace daemon may trying to remove this device and deadlock may + * occur. This will be probably solved in the mdadm, but it is safer to + * avoid it. 
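The md.c hunks above replace bio_chain() in md_flush_request() with a dedicated md_end_flush() completion whose comment states that, unlike bio_chain_endio(), the member device's bi_status is only logged and never copied into the parent flush bio. Below is a small user-space model of that completion pattern, with a simplified stand-in for struct bio and its remaining-completions counter; it is a sketch of the stated semantics, not kernel code.

#include <stdbool.h>
#include <stdio.h>

struct fake_bio {
	int remaining;
	int status;		/* 0 == BLK_STS_OK */
	bool done;
};

static void inc_remaining(struct fake_bio *parent) { parent->remaining++; }

static void end_bio(struct fake_bio *parent)
{
	if (--parent->remaining == 0)
		parent->done = true;
}

/* Model of md_end_flush(): the member device's error is logged, but it is
 * never copied into the parent's status. */
static void md_end_flush_model(struct fake_bio *parent, int child_status)
{
	if (child_status)
		fprintf(stderr, "flush error %d on one member (logged only)\n",
			child_status);
	end_bio(parent);
}

int main(void)
{
	struct fake_bio parent = { .remaining = 1 }; /* submitter's reference */

	inc_remaining(&parent);		/* one pre-flush sent to one rdev */
	md_end_flush_model(&parent, 10);	/* that flush failed */
	end_bio(&parent);			/* submitter drops its reference */

	printf("parent status=%d done=%d\n", parent.status, parent.done);
	return 0;
}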
+ */ + if (test_bit(Faulty, &rdev->flags)) + return 1; + if (is_new) s += rdev->new_data_offset; else diff --git a/drivers/md/md.h b/drivers/md/md.h index 5d2e6bd58e4d..4ba93af36126 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -1002,6 +1002,30 @@ static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector); } +static inline bool rdev_blocked(struct md_rdev *rdev) +{ + /* + * Blocked will be set by error handler and cleared by daemon after + * updating superblock, meanwhile write IO should be blocked to prevent + * reading old data after power failure. + */ + if (test_bit(Blocked, &rdev->flags)) + return true; + + /* + * Faulty device should not be accessed anymore, there is no need to + * wait for bad block to be acknowledged. + */ + if (test_bit(Faulty, &rdev->flags)) + return false; + + /* rdev is blocked by badblocks. */ + if (test_bit(BlockedBadBlocks, &rdev->flags)) + return true; + + return false; +} + #define mddev_add_trace_msg(mddev, fmt, args...) \ do { \ if (!mddev_is_dm(mddev)) \ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 32d587524778..baaf5f8b80ae 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -466,6 +466,12 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) struct bio *split = bio_split(bio, zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, &mddev->bio_set); + + if (IS_ERR(split)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); + bio_endio(bio); + return; + } bio_chain(split, bio); submit_bio_noacct(bio); bio = split; @@ -608,6 +614,12 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) if (sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, sectors, GFP_NOIO, &mddev->bio_set); + + if (IS_ERR(split)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); + bio_endio(bio); + return true; + } bio_chain(split, bio); raid0_map_submit_bio(mddev, bio); bio = split; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 6c9d24203f39..a5adf08ee174 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1322,7 +1322,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, const enum req_op op = bio_op(bio); const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; int max_sectors; - int rdisk; + int rdisk, error; bool r1bio_existed = !!r1_bio; /* @@ -1383,6 +1383,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, gfp, &conf->bio_split); + + if (IS_ERR(split)) { + error = PTR_ERR(split); + goto err_handle; + } bio_chain(split, bio); submit_bio_noacct(bio); bio = split; @@ -1410,6 +1415,47 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r1_bio; mddev_trace_remap(mddev, read_bio, r1_bio->sector); submit_bio_noacct(read_bio); + return; + +err_handle: + atomic_dec(&mirror->rdev->nr_pending); + bio->bi_status = errno_to_blk_status(error); + set_bit(R1BIO_Uptodate, &r1_bio->state); + raid_end_bio_io(r1_bio); +} + +static bool wait_blocked_rdev(struct mddev *mddev, struct bio *bio) +{ + struct r1conf *conf = mddev->private; + int disks = conf->raid_disks * 2; + int i; + +retry: + for (i = 0; i < disks; i++) { + struct md_rdev *rdev = conf->mirrors[i].rdev; + + if (!rdev) + continue; + + /* don't write here until the bad block is acknowledged */ + if (test_bit(WriteErrorSeen, &rdev->flags) && + rdev_has_badblock(rdev, bio->bi_iter.bi_sector, 
+ bio_sectors(bio)) < 0) + set_bit(BlockedBadBlocks, &rdev->flags); + + if (rdev_blocked(rdev)) { + if (bio->bi_opf & REQ_NOWAIT) + return false; + + mddev_add_trace_msg(rdev->mddev, "raid1 wait rdev %d blocked", + rdev->raid_disk); + atomic_inc(&rdev->nr_pending); + md_wait_for_blocked_rdev(rdev, rdev->mddev); + goto retry; + } + } + + return true; } static void raid1_write_request(struct mddev *mddev, struct bio *bio, @@ -1417,9 +1463,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; - int i, disks; + int i, disks, k, error; unsigned long flags; - struct md_rdev *blocked_rdev; int first_clone; int max_sectors; bool write_behind = false; @@ -1457,7 +1502,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, return; } - retry_write: + if (!wait_blocked_rdev(mddev, bio)) { + bio_wouldblock_error(bio); + return; + } + r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; @@ -1473,7 +1522,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ disks = conf->raid_disks * 2; - blocked_rdev = NULL; max_sectors = r1_bio->sectors; for (i = 0; i < disks; i++) { struct md_rdev *rdev = conf->mirrors[i].rdev; @@ -1486,11 +1534,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags)) write_behind = true; - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } r1_bio->bios[i] = NULL; if (!rdev || test_bit(Faulty, &rdev->flags)) { if (i < conf->raid_disks) @@ -1506,13 +1549,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, &first_bad, &bad_sectors); - if (is_bad < 0) { - /* mustn't write here until the bad block is - * acknowledged*/ - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } if (is_bad && first_bad <= r1_bio->sector) { /* Cannot write here at all */ bad_sectors -= (r1_bio->sector - first_bad); @@ -1543,27 +1579,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->bios[i] = bio; } - if (unlikely(blocked_rdev)) { - /* Wait for this device to become unblocked */ - int j; - - for (j = 0; j < i; j++) - if (r1_bio->bios[j]) - rdev_dec_pending(conf->mirrors[j].rdev, mddev); - mempool_free(r1_bio, &conf->r1bio_pool); - allow_barrier(conf, bio->bi_iter.bi_sector); - - if (bio->bi_opf & REQ_NOWAIT) { - bio_wouldblock_error(bio); - return; - } - mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked", - blocked_rdev->raid_disk); - md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf, bio->bi_iter.bi_sector, false); - goto retry_write; - } - /* * When using a bitmap, we may call alloc_behind_master_bio below. 
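The rdev_blocked() helper introduced in md.h above, and used by the new raid1 wait_blocked_rdev(), encodes a simple precedence: Blocked always waits, a Faulty device never waits (its bad blocks no longer need acknowledging), and otherwise BlockedBadBlocks waits. A user-space truth-table check of that ordering follows; the flag constants are stand-ins rather than the kernel's actual bit values.

#include <stdbool.h>
#include <stdio.h>

#define F_BLOCKED		(1u << 0)
#define F_FAULTY		(1u << 1)
#define F_BLOCKED_BADBLOCKS	(1u << 2)

/* Mirror of the decision order in the new rdev_blocked(). */
static bool rdev_blocked_model(unsigned int flags)
{
	if (flags & F_BLOCKED)
		return true;
	if (flags & F_FAULTY)
		return false;
	return (flags & F_BLOCKED_BADBLOCKS) != 0;
}

int main(void)
{
	printf("%d\n", rdev_blocked_model(F_BLOCKED));			/* 1 */
	printf("%d\n", rdev_blocked_model(F_FAULTY | F_BLOCKED_BADBLOCKS)); /* 0 */
	printf("%d\n", rdev_blocked_model(F_BLOCKED_BADBLOCKS));	/* 1 */
	printf("%d\n", rdev_blocked_model(0));				/* 0 */
	return 0;
}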
* alloc_behind_master_bio allocates a copy of the data payload a page @@ -1576,6 +1591,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, GFP_NOIO, &conf->bio_split); + + if (IS_ERR(split)) { + error = PTR_ERR(split); + goto err_handle; + } bio_chain(split, bio); submit_bio_noacct(bio); bio = split; @@ -1660,6 +1680,18 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, /* In case raid1d snuck in to freeze_array */ wake_up_barrier(conf); + return; +err_handle: + for (k = 0; k < i; k++) { + if (r1_bio->bios[k]) { + rdev_dec_pending(conf->mirrors[k].rdev, mddev); + r1_bio->bios[k] = NULL; + } + } + + bio->bi_status = errno_to_blk_status(error); + set_bit(R1BIO_Uptodate, &r1_bio->state); + raid_end_bio_io(r1_bio); } static bool raid1_make_request(struct mddev *mddev, struct bio *bio) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f3bf1116794a..18989231791a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1159,6 +1159,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, int slot = r10_bio->read_slot; struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; + int error; if (slot >= 0 && r10_bio->devs[slot].rdev) { /* @@ -1206,6 +1207,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, gfp, &conf->bio_split); + if (IS_ERR(split)) { + error = PTR_ERR(split); + goto err_handle; + } bio_chain(split, bio); allow_barrier(conf); submit_bio_noacct(bio); @@ -1236,6 +1241,11 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, mddev_trace_remap(mddev, read_bio, r10_bio->sector); submit_bio_noacct(read_bio); return; +err_handle: + atomic_dec(&rdev->nr_pending); + bio->bi_status = errno_to_blk_status(error); + set_bit(R10BIO_Uptodate, &r10_bio->state); + raid_end_bio_io(r10_bio); } static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, @@ -1285,9 +1295,9 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) { - int i; struct r10conf *conf = mddev->private; struct md_rdev *blocked_rdev; + int i; retry_wait: blocked_rdev = NULL; @@ -1295,40 +1305,36 @@ retry_wait: struct md_rdev *rdev, *rrdev; rdev = conf->mirrors[i].rdev; - rrdev = conf->mirrors[i].replacement; - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } - if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { - atomic_inc(&rrdev->nr_pending); - blocked_rdev = rrdev; - break; - } - - if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { + if (rdev) { sector_t dev_sector = r10_bio->devs[i].addr; /* * Discard request doesn't care the write result * so it doesn't need to wait blocked disk here. 
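Several raid0, raid1 and raid10 hunks above now test IS_ERR() on the value returned by bio_split() and convert it with errno_to_blk_status() instead of assuming the split always succeeds. That relies on the kernel's ERR_PTR convention of packing a small negative errno into the top of the pointer range; the following is a user-space re-implementation of that convention for illustration only.

#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *err_ptr(long error) { return (void *)error; }
static inline long ptr_err(const void *ptr) { return (long)ptr; }
static inline int is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	int object;
	void *split = err_ptr(-12);	/* e.g. -ENOMEM from a failed split */

	if (is_err(split))
		printf("split failed: %ld\n", ptr_err(split));

	split = &object;		/* an ordinary, valid pointer */
	printf("valid pointer is_err=%d\n", is_err(split));
	return 0;
}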
*/ - if (!r10_bio->sectors) - continue; - - if (rdev_has_badblock(rdev, dev_sector, - r10_bio->sectors) < 0) { + if (test_bit(WriteErrorSeen, &rdev->flags) && + r10_bio->sectors && + rdev_has_badblock(rdev, dev_sector, + r10_bio->sectors) < 0) /* - * Mustn't write here until the bad block - * is acknowledged + * Mustn't write here until the bad + * block is acknowledged */ - atomic_inc(&rdev->nr_pending); set_bit(BlockedBadBlocks, &rdev->flags); + + if (rdev_blocked(rdev)) { blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); break; } } + + rrdev = conf->mirrors[i].replacement; + if (rrdev && rdev_blocked(rrdev)) { + atomic_inc(&rrdev->nr_pending); + blocked_rdev = rrdev; + break; + } } if (unlikely(blocked_rdev)) { @@ -1347,9 +1353,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; - int i; + int i, k; sector_t sectors; int max_sectors; + int error; if ((mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, @@ -1482,6 +1489,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, if (r10_bio->sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, r10_bio->sectors, GFP_NOIO, &conf->bio_split); + if (IS_ERR(split)) { + error = PTR_ERR(split); + goto err_handle; + } bio_chain(split, bio); allow_barrier(conf); submit_bio_noacct(bio); @@ -1503,6 +1514,26 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, raid10_write_one_disk(mddev, r10_bio, bio, true, i); } one_write_done(r10_bio); + return; +err_handle: + for (k = 0; k < i; k++) { + int d = r10_bio->devs[k].devnum; + struct md_rdev *rdev = conf->mirrors[d].rdev; + struct md_rdev *rrdev = conf->mirrors[d].replacement; + + if (r10_bio->devs[k].bio) { + rdev_dec_pending(rdev, mddev); + r10_bio->devs[k].bio = NULL; + } + if (r10_bio->devs[k].repl_bio) { + rdev_dec_pending(rrdev, mddev); + r10_bio->devs[k].repl_bio = NULL; + } + } + + bio->bi_status = errno_to_blk_status(error); + set_bit(R10BIO_Uptodate, &r10_bio->state); + raid_end_bio_io(r10_bio); } static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) @@ -1644,6 +1675,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) if (remainder) { split_size = stripe_size - remainder; split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); + if (IS_ERR(split)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); + bio_endio(bio); + return 0; + } bio_chain(split, bio); allow_barrier(conf); /* Resend the fist split part */ @@ -1654,6 +1690,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) if (remainder) { split_size = bio_sectors(bio) - remainder; split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); + if (IS_ERR(split)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); + bio_endio(bio); + return 0; + } bio_chain(split, bio); allow_barrier(conf); /* Resend the second split part */ @@ -4061,9 +4102,12 @@ static int raid10_run(struct mddev *mddev) } if (!mddev_is_dm(conf->mddev)) { - ret = raid10_set_queue_limits(mddev); - if (ret) + int err = raid10_set_queue_limits(mddev); + + if (err) { + ret = err; goto out_free_conf; + } } /* need to check that every block has at least one working mirror */ diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index a70cbec12ed0..37c4da5311ca 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -258,7 +258,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, 
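The err_handle labels added to raid1_write_request() and raid10_write_request() above deal with a failure (such as bio_split() returning an error pointer) that happens after some rdevs have already had nr_pending raised: only the references taken so far are dropped, the errno is reported through bi_status, and the r1_bio/r10_bio is ended. Here is a toy model of that partial-rollback pattern, with array indices standing in for rdevs; it sketches the shape of the cleanup, not the real structures.

#include <stdio.h>

#define NDEV 4

static int nr_pending[NDEV];

/* Claim a reference per device, then fail before device 'fail_at'; the unwind
 * loop releases exactly the references taken before the failure. */
static int write_model(int fail_at)
{
	int taken[NDEV] = { 0 };
	int i, k;

	for (i = 0; i < NDEV; i++) {
		if (i == fail_at)
			goto err_handle;	/* e.g. a failed split */
		nr_pending[i]++;		/* reference taken for this rdev */
		taken[i] = 1;
	}
	return 0;

err_handle:
	for (k = 0; k < i; k++) {
		if (taken[k]) {
			nr_pending[k]--;	/* rdev_dec_pending() equivalent */
			taken[k] = 0;
		}
	}
	return -12;				/* surfaced to the caller as bi_status */
}

int main(void)
{
	printf("rc=%d\n", write_model(2));
	printf("pending after rollback: %d %d %d %d\n",
	       nr_pending[0], nr_pending[1], nr_pending[2], nr_pending[3]);
	return 0;
}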
memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); pplhdr->signature = cpu_to_le32(ppl_conf->signature); - io->seq = atomic64_add_return(1, &ppl_conf->seq); + io->seq = atomic64_inc_return(&ppl_conf->seq); pplhdr->generation = cpu_to_le64(io->seq); return io; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index dc2ea636d173..f09e7677ee9f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4724,14 +4724,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (rdev) { is_bad = rdev_has_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf)); - if (s->blocked_rdev == NULL - && (test_bit(Blocked, &rdev->flags) - || is_bad < 0)) { + if (s->blocked_rdev == NULL) { if (is_bad < 0) - set_bit(BlockedBadBlocks, - &rdev->flags); - s->blocked_rdev = rdev; - atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + if (rdev_blocked(rdev)) { + s->blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + } } } clear_bit(R5_Insync, &dev->flags); @@ -7177,6 +7176,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) err = mddev_suspend_and_lock(mddev); if (err) return err; + raid5_quiesce(mddev, true); + conf = mddev->private; if (!conf) err = -ENODEV; @@ -7198,6 +7199,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) kfree(old_groups); } } + + raid5_quiesce(mddev, false); mddev_unlock_and_resume(mddev); return err ?: len; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 896ecfc4afa6..d174e586698f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -633,7 +633,7 @@ struct r5conf { * two caches. */ int active_name; - char cache_name[2][32]; + char cache_name[2][48]; struct kmem_cache *slab_cache; /* for allocating stripes */ struct mutex cache_size_mutex; /* Protect changes to cache size */
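One further small change above widens r5conf's cache_name buffers from 32 to 48 bytes. A plausible reason for widening such a buffer, not stated in this diff, is that snprintf() silently truncates once the device name plus suffix no longer fits, which can leave two caches with identical names; the format string below is made up purely to demonstrate the truncation behaviour.

#include <stdio.h>

int main(void)
{
	char small[32], large[48];
	/* Hypothetical cache-name format; the real raid5 format string is not
	 * shown in this diff. */
	const char *md_name = "md_verylongdevicename123456";
	int n;

	n = snprintf(small, sizeof(small), "raid6-%s-alt", md_name);
	printf("32-byte buffer: \"%s\" (needed %d bytes)\n", small, n + 1);

	n = snprintf(large, sizeof(large), "raid6-%s-alt", md_name);
	printf("48-byte buffer: \"%s\" (needed %d bytes)\n", large, n + 1);
	return 0;
}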